### Creating Transdiagnostic Patients and Psychiatric Control Groups with UKBB Data

In [1]:
# Import packages
import pandas as pd
import numpy as np

In [2]:
# Define files
dfile = 'current.csv' #Full dataset
dx_fields = 'dx_fields.csv' #Includes All FieldIDs used for group determination

In [3]:
variables = pd.read_csv(dfile, dtype=str,nrows=0)
header = variables.columns.tolist()

ls = []
varlist = pd.read_csv(dx_fields)  # Read the CSV containing column identifiers
varlist = varlist['FinalFieldID'].tolist()  # Convert the column to a list of strings

In [4]:
# Iterate over each item in the list of column identifiers
for v in varlist:
    # Check if the string contains a dot using Python's string method
    if '.' in v:
        var = v.split('.')  # Split the string on the dot
        # Get headers from `header` that start with the first part of `var`
        varheader = [i for i in header if i.startswith(var[0])]
    else:
        varheader = [v]  # If no dot, just keep the string as is
    ls.append(varheader)  # Append the result to the list

# Flatten the list of lists into a single list
vars = [item for sublist in ls for item in sublist]

In [None]:
# Read the main CSV file, using only the relevant columns
df = pd.read_csv(dfile, dtype=str, index_col='eid', usecols=vars + ['eid'])

In [None]:
print(df.columns.get_loc('41270-0.0'))
print(df.columns.get_loc('41270-0.258'))

In [11]:
dx = df.iloc[:,35:294] #columns from above printed outputs

In [None]:
# Whole-Sample Exclusion Diagnoses

# Neurological Conditions
df.drop(df[dx.apply(lambda row: row.str.startswith('G').any(), axis=1)].index, inplace=True)

# Organic Mental Disorders (added later but had removed pps with F0)
df.drop(df[dx.apply(lambda row: row.str.startswith('F0').any(), axis=1)].index, inplace=True)

# Mental Retardation
df.drop(df[dx.apply(lambda row: row.str.startswith('F7').any(), axis=1)].index, inplace=True)

In [None]:
# Remove Error Code Dates    
dates_to_replace = ['1900-01-01', '1901-01-01', '1902-02-02', '1903-03-03','1090-09-09', '2037-07-07']
df.replace(dates_to_replace, np.nan, inplace=True)

In [None]:
# Schizophrenia Spectrum
df['dx_szspectrum'] = np.where((df['130874-0.0'].notna()) | 
                               (df['130884-0.0'].notna()) | 
                               (dx.apply(lambda row: row.astype(str).str.contains('F2').any(), axis=1)) | 
                               (dx.apply(lambda row: row.astype(str).str.contains('F30.2').any(), axis=1)) | 
                               (dx.apply(lambda row: row.astype(str).str.contains('F31.2').any(), axis=1)) | 
                               (dx.apply(lambda row: row.astype(str).str.contains('F31.5').any(), axis=1)) | 
                               (dx.apply(lambda row: row.astype(str).str.contains('F32.3').any(), axis=1)) | 
                               (dx.apply(lambda row: row.astype(str).str.contains('F33.3').any(), axis=1)) ,1, 0)

# Bipolar
df['dx_bp'] = np.where((df['130892-0.0'].notna()) |
                       (df['20126-0.0']==1) |
                       (df['20126-0.0']==2) |
                       (dx.apply(lambda row: row.astype(str).str.contains('F30').any(), axis=1)) | 
                       (dx.apply(lambda row: row.astype(str).str.contains('F31').any(), axis=1)) , 1, 0)

# Depression
df['dx_dep'] = np.where((df['130896-0.0'].notna()) |
                        (df['20126-0.0']==3) |
                        (df['20126-0.0']==4) |
                       (dx.apply(lambda row: row.astype(str).str.contains('F32').any(), axis=1)) | 
                        (dx.apply(lambda row: row.astype(str).str.contains('F33').any(), axis=1)) | 
                        (dx.apply(lambda row: row.astype(str).str.contains('F34').any(), axis=1)) | 
                        (dx.apply(lambda row: row.astype(str).str.contains('F38').any(), axis=1)) | 
                        (dx.apply(lambda row: row.astype(str).str.contains('F39').any(), axis=1)) , 1, 0)

# Anxiety
df['dx_anx'] = np.where((df['130904-0.0'].notna()) |
                        (df['130906-0.0'].notna()) |
                        (dx.apply(lambda row: row.astype(str).str.contains('F40').any(), axis=1)) | 
                        (dx.apply(lambda row: row.astype(str).str.contains('F41').any(), axis=1)) , 1, 0)

# OCD
df['dx_ocd'] = np.where((df['130908-0.0'].notna()) |
                        (dx.apply(lambda row: row.astype(str).str.contains('F42').any(), axis=1)) , 1, 0)

# Stress-related & Adjustment
df['dx_stress_adjust'] = np.where((df['130910-0.0'].notna()) |
                                  (dx.apply(lambda row: row.astype(str).str.contains('F43').any(), axis=1)), 1, 0)

# Dissociative
df['dx_dissociative'] = np.where((df['130912-0.0'].notna()) |
                                 (dx.apply(lambda row: row.astype(str).str.contains('F44').any(), axis = 1)), 1, 0)

# Somatoform
df['dx_somatoform'] = np.where((df['130896-0.0'].notna()) |
                                 (dx.apply(lambda row: row.astype(str).str.contains('F45').any(), axis = 1)), 1, 0)

# Other Neurotic
df['dx_other_neurotic'] = np.where((df['130916-0.0'].notna()) |
                                 (dx.apply(lambda row: row.astype(str).str.contains('F48').any(), axis = 1)), 1, 0)

# Eating
df['dx_ed'] = np.where((df['130918-0.0'].notna()) |
                       (dx.apply(lambda row: row.astype(str).str.contains('F50').any(), axis=1)),
                       1, 0)

# Non-Organic Sleep 
df['dx_sleep'] = np.where((df['130920-0.0'].notna()) |
                       (dx.apply(lambda row: row.astype(str).str.contains('F51').any(), axis=1)),
                       1, 0)

# Non-Organic Sexual Dysfunction
df['dx_sexual_dys'] = np.where((df['130922-0.0'].notna()) |
                       (dx.apply(lambda row: row.astype(str).str.contains('F52').any(), axis=1)),
                       1, 0)

# Puerperium
df['dx_puerperium'] = np.where((df['130924-0.0'].notna()) |
                       (dx.apply(lambda row: row.astype(str).str.contains('F53').any(), axis=1)),
                       1, 0)

# Abuse of Non-dependence-Producing Substance
df['dx_abuse_non_dep'] = np.where((df['130928-0.0'].notna()) |
                       (dx.apply(lambda row: row.astype(str).str.contains('F54').any(), axis=1)),
                       1, 0)

# SUD
sud_dates = ['130854-0.0', '130856-0.0', '130858-0.0', '130860-0.0', 
             '130862-0.0', '130864-0.0', '130866-0.0', '130868-0.0', 
             '130870-0.0', '130872-0.0']
sud_codes = ['F10', 'F11', 'F12', 'F13', 'F14', 'F15', 'F16', 'F17', 'F18', 'F19']

df['dx_sud'] = np.where((df[sud_dates].apply(lambda row: row.notna().any(), axis=1)) |
                        (dx.apply(lambda row: row.astype(str).str.contains('|'.join(sud_codes)).any(), axis=1)),
                         1, 0)

# HABIT/IMPULSE
df['dx_hab_imp'] = np.where((df['130938-0.0'].notna()) |
                            (dx.apply(lambda row: row.astype(str).str.contains('F63').any(), axis=1)),
                            1, 0)

# ADHD
df['dx_adhd'] = np.where((df['130976-0.0'].notna()) |
                         (dx.apply(lambda row: row.astype(str).str.contains('F90').any(), axis=1)),
                            1, 0)

# Specific Personality 
df['dx_pd'] = np.where((df['130932-0.0'].notna()) |
                         (dx.apply(lambda row: row.astype(str).str.contains('F60').any(), axis=1)),
                            1, 0)

# Autism Spectrum
df['dx_asd'] = np.where((dx.apply(lambda row: row.astype(str).str.contains('F84.0').any(), axis=1)) | 
                        (dx.apply(lambda row: row.astype(str).str.contains('F84.1').any(), axis=1)) | 
                        (dx.apply(lambda row: row.astype(str).str.contains('F84.5').any(), axis=1)) ,1, 0)

In [43]:
dx_type = df.iloc[:,-19:]

In [45]:
# Create Transdiagnostic Diagnosis Column
df['dx'] = np.where(dx_type.apply(lambda row: row.astype(str).str.contains('1').any(), axis=1), 1, 0)
df['dx'].value_counts()

dx
0    328374
1     85330
Name: count, dtype: int64

In [47]:
# Ensure Controls have no Category F Diagnoses

# Identify rows where dx == 0 (controls)
mask_controls = df['dx'] == 0
# Identify rows where any value in df.iloc[:, 35:294] starts with 'F'
mask_starts_with_f = df.iloc[:, 35:294].apply(lambda row: row.astype(str).str.startswith('F').any(), axis=1)
# Combine conditions to identify rows to drop
rows_to_drop = mask_controls & mask_starts_with_f
# Drop the rows from the original DataFrame
df = df[~rows_to_drop]  # Keep only rows that are not in the rows_to_drop mask

In [None]:
# Save Filtered Dataset
df.to_csv('ukbb_dx.csv')