In [1]:
import pandas as pd
import numpy as np

## Load data

In [2]:
# Read the curated DisGeNET gene-disease association table (tab-separated)
# and preview its first row.
df = pd.read_csv(
    "../raw_data/curated_gene_disease_associations.tsv",
    sep='\t',
)
df.head(1)

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,1,A1BG,0.857,0.172,C0019209,Hepatomegaly,phenotype,C06;C23,Finding,0.3,,2017.0,2017.0,1,0,CTD_human


In [3]:
# Disease catalogue: one row per distinct combination of the disease
# descriptor columns that occurs in the raw associations.
diseases = df.loc[
    :,
    ['diseaseId', 'diseaseName', 'diseaseType', 'diseaseClass', 'diseaseSemanticType'],
].drop_duplicates()
diseases.head()

Unnamed: 0,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType
0,C0019209,Hepatomegaly,phenotype,C06;C23,Finding
1,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction
2,C0002395,Alzheimer's Disease,disease,C10;F03,Disease or Syndrome
3,C0007102,Malignant tumor of colon,disease,C04;C06,Neoplastic Process
4,C0009375,Colonic Neoplasms,group,C04;C06,Neoplastic Process


In [4]:
# Read the tab-separated autism term list and discard its first column
# (presumably a saved row index -- confirm against the file).
autisms = (
    pd.read_csv('processed_data/autism_terms.csv', sep='\t')
    .iloc[:, 1:]
)
autisms.head()

Unnamed: 0,diseaseId,diseaseName,autism_subtype,autism_subtype_broad
0,C0004352,Autistic Disorder,Autism Spectrum Disorder,Non-Syndromic Autism
1,C1510586,Autism Spectrum Disorders,Autism Spectrum Disorder,Non-Syndromic Autism
2,C3275438,"AUTISM, SUSCEPTIBILITY TO, X-LINKED 5",Autism Spectrum Disorder,Non-Syndromic Autism
3,C3552491,"AUTISM, SUSCEPTIBILITY TO, 14A",Autism Spectrum Disorder,Non-Syndromic Autism
4,C1845540,"AUTISM, X-LINKED, SUSCEPTIBILITY TO, 1 (finding)",Autism Spectrum Disorder,Non-Syndromic Autism


In [5]:
# Force the join key to plain strings in both tables so the merge on
# 'diseaseId' compares like with like. .assign() builds fresh frames instead
# of writing into objects that were derived by slicing another frame
# (`.iloc[:, 1:]`, `df[[...]]`), which can raise SettingWithCopyWarning.
autisms = autisms.assign(diseaseId=autisms['diseaseId'].astype(str))
diseases = diseases.assign(diseaseId=diseases['diseaseId'].astype(str))

In [6]:
# Attach the raw-data type, class and semantic type to every autism term.
# A left join keeps terms that have no matching diseaseId (they get NaN).
autisms_df = pd.merge(
    autisms,
    diseases.loc[:, ['diseaseId', 'diseaseType', 'diseaseClass', 'diseaseSemanticType']],
    how='left',
    on="diseaseId",
)

autisms_df.tail()

Unnamed: 0,diseaseId,diseaseName,autism_subtype,autism_subtype_broad,diseaseType,diseaseClass,diseaseSemanticType
67,C0175702,Williams Syndrome,William Syndrome,Syndromic Autism,disease,C10;C14;C16,Disease or Syndrome
68,C0012236,DiGeorge Syndrome,Digeorge Syndrome,Syndromic Autism,disease,C05;C14;C15;C16;C19,Disease or Syndrome
69,C2936346,22q11 Deletion Syndrome,Digeorge Syndrome,Syndromic Autism,disease,C05;C14;C15;C16;C19,Disease or Syndrome
70,C2678480,"Chromosome 22q11.2 Deletion Syndrome, Distal",Digeorge Syndrome,Syndromic Autism,disease,C05;C14;C15;C16;C19;C23,Disease or Syndrome
71,C1854416,MACROCEPHALY/AUTISM SYNDROME,Macrocephaly/Autism Syndrome,Syndromic Autism,disease,C05;C10;C16;C23;F03,Disease or Syndrome


## Create col: disease ids, name, autism_subtype

In [7]:
# One row per distinct autism subtype, in order of first appearance, each
# paired with a synthetic identifier "A00", "A01", ...
autism_types_names = pd.Series(autisms_df['autism_subtype'].unique())

autism_types_new_id = pd.Series(
    ["A" + str(position).zfill(2) for position in range(len(autism_types_names))]
)
autism_new_ids = pd.DataFrame({
    'diseaseName': autism_types_names,
    "diseaseId": autism_types_new_id,
})
autism_new_ids

Unnamed: 0,diseaseName,diseaseId
0,Autism Spectrum Disorder,A00
1,Rett Syndrome,A01
2,Fragile X Syndrome,A02
3,MECP2 duplication Syndrome,A03
4,Tuberous Sclerosis Complex,A04
5,Angelman Syndrome,A05
6,Timothy Syndrome,A06
7,Smith-Lemli-Opitz Syndrome,A07
8,Neurofibromatosis,A08
9,Hamartoma tumor Syndrome,A09


In [8]:
# Broad category per grouped subtype. The label is chosen by subtype name
# rather than by row position, so it stays correct if the order in which the
# subtypes appear in the input changes.
is_non_syndromic = autism_new_ids['diseaseName'] == 'Autism Spectrum Disorder'
autism_new_ids['autism_subtype_broad'] = is_non_syndromic.map({
    True: 'Non-Syndromic Autism',
    False: 'Syndromic Autism',
})
autism_new_ids.head(2)

Unnamed: 0,diseaseName,diseaseId,autism_subtype_broad
0,Autism Spectrum Disorder,A00,Non-Syndromic Autism
1,Rett Syndrome,A01,Syndromic Autism


## Create col: disease type

In [9]:

# Distinct (subtype, diseaseType) pairs found among the member diseases.
autisms_df.loc[:, ['autism_subtype', 'diseaseType']].drop_duplicates()

Unnamed: 0,autism_subtype,diseaseType
0,Autism Spectrum Disorder,group
1,Autism Spectrum Disorder,disease
2,Autism Spectrum Disorder,phenotype
12,Rett Syndrome,disease
17,Fragile X Syndrome,disease
19,MECP2 duplication Syndrome,disease
20,Tuberous Sclerosis Complex,disease
25,Angelman Syndrome,disease
26,Timothy Syndrome,disease
27,Smith-Lemli-Opitz Syndrome,disease


In [10]:
# Every grouped subtype gets the same diseaseType, 'disease'.
autism_new_ids.loc[:, 'diseaseType'] = 'disease'
autism_new_ids.head(2)

Unnamed: 0,diseaseName,diseaseId,autism_subtype_broad,diseaseType
0,Autism Spectrum Disorder,A00,Non-Syndromic Autism,disease
1,Rett Syndrome,A01,Syndromic Autism,disease


## Create col: diseaseClass

In [11]:
# Distinct (subtype, diseaseClass) pairs; diseaseClass may be missing (NaN).
autism_classes = autisms_df.loc[:, ['autism_subtype', 'diseaseClass']].drop_duplicates()
autism_classes.head()

Unnamed: 0,autism_subtype,diseaseClass
0,Autism Spectrum Disorder,F03
2,Autism Spectrum Disorder,
12,Rett Syndrome,C10;C16
16,Rett Syndrome,
17,Fragile X Syndrome,C10;C16


In [12]:
# For each grouped subtype, pool the diseaseClass codes of all of its member
# diseases into one sorted, de-duplicated, ';'-joined string.
for i, row in autism_new_ids.iterrows():
    member_classes = autism_classes.loc[
        autism_classes['autism_subtype'] == row['diseaseName'], 'diseaseClass'
    ]
    # pd.notna() skips every kind of missing value (np.nan, None, pd.NA); an
    # identity test against np.nan only matches that one object, so any other
    # missing value would reach the string operations and raise.
    # str.split(';') also covers the single-code case (no ';' -> one element).
    list_temp = sorted({
        code
        for classes in member_classes
        if pd.notna(classes)
        for code in classes.split(';')
    })
    print(list_temp)
    autism_new_ids.loc[i, 'diseaseClass'] = ";".join(list_temp)

['F03']
['C10', 'C16']
['C10', 'C16', 'C23']
['C10', 'C16']
['C04', 'C10', 'C12', 'C13', 'C16']
['C10', 'C16']
['C05', 'C14', 'C16', 'C23', 'F03']
['C16', 'C18']
['C04', 'C09', 'C10', 'C16', 'C17', 'C23']
['C04', 'C16']
['C04', 'C09', 'C10', 'C11', 'C15', 'C16', 'C23', 'F01', 'F03']
['C05', 'C10', 'C11', 'C16', 'C18', 'C23', 'F01', 'F03']
['C10', 'C16']
['C16']
['C04', 'C05', 'C10', 'C14', 'C16', 'C17', 'C23']
['C10', 'C14', 'C16']
['C05', 'C14', 'C15', 'C16', 'C19', 'C23']
['C05', 'C10', 'C16', 'C23', 'F03']


## Create col: diseaseSemanticType

In [13]:
# Distinct (subtype, diseaseSemanticType) pairs, renumbered from 0.
autism_semantictypes = (
    autisms_df.loc[:, ['autism_subtype', 'diseaseSemanticType']]
    .drop_duplicates()
    .reset_index(drop=True)
)
autism_semantictypes

Unnamed: 0,autism_subtype,diseaseSemanticType
0,Autism Spectrum Disorder,Mental or Behavioral Dysfunction
1,Autism Spectrum Disorder,Finding
2,Autism Spectrum Disorder,Disease or Syndrome
3,Rett Syndrome,Disease or Syndrome
4,Fragile X Syndrome,Disease or Syndrome
5,MECP2 duplication Syndrome,Disease or Syndrome
6,Tuberous Sclerosis Complex,Neoplastic Process
7,Tuberous Sclerosis Complex,Disease or Syndrome
8,Angelman Syndrome,Disease or Syndrome
9,Timothy Syndrome,Disease or Syndrome


In [14]:
def relabel(autism_type, semantic_type):
    """Set one semantic type on every row of a given autism subtype.

    Modifies the module-level ``autism_semantictypes`` frame in place: every
    row whose 'autism_subtype' equals ``autism_type`` gets its
    'diseaseSemanticType' replaced by ``semantic_type``. If no row matches,
    nothing changes. Returns None.
    """
    rows_of_subtype = autism_semantictypes['autism_subtype'].eq(autism_type)
    autism_semantictypes.loc[rows_of_subtype, 'diseaseSemanticType'] = semantic_type

In [15]:
# Pin a single semantic type for each of these subtypes, so that a subtype
# listed with several semantic types ends up with identical rows.
for subtype_name, chosen_semantic_type in [
    ('Autism Spectrum Disorder', 'Mental or Behavioral Dysfunction'),
    ('Tuberous Sclerosis Complex', 'Disease or Syndrome'),
    ('Neurofibromatosis', 'Disease or Syndrome'),
    ('Down Syndrome', 'Disease or Syndrome'),
    ('Cohen Syndrome', 'Congenital Abnormality'),
]:
    relabel(subtype_name, chosen_semantic_type)

In [16]:
# Relabelling left identical rows behind; drop them and renumber from 0.
autism_semantictypes = autism_semantictypes.drop_duplicates()
autism_semantictypes = autism_semantictypes.reset_index(drop=True)
autism_semantictypes

Unnamed: 0,autism_subtype,diseaseSemanticType
0,Autism Spectrum Disorder,Mental or Behavioral Dysfunction
1,Rett Syndrome,Disease or Syndrome
2,Fragile X Syndrome,Disease or Syndrome
3,MECP2 duplication Syndrome,Disease or Syndrome
4,Tuberous Sclerosis Complex,Disease or Syndrome
5,Angelman Syndrome,Disease or Syndrome
6,Timothy Syndrome,Disease or Syndrome
7,Smith-Lemli-Opitz Syndrome,Disease or Syndrome
8,Neurofibromatosis,Disease or Syndrome
9,Hamartoma tumor Syndrome,Neoplastic Process


In [17]:
# Add the semantic type to each grouped subtype (inner join on subtype name),
# then drop the duplicate key column brought in from the right-hand table.
autism_new_ids = (
    autism_new_ids
    .merge(
        autism_semantictypes,
        how='inner',
        left_on='diseaseName',
        right_on='autism_subtype',
    )
    .drop(columns='autism_subtype')
)

In [18]:
# Display the assembled subtype table.
autism_new_ids

Unnamed: 0,diseaseName,diseaseId,autism_subtype_broad,diseaseType,diseaseClass,diseaseSemanticType
0,Autism Spectrum Disorder,A00,Non-Syndromic Autism,disease,F03,Mental or Behavioral Dysfunction
1,Rett Syndrome,A01,Syndromic Autism,disease,C10;C16,Disease or Syndrome
2,Fragile X Syndrome,A02,Syndromic Autism,disease,C10;C16;C23,Disease or Syndrome
3,MECP2 duplication Syndrome,A03,Syndromic Autism,disease,C10;C16,Disease or Syndrome
4,Tuberous Sclerosis Complex,A04,Syndromic Autism,disease,C04;C10;C12;C13;C16,Disease or Syndrome
5,Angelman Syndrome,A05,Syndromic Autism,disease,C10;C16,Disease or Syndrome
6,Timothy Syndrome,A06,Syndromic Autism,disease,C05;C14;C16;C23;F03,Disease or Syndrome
7,Smith-Lemli-Opitz Syndrome,A07,Syndromic Autism,disease,C16;C18,Disease or Syndrome
8,Neurofibromatosis,A08,Syndromic Autism,disease,C04;C09;C10;C16;C17;C23,Disease or Syndrome
9,Hamartoma tumor Syndrome,A09,Syndromic Autism,disease,C04;C16,Neoplastic Process


## Replace individual autism disease entries with their grouped subtype in the raw data

In [19]:
# Preview the first row of the subtype table.
autism_new_ids.head(1)

Unnamed: 0,diseaseName,diseaseId,autism_subtype_broad,diseaseType,diseaseClass,diseaseSemanticType
0,Autism Spectrum Disorder,A00,Non-Syndromic Autism,disease,F03,Mental or Behavioral Dysfunction


In [20]:
# Lookup from each original diseaseId to the subtype it belongs to, with the
# columns renamed so the subtype name lines up with 'diseaseName' below.
old_autism_keys = autisms.loc[:, ['diseaseId', 'autism_subtype']].rename(
    columns={'diseaseId': 'diseaseId_old', 'autism_subtype': 'diseaseName'}
)

# Left join: each subtype row is repeated once per original diseaseId.
autism_new_ids = pd.merge(autism_new_ids, old_autism_keys, how='left', on='diseaseName')

In [21]:
# Preview the first row, which now includes the diseaseId_old column.
autism_new_ids.head(1)

Unnamed: 0,diseaseName,diseaseId,autism_subtype_broad,diseaseType,diseaseClass,diseaseSemanticType,diseaseId_old
0,Autism Spectrum Disorder,A00,Non-Syndromic Autism,disease,F03,Mental or Behavioral Dysfunction,C0004352


In [22]:
# Prefix the grouped descriptors with 'new_' and turn the original id into
# 'diseaseId', the key used to join against the raw table.
grouped_column_names = {
    'diseaseName': 'new_diseaseName',
    'diseaseId': 'new_diseaseId',
    'diseaseType': 'new_diseaseType',
    'diseaseClass': 'new_diseaseClass',
    'diseaseSemanticType': 'new_diseaseSemanticType',
    'diseaseId_old': 'diseaseId',
}
autism_new_ids = autism_new_ids.rename(columns=grouped_column_names)

In [23]:
# Left join keeps every raw association; rows whose diseaseId is not an
# autism term get NaN in all of the new_* columns.
# NOTE(review): if a diseaseId occurs more than once in autism_new_ids, the
# matching raw rows are duplicated -- confirm the key is unique.
merged_df = pd.merge(df, autism_new_ids, how='left', on='diseaseId')
merged_df.head(1)

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,...,YearFinal,NofPmids,NofSnps,source,new_diseaseName,new_diseaseId,autism_subtype_broad,new_diseaseType,new_diseaseClass,new_diseaseSemanticType
0,1,A1BG,0.857,0.172,C0019209,Hepatomegaly,phenotype,C06;C23,Finding,0.3,...,2017.0,1,0,CTD_human,,,,,,


In [24]:
# Where a row matched an autism term, overwrite its disease descriptors with
# the grouped subtype's values; combine_first falls back to the raw column
# wherever the corresponding new_* value is missing.
for col in ('diseaseId', 'diseaseName', 'diseaseType', 'diseaseClass', 'diseaseSemanticType'):
    merged_df[col] = merged_df['new_' + col].combine_first(merged_df[col])
merged_df.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,...,YearFinal,NofPmids,NofSnps,source,new_diseaseName,new_diseaseId,autism_subtype_broad,new_diseaseType,new_diseaseClass,new_diseaseSemanticType
0,1,A1BG,0.857,0.172,C0019209,Hepatomegaly,phenotype,C06;C23,Finding,0.3,...,2017.0,1,0,CTD_human,,,,,,
1,1,A1BG,0.857,0.172,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction,0.3,...,2015.0,1,0,CTD_human,,,,,,
2,2,A2M,0.564,0.724,C0002395,Alzheimer's Disease,disease,C10;F03,Disease or Syndrome,0.4,...,2016.0,3,0,CTD_human,,,,,,
3,2,A2M,0.564,0.724,C0007102,Malignant tumor of colon,disease,C04;C06,Neoplastic Process,0.3,...,2004.0,1,0,CTD_human,,,,,,
4,2,A2M,0.564,0.724,C0009375,Colonic Neoplasms,group,C04;C06,Neoplastic Process,0.3,...,2004.0,1,0,CTD_human,,,,,,


In [25]:
# Rows that matched no autism term have no broad label; mark them explicitly.
has_broad_label = merged_df['autism_subtype_broad'].notna()
merged_df['autism_subtype_broad'] = merged_df['autism_subtype_broad'].where(
    has_broad_label, 'Not Autism'
)

In [26]:
# The new_* helper columns have been folded into the main descriptor columns,
# so leave them out of the final table.
grouped_df = merged_df.drop(
    columns=[
        'new_diseaseName',
        'new_diseaseId',
        'new_diseaseType',
        'new_diseaseClass',
        'new_diseaseSemanticType',
    ]
)

In [27]:
# Preview the first rows of the final grouped table.
grouped_df.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source,autism_subtype_broad
0,1,A1BG,0.857,0.172,C0019209,Hepatomegaly,phenotype,C06;C23,Finding,0.3,,2017.0,2017.0,1,0,CTD_human,Not Autism
1,1,A1BG,0.857,0.172,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction,0.3,,2015.0,2015.0,1,0,CTD_human,Not Autism
2,2,A2M,0.564,0.724,C0002395,Alzheimer's Disease,disease,C10;F03,Disease or Syndrome,0.4,0.848485,1998.0,2016.0,3,0,CTD_human,Not Autism
3,2,A2M,0.564,0.724,C0007102,Malignant tumor of colon,disease,C04;C06,Neoplastic Process,0.3,,2004.0,2004.0,1,0,CTD_human,Not Autism
4,2,A2M,0.564,0.724,C0009375,Colonic Neoplasms,group,C04;C06,Neoplastic Process,0.3,,2004.0,2004.0,1,0,CTD_human,Not Autism


In [31]:
# List the distinct broad-category labels present in the final table.
grouped_df['autism_subtype_broad'].unique()

array(['Not Autism', 'Non-Syndromic Autism', 'Syndromic Autism'],
      dtype=object)

## Export data

In [32]:
# Write the grouped table as tab-separated text (despite the .csv extension).
# index=False is not passed, so the row index is written as the first column.
grouped_df.to_csv(
    'processed_data/curated_gene_disease_associations_autism_grouped.csv',
    sep='\t',
)