In [1]:
import pandas as pd
import numpy as np

In [2]:
# load DisGenet curated raw data
df = pd.read_csv("../raw_data/curated_gene_disease_associations.tsv", sep='\t')

In [3]:
# Extract columns for disorders
diseases = df[['diseaseId', 'diseaseName']]

diseases = diseases.drop_duplicates()

# Find disorders for autism

## 1. Polygenetic autism spetcrum

In [4]:
poly_autism_terms = {}

### Autism spectrum

In [5]:
autism_spectrum = diseases[diseases['diseaseName'].str.lower().str.contains('autis')]
autism_spectrum.values

array([['C0004352', 'Autistic Disorder'],
       ['C1510586', 'Autism Spectrum Disorders'],
       ['C1854416', 'MACROCEPHALY/AUTISM SYNDROME'],
       ['C3275438', 'AUTISM, SUSCEPTIBILITY TO, X-LINKED 5'],
       ['C3552491', 'AUTISM, SUSCEPTIBILITY TO, 14A'],
       ['C4013764',
        'MENTAL RETARDATION WITH LANGUAGE IMPAIRMENT AND WITH OR WITHOUT AUTISTIC FEATURES'],
       ['C1845540', 'AUTISM, X-LINKED, SUSCEPTIBILITY TO, 1 (finding)'],
       ['C3550875', 'AUTISM, SUSCEPTIBILITY TO, X-LINKED 6'],
       ['C3554373', 'AUTISM, SUSCEPTIBILITY TO, 18']], dtype=object)

Delete syndromatic autism or ambiguous disorders
- C1854416: MACROCEPHALY/AUTISM SYNDROME
- C4013764: MENTAL RETARDATION WITH LANGUAGE IMPAIRMENT AND WITH OR WITHOUT AUTISTIC FEATURES

In [6]:
autism_spectrum = autism_spectrum[autism_spectrum['diseaseId'] != "C1854416"]
autism_spectrum = autism_spectrum[autism_spectrum['diseaseId'] != "C4013764"]

In [7]:
poly_autism_terms['Autism Spectrum Disorder'] = autism_spectrum

### Asperger's syndrome

In [8]:
asperger = diseases[diseases['diseaseName'].str.lower().str.contains('asper')]
asperger.values

array([['C0236792', 'Asperger Syndrome'],
       ['C1845341',
        'ASPERGER SYNDROME, X-LINKED, SUSCEPTIBILITY TO, 1 (disorder)'],
       ['C1845334',
        'ASPERGER SYNDROME, X-LINKED, SUSCEPTIBILITY TO, 2 (finding)'],
       ['C0004030', 'Aspergillosis']], dtype=object)

In [9]:
# delete irrelevant diagnosis
asperger = asperger[asperger['diseaseName'] != 'Aspergillosis']
asperger.values

array([['C0236792', 'Asperger Syndrome'],
       ['C1845341',
        'ASPERGER SYNDROME, X-LINKED, SUSCEPTIBILITY TO, 1 (disorder)'],
       ['C1845334',
        'ASPERGER SYNDROME, X-LINKED, SUSCEPTIBILITY TO, 2 (finding)']],
      dtype=object)

In [10]:
poly_autism_terms['Asperger Syndrome'] = asperger

### Find pervasive developmental disorder

In [11]:
pervasive_evelopmental_disorders = diseases[diseases['diseaseName'].str.lower().str.contains('development') & diseases['diseaseName'].str.lower().str.contains('pervasive')]
pervasive_evelopmental_disorders.values

array([['C0008074', 'Child Development Disorders, Pervasive'],
       ['C0524528', 'Pervasive Development Disorder']], dtype=object)

In [12]:
poly_autism_terms['Pervasive Development Disorder'] = pervasive_evelopmental_disorders

### Combine tables

In [13]:
for key, df in poly_autism_terms.items():
    df['autism_subtype'] = key

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['autism_subtype'] = key


In [14]:
autism_terms = pd.concat(poly_autism_terms.values(), 
                         ignore_index=True)
autism_terms.values

array([['C0004352', 'Autistic Disorder', 'Autism Spectrum Disorder'],
       ['C1510586', 'Autism Spectrum Disorders',
        'Autism Spectrum Disorder'],
       ['C3275438', 'AUTISM, SUSCEPTIBILITY TO, X-LINKED 5',
        'Autism Spectrum Disorder'],
       ['C3552491', 'AUTISM, SUSCEPTIBILITY TO, 14A',
        'Autism Spectrum Disorder'],
       ['C1845540', 'AUTISM, X-LINKED, SUSCEPTIBILITY TO, 1 (finding)',
        'Autism Spectrum Disorder'],
       ['C3550875', 'AUTISM, SUSCEPTIBILITY TO, X-LINKED 6',
        'Autism Spectrum Disorder'],
       ['C3554373', 'AUTISM, SUSCEPTIBILITY TO, 18',
        'Autism Spectrum Disorder'],
       ['C0236792', 'Asperger Syndrome', 'Asperger Syndrome'],
       ['C1845341',
        'ASPERGER SYNDROME, X-LINKED, SUSCEPTIBILITY TO, 1 (disorder)',
        'Asperger Syndrome'],
       ['C1845334',
        'ASPERGER SYNDROME, X-LINKED, SUSCEPTIBILITY TO, 2 (finding)',
        'Asperger Syndrome'],
       ['C0008074', 'Child Development Disorders, Pe

In [15]:
autism_terms['autism_subtype'] = 'Autism Spectrum Disorder'
autism_terms

Unnamed: 0,diseaseId,diseaseName,autism_subtype
0,C0004352,Autistic Disorder,Autism Spectrum Disorder
1,C1510586,Autism Spectrum Disorders,Autism Spectrum Disorder
2,C3275438,"AUTISM, SUSCEPTIBILITY TO, X-LINKED 5",Autism Spectrum Disorder
3,C3552491,"AUTISM, SUSCEPTIBILITY TO, 14A",Autism Spectrum Disorder
4,C1845540,"AUTISM, X-LINKED, SUSCEPTIBILITY TO, 1 (finding)",Autism Spectrum Disorder
5,C3550875,"AUTISM, SUSCEPTIBILITY TO, X-LINKED 6",Autism Spectrum Disorder
6,C3554373,"AUTISM, SUSCEPTIBILITY TO, 18",Autism Spectrum Disorder
7,C0236792,Asperger Syndrome,Autism Spectrum Disorder
8,C1845341,"ASPERGER SYNDROME, X-LINKED, SUSCEPTIBILITY TO...",Autism Spectrum Disorder
9,C1845334,"ASPERGER SYNDROME, X-LINKED, SUSCEPTIBILITY TO...",Autism Spectrum Disorder


In [16]:
autism_terms['autism_subtype_broad'] = 'Non-Syndromic Autism'

## 2. Syndromic ASD

In [17]:
autism_syndromes = {}

### Childhood disintegrative disorder, aka, Heller syndrome

In [18]:
hellers_syndrome = diseases[
    (diseases['diseaseName'].str.lower().str.contains('disintegrat') & diseases['diseaseName'].str.lower().str.contains('disorder')) |
    (diseases['diseaseName'].str.lower().str.contains('heller')) |
    (diseases['diseaseName'].str.lower().str.contains('cdd'))
]

hellers_syndrome.values

array([], shape=(0, 2), dtype=object)

In [19]:
autism_syndromes['Heller Syndrome'] = hellers_syndrome

### Rett Syndrome - finally decided not to add

In [20]:
rett_syndrome = diseases[
    (diseases['diseaseName'].str.lower().str.contains('rett') & diseases['diseaseName'].str.lower().str.contains('syndrome')) &
    ~diseases['diseaseName'].isin(['Gilles de la Tourette syndrome', 'Baraitser Brett Piesowicz syndrome'])
]

rett_syndrome.values

array([['C0035372', 'Rett Syndrome'],
       ['C2748910', 'Rett Syndrome, Atypical'],
       ['C1839332', 'Rett Syndrome, Preserved Speech Variant'],
       ['C2677682', 'Rett Syndrome, Zappella Variant'],
       ['C4014821', 'PORETTI-BOLTSHAUSER SYNDROME']], dtype=object)

In [21]:
autism_syndromes["Rett Syndrome"] = rett_syndrome

### Fragile X syndrome

In [22]:
fragile_x = diseases[
    (diseases['diseaseName'].str.lower().str.contains('fragile') & diseases['diseaseName'].str.lower().str.contains('x'))
]

fragile_x.values

array([['C0016667', 'Fragile X Syndrome'],
       ['C1839780', 'FRAGILE X TREMOR/ATAXIA SYNDROME']], dtype=object)

In [23]:
autism_syndromes['Fragile X Syndrome'] = fragile_x

### MECP2 duplication syndrome

In [24]:
mecp2 = diseases[
    diseases['diseaseName'].str.lower().str.contains('mecp2')
]

mecp2.values

array([['C1968556',
        'ENCEPHALOPATHY, NEONATAL SEVERE, DUE TO MECP2 MUTATIONS']],
      dtype=object)

In [25]:
autism_syndromes['MECP2 duplication Syndrome'] = mecp2

### Tuberous sclerosis complex

In [26]:
ts = diseases[
    (diseases['diseaseName'].str.lower().str.contains('tuberous')) & (diseases['diseaseName'].str.lower().str.contains('sclerosis'))
]

ts.values

array([['C0041341', 'Tuberous Sclerosis'],
       ['C0265319', 'Fibrous skin tumor of tuberous sclerosis'],
       ['C1860707', 'TUBEROUS SCLEROSIS 2 (disorder)'],
       ['C1838327',
        'Polycystic kidneys, severe infantile with tuberous sclerosis'],
       ['C1854465', 'TUBEROUS SCLEROSIS 1 (disorder)']], dtype=object)

In [27]:
autism_syndromes['Tuberous Sclerosis Complex'] = ts

### Angelman's syndrome

In [28]:
angelman = diseases[
    diseases['diseaseName'].str.lower().str.contains('angelman')
]

angelman.values

array([['C0162635', 'Angelman Syndrome']], dtype=object)

In [29]:
autism_syndromes['Angelman Syndrome']=angelman

### phelan-mcdermid syndrome: not found

In [30]:
pm = diseases[
    (diseases['diseaseName'].str.lower().str.contains('phelan')) & (diseases['diseaseName'].str.lower().str.contains('pms'))
]

pm.values

array([], shape=(0, 2), dtype=object)

In [31]:
autism_syndromes['Phelan-Mcdermid Syndrome']=pm

### Timothy syndrome

In [32]:
timothy = diseases[
    diseases['diseaseName'].str.lower().str.contains('timothy')
]

timothy.values

array([['C1832916', 'Timothy syndrome']], dtype=object)

In [33]:
autism_syndromes['Timothy Syndrome']=timothy

### Smith-lemli-opitz syndrome

In [34]:
slo = diseases[
    (diseases['diseaseName'].str.lower().str.contains('smith')) & (diseases['diseaseName'].str.lower().str.contains('lemli'))
]

slo.values

array([['C0175694', 'Smith-Lemli-Opitz Syndrome'],
       ['C0282643', 'Smith-Lemli-Opitz Syndrome, Type I'],
       ['C0282644', 'Smith-Lemli-Opitz Syndrome, Type II']], dtype=object)

In [35]:
autism_syndromes['Smith-Lemli-Opitz Syndrome']=slo

### neurofibromatosis 

In [36]:
neurofibromatosis = diseases[
    diseases['diseaseName'].str.lower().str.contains('neurofibromatosis')
]

neurofibromatosis.values

array([['C0027831', 'Neurofibromatosis 1'],
       ['C0917817', 'Neurofibromatosis 3'],
       ['C1834235', 'NEUROFIBROMATOSIS, FAMILIAL SPINAL'],
       ['C2931482', 'Neurofibromatosis-Noonan syndrome'],
       ['C0027832', 'Neurofibromatosis 2'],
       ['C2931480',
        'Neurofibromatosis, Type 3, mixed central and peripheral'],
       ['C1969623', 'NEUROFIBROMATOSIS, TYPE 1-LIKE SYNDROME']],
      dtype=object)

Delete ambiguous disorder, as it should be labeled as Nooan syndrome in this dataset
- 'C2931482': 'Neurofibromatosis-Noonan syndrome'

In [37]:
neurofibromatosis = neurofibromatosis[neurofibromatosis['diseaseId'] != 'C2931482']
neurofibromatosis.values

array([['C0027831', 'Neurofibromatosis 1'],
       ['C0917817', 'Neurofibromatosis 3'],
       ['C1834235', 'NEUROFIBROMATOSIS, FAMILIAL SPINAL'],
       ['C0027832', 'Neurofibromatosis 2'],
       ['C2931480',
        'Neurofibromatosis, Type 3, mixed central and peripheral'],
       ['C1969623', 'NEUROFIBROMATOSIS, TYPE 1-LIKE SYNDROME']],
      dtype=object)

In [38]:
autism_syndromes['Neurofibromatosis']=neurofibromatosis

### PTEN hamartoma tumor syndrome

In [39]:
hamartoma = diseases[
    (diseases['diseaseName'].str.lower().str.contains('hamartoma')) & diseases['diseaseName'].str.lower().str.contains('syndrome')
]

hamartoma.values

array([['C0018553', 'Hamartoma Syndrome, Multiple'],
       ['C1959582', 'PTEN Hamartoma Tumor Syndrome']], dtype=object)

In [40]:
autism_syndromes['Hamartoma tumor Syndrome']=hamartoma

### Down syndrome

In [41]:
down = diseases[
    (diseases['diseaseName'].str.lower().str.contains('down')) & diseases['diseaseName'].str.lower().str.contains('syndrome')
]

down.values

array([['C0013080', 'Down Syndrome'],
       ['C0432416', 'Down Syndrome, Partial Trisomy 21'],
       ['C1860789', 'Leukemia, Megakaryoblastic, of Down Syndrome'],
       ['C1860788',
        'Transient Myeloproliferative Disorder of Down Syndrome'],
       ['C1832812',
        'Cataracts, Congenital, with Sensorineural Deafness, Down Syndrome-Like Facial Appearance, Short Stature, and Mental Retardation']],
      dtype=object)

In [42]:
autism_syndromes['Down Syndrome']=down

### Cohen's syndrome

In [43]:
cohen = diseases[
    (
        (diseases['diseaseName'].str.lower().str.contains('cohen')) | 
        (diseases['diseaseName'].str.lower().str.contains('pepper')) |
        (diseases['diseaseName'].str.lower().str.contains('cervenka'))
    ) & 
    (
        diseases['diseaseName'].str.lower().str.contains('syndrome')
    )

]

cohen.values

array([['C4479654', 'COHEN-GIBSON SYNDROME'],
       ['C0265223', 'Cohen syndrome']], dtype=object)

In [44]:
autism_syndromes['Cohen Syndrome']=cohen

### Cornelia de lange syndrome

In [45]:
cornelia = diseases[
    (
        (diseases['diseaseName'].str.lower().str.contains('cornelia')) | 
        (diseases['diseaseName'].str.lower().str.contains('brachmann')) |
        (diseases['diseaseName'].str.lower().str.contains('bushy'))
    ) & 
    (
        (diseases['diseaseName'].str.lower().str.contains('syndrome')) |
        (diseases['diseaseName'].str.lower().str.contains('dwarf'))
    )
]


cornelia.values

array([['C0270972', 'Cornelia De Lange Syndrome'],
       ['C1853099', 'Cornelia de Lange Syndrome 3'],
       ['C3553517', 'CORNELIA DE LANGE SYNDROME 4'],
       ['C3550903', 'CORNELIA DE LANGE SYNDROME 5']], dtype=object)

In [46]:
autism_syndromes['Cornelia de Lange Syndrome']=cornelia

### Charge syndrome

In [47]:
charge = diseases[
    (diseases['diseaseName'].str.lower().str.contains('charge')) & 
    ~(diseases['diseaseName'].str.lower().str.contains('discharge')) 
]


charge.values

array([['C0265354', 'CHARGE Syndrome'],
       ['C2936502', 'Familial CHARGE Syndrome']], dtype=object)

In [48]:
autism_syndromes['Charge Syndrome']=charge

### Noonan syndrome

In [49]:
noonan = diseases[
    (diseases['diseaseName'].str.lower().str.contains('noonan')) & 
    (diseases['diseaseName'].str.lower().str.contains('syndrome')) 
]

noonan.values

array([['C0028326', 'Noonan Syndrome'],
       ['C3150970', 'NOONAN SYNDROME 7'],
       ['C3150803',
        'NOONAN SYNDROME-LIKE DISORDER WITH OR WITHOUT JUVENILE MYELOMONOCYTIC LEUKEMIA'],
       ['C1860991', 'NOONAN SYNDROME 3'],
       ['C0036069', 'Saldino-Noonan Syndrome'],
       ['C2931482', 'Neurofibromatosis-Noonan syndrome'],
       ['C2750732', 'Noonan Syndrome 6'],
       ['C4479577',
        'NOONAN SYNDROME-LIKE DISORDER WITH LOOSE ANAGEN HAIR 2'],
       ['C1969057', 'Noonan Syndrome 5'],
       ['C3809233', 'NOONAN SYNDROME 8'],
       ['C1853120', 'Noonan Syndrome 4'],
       ['C4225282', 'NOONAN SYNDROME 9'],
       ['C1843181',
        'Noonan syndrome-like disorder with loose anagen hair'],
       ['C3501846', 'Noonan-Like Syndrome With Loose Anagen Hair'],
       ['C4478716',
        'NOONAN SYNDROME-LIKE DISORDER WITH LOOSE ANAGEN HAIR 1'],
       ['C4225280', 'NOONAN SYNDROME 10']], dtype=object)

In [50]:
autism_syndromes['Noonan Syndrome']=noonan

Find Williams syndrome

In [51]:
william = diseases[
    (
        (diseases['diseaseName'].str.lower().str.contains('william')) & 
        (diseases['diseaseName'].str.lower().str.contains('syndrome'))
    ) | (
        (diseases['diseaseName'].str.lower().str.contains('7q11')) & 
        (diseases['diseaseName'].str.lower().str.contains('deletion'))
    )
]

william.values

array([['C0175702', 'Williams Syndrome']], dtype=object)

In [52]:
autism_syndromes['William Syndrome']=william

### DiGeorge or 22q11.2 deletion syndrome

In [53]:
digeorge = diseases[
    (
        (diseases['diseaseName'].str.lower().str.contains('digeorge')) & 
        (diseases['diseaseName'].str.lower().str.contains('syndrome'))
    ) | (
        (diseases['diseaseName'].str.lower().str.contains('22q11')) & 
        (diseases['diseaseName'].str.lower().str.contains('deletion'))
    )
]

digeorge.values

array([['C0012236', 'DiGeorge Syndrome'],
       ['C2936346', '22q11 Deletion Syndrome'],
       ['C2678480', 'Chromosome 22q11.2 Deletion Syndrome, Distal']],
      dtype=object)

In [54]:
autism_syndromes['Digeorge Syndrome']=digeorge

### Macrocephaly/autism syndrome 

In [55]:
macrocephaly = diseases[diseases['diseaseName'].str.upper().str.contains('MACROCEPHALY/AUTISM SYNDROME')]

macrocephaly.values

array([['C1854416', 'MACROCEPHALY/AUTISM SYNDROME']], dtype=object)

In [56]:
autism_syndromes['Macrocephaly/Autism Syndrome ']=macrocephaly

#### Combine syndromic autism

In [57]:
for key, df in autism_syndromes.items():
    df['autism_subtype'] = key

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['autism_subtype'] = key
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['autism_subtype'] = key
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['autism_subtype'] = key
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [58]:
autism_syndromes['Fragile X Syndrome']

Unnamed: 0,diseaseId,diseaseName,autism_subtype
3422,C0016667,Fragile X Syndrome,Fragile X Syndrome
19131,C1839780,FRAGILE X TREMOR/ATAXIA SYNDROME,Fragile X Syndrome


In [59]:
autism_syndromes_df = pd.concat(autism_syndromes.values(), axis=0, ignore_index=True)

In [60]:
autism_syndromes_df['autism_subtype_broad'] = 'Syndromic Autism'
autism_syndromes_df

Unnamed: 0,diseaseId,diseaseName,autism_subtype,autism_subtype_broad
0,C0035372,Rett Syndrome,Rett Syndrome,Syndromic Autism
1,C2748910,"Rett Syndrome, Atypical",Rett Syndrome,Syndromic Autism
2,C1839332,"Rett Syndrome, Preserved Speech Variant",Rett Syndrome,Syndromic Autism
3,C2677682,"Rett Syndrome, Zappella Variant",Rett Syndrome,Syndromic Autism
4,C4014821,PORETTI-BOLTSHAUSER SYNDROME,Rett Syndrome,Syndromic Autism
5,C0016667,Fragile X Syndrome,Fragile X Syndrome,Syndromic Autism
6,C1839780,FRAGILE X TREMOR/ATAXIA SYNDROME,Fragile X Syndrome,Syndromic Autism
7,C1968556,"ENCEPHALOPATHY, NEONATAL SEVERE, DUE TO MECP2 ...",MECP2 duplication Syndrome,Syndromic Autism
8,C0041341,Tuberous Sclerosis,Tuberous Sclerosis Complex,Syndromic Autism
9,C0265319,Fibrous skin tumor of tuberous sclerosis,Tuberous Sclerosis Complex,Syndromic Autism


# Combine & Export to csv

In [61]:
df = pd.concat([autism_terms, autism_syndromes_df], axis=0)

In [62]:
df.head()

Unnamed: 0,diseaseId,diseaseName,autism_subtype,autism_subtype_broad
0,C0004352,Autistic Disorder,Autism Spectrum Disorder,Non-Syndromic Autism
1,C1510586,Autism Spectrum Disorders,Autism Spectrum Disorder,Non-Syndromic Autism
2,C3275438,"AUTISM, SUSCEPTIBILITY TO, X-LINKED 5",Autism Spectrum Disorder,Non-Syndromic Autism
3,C3552491,"AUTISM, SUSCEPTIBILITY TO, 14A",Autism Spectrum Disorder,Non-Syndromic Autism
4,C1845540,"AUTISM, X-LINKED, SUSCEPTIBILITY TO, 1 (finding)",Autism Spectrum Disorder,Non-Syndromic Autism


In [63]:
df.tail()

Unnamed: 0,diseaseId,diseaseName,autism_subtype,autism_subtype_broad
55,C0175702,Williams Syndrome,William Syndrome,Syndromic Autism
56,C0012236,DiGeorge Syndrome,Digeorge Syndrome,Syndromic Autism
57,C2936346,22q11 Deletion Syndrome,Digeorge Syndrome,Syndromic Autism
58,C2678480,"Chromosome 22q11.2 Deletion Syndrome, Distal",Digeorge Syndrome,Syndromic Autism
59,C1854416,MACROCEPHALY/AUTISM SYNDROME,Macrocephaly/Autism Syndrome,Syndromic Autism


In [64]:
df.to_csv('processed_data/autism_terms.csv', sep='\t')