In [1]:
import pandas as pd
from os import listdir

In [2]:
# Relative folder prefix (with trailing slash) used to build the data file paths.
directory = "ados_datasets_all/"

#### Preprocessing
- We don't need the identifiers of the collections / patients to do the prediction
- Keep only the features that can have predictive power
- Remove all rows with NaNs
- Update the values, in accordance with: http://www.beginningwitha.com/downloads/ADOS-2%20Presentation.pdf
- Make the diagnosis a categorical variable:
    - 1 for all patients on the spectrum
    - 0 otherwise

#### Predicting
- Compare different algorithms
- K-fold cross validation (K=5) - at least

In [3]:
# Read the tab-separated data file located under `directory` into a DataFrame.
filename = 'current_data.txt'
txtname = '{}{}'.format(directory, filename)
data = pd.read_csv(txtname, sep="\t")

In [4]:
# Columns kept for the analysis: the candidate predictors plus 'diagnosis'.
# Order matters: later code slices this list by position (e.g. [2:]),
# so the age column must stay first and 'anxiety' second.
potentiallyUseful = ['age in months at the time of the interview/test/sampling/imaging.',
        'anxiety',
        'hand and finger and other complex mannerisms',
        'imagination/creativity', 
        'immediate echolalia',
        'quality of social overtures', 
        'self-injurious behavior',
        'sex of the subject', 
        'shared enjoyment in interaction',
        'tantrums, aggression, negative or disruptive behavior',
        'unusual eye contact',
        'diagnosis']

In [5]:
# Drop every row that contains at least one missing value and report the counts.

rows_with_nan = data.isnull().any(axis=1)  # True where any column is NaN
print ('Number of rows with at least 1 NaN features')
print (data[rows_with_nan].shape[0])
data = data[~rows_with_nan]
print (str(data.shape[0]) + ' rows left')

Number of rows with at least 1 NaN features
5959
15892 rows left


In [6]:
# Clean the coded columns:
#   - drop rows where any selected column equals 9 (unknown),
#   - recode 8 (not applicable) as 0 (typical),
#   - recode 3 (severe) as 2, merging the two highest severity levels.
#
# NOTE(review): the [2:] slice skips 'anxiety' along with the age column, and
# it still includes 'sex of the subject' and 'diagnosis'. The selection is kept
# as-is so the resulting row count does not change -- confirm this is intended.
score_columns = potentiallyUseful[2:]

# Recoding never creates or removes a 9, so one combined mask selects the same
# rows as filtering column by column.
no_unknown = (data[score_columns] != 9).all(axis=1)

# Take an explicit copy before assigning: writing columns into a frame obtained
# by boolean filtering is what triggers pandas' SettingWithCopyWarning.
data = data[no_unknown].copy()

# Vectorized recode; values other than 8 and 3 (including strings) are untouched.
data[score_columns] = data[score_columns].replace({8: 0, 3: 2})
print (str(data.shape[0]) + ' rows left')

14401 rows left


In [7]:
# Turn the diagnosis column into a binary label:
#   raw codes 1 and 2 -> 1, raw code 0 -> 0.
# Rows whose diagnosis is not a purely numeric string, or is any other code,
# are dropped.
#
# The previous version wrote a -1 sentinel through chained indexing
# (data['diagnosis'][mask] = -1). That raises SettingWithCopyWarning and, with
# pandas Copy-on-Write enabled, never modifies `data`, so unexpected codes
# would survive the filter. The sentinel is avoided entirely here.

# The column holds strings at this point; flag the ones that are all digits.
is_numeric = data['diagnosis'].apply(lambda x: x.isnumeric())

# Parse the numeric entries and fold code 2 into 1.
diagnosis = pd.to_numeric(data['diagnosis'][is_numeric]).apply(lambda x: 1 if x == 2 else x)

# assign() aligns on the index, so non-numeric rows receive NaN; it also returns
# a new frame instead of mutating a filtered view.
data = data.assign(diagnosis=diagnosis)

# Keep only the rows labelled 0 or 1 (NaN and every other code are removed).
data = data[data['diagnosis'].isin([0, 1])].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [8]:
# Add a numeric 'isMale' column (1 when the sex is 'M', 0 otherwise) and swap it
# in for the textual sex column in the list of selected features.

data['isMale'] = (data['sex of the subject'] == 'M').astype('int64')
potentiallyUseful.remove('sex of the subject')
potentiallyUseful += ['isMale']  # in-place extend, same list object

In [9]:
# Show the distribution of values for each selected column.
for col in data[potentiallyUseful].columns:
    counts = data[col].value_counts()
    print("COLUMN:", col)
    print(counts)

COLUMN: age in months at the time of the interview/test/sampling/imaging.
36.0     469
24.0     368
37.0     284
25.0     256
38.0     188
84.0     130
110.0    125
72.0     125
40.0     117
96.0     114
34.0     113
60.0     109
48.0     108
41.0     106
126.0    103
26.0     102
39.0     100
50.0      99
49.0      96
42.0      93
35.0      92
103.0     91
33.0      89
108.0     89
75.0      88
99.0      86
83.0      85
94.0      85
98.0      84
134.0     84
        ... 
475.0      1
357.0      1
393.0      1
340.0      1
470.0      1
579.0      1
257.0      1
490.0      1
481.0      1
365.0      1
540.0      1
370.0      1
263.0      1
378.0      1
267.0      1
516.0      1
349.0      1
576.0      1
304.0      1
412.0      1
426.0      1
346.0      1
308.0      1
510.0      1
433.0      1
456.0      1
499.0      1
374.0      1
301.0      1
258.0      1
Name: age in months at the time of the interview/test/sampling/imaging., Length: 439, dtype: int64
COLUMN: anxiety
0.0    9987
1.0   

In [10]:
def update_columns_labels(data, new_columns):
    """Return a renamed copy of ``data``; the input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column labels should be replaced.
    new_columns : dict
        Mapping from existing column label to new label. Columns that are
        not keys of the mapping keep their current name.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns. The index labels are also
        converted to strings, because ``index=str`` is passed to ``rename``.
    """
    return data.rename(columns=new_columns, index=str)

In [11]:
# Short snake_case names for the exported columns; keys are the raw column labels.
new_columns = {
    "age in months at the time of the interview/test/sampling/imaging.": "age_months",
    "anxiety": "anxiety",
    "hand and finger and other complex mannerisms": "hand_finger_mannerisms",
    "imagination/creativity": "imagination_creativity",
    "immediate echolalia": "immediate_echolalia",
    "quality of social overtures": "quality_social_overtures",
    "self-injurious behavior": "self_injurious_behavior",
    "shared enjoyment in interaction": "shared_enjoyment_interaction",
    "tantrums, aggression, negative or disruptive behavior": "tantrums_aggression_disruptive_behavior",
    "unusual eye contact": "unusual_eye_contact",
    "diagnosis": "diagnosis",
    "isMale": "is_male"
}

# Export the selected, renamed columns as a ';'-separated CSV without the index.
# The path is built from `directory` instead of repeating the folder name, so
# the input and output locations cannot drift apart.
output_path = directory + 'data.csv'
update_columns_labels(data[potentiallyUseful], new_columns).to_csv(output_path, sep=';', index=False)

In [12]:
# Pairwise correlation matrix of the selected columns (pandas default method).
data[potentiallyUseful].corr()

Unnamed: 0,age in months at the time of the interview/test/sampling/imaging.,anxiety,hand and finger and other complex mannerisms,imagination/creativity,immediate echolalia,quality of social overtures,self-injurious behavior,shared enjoyment in interaction,"tantrums, aggression, negative or disruptive behavior",unusual eye contact,diagnosis,isMale
age in months at the time of the interview/test/sampling/imaging.,1.0,0.067794,-0.154159,-0.115431,-0.201017,0.026853,-0.034192,0.09965,-0.236583,0.053761,0.169492,0.049366
anxiety,0.067794,1.0,0.004422,0.096542,0.029169,0.05673,0.039161,0.07585,0.090023,0.027491,0.065079,-0.009394
hand and finger and other complex mannerisms,-0.154159,0.004422,1.0,0.308297,0.199849,0.367977,0.157873,0.198913,0.149204,0.280814,0.195564,0.055404
imagination/creativity,-0.115431,0.096542,0.308297,1.0,0.171202,0.523083,0.150266,0.378542,0.264931,0.319486,0.250249,0.123735
immediate echolalia,-0.201017,0.029169,0.199849,0.171202,1.0,0.202054,0.043963,0.036783,0.144207,0.165556,0.108605,0.02423
quality of social overtures,0.026853,0.05673,0.367977,0.523083,0.202054,1.0,0.163706,0.524976,0.275487,0.581358,0.513961,0.1372
self-injurious behavior,-0.034192,0.039161,0.157873,0.150266,0.043963,0.163706,1.0,0.105979,0.220799,0.113647,0.064132,0.032677
shared enjoyment in interaction,0.09965,0.07585,0.198913,0.378542,0.036783,0.524976,0.105979,1.0,0.182298,0.38389,0.317042,0.116489
"tantrums, aggression, negative or disruptive behavior",-0.236583,0.090023,0.149204,0.264931,0.144207,0.275487,0.220799,0.182298,1.0,0.143344,0.098603,0.058946
unusual eye contact,0.053761,0.027491,0.280814,0.319486,0.165556,0.581358,0.113647,0.38389,0.143344,1.0,0.532292,0.141249


In [13]:
# Per-column count of non-null values for rows with an age of 24 months or less.
data[data['age in months at the time of the interview/test/sampling/imaging.']<=24].count()

ados diagnosis classification                                                                      575
age in months at the time of the interview/test/sampling/imaging.                                  575
anxiety                                                                                            575
collection_id                                                                                      575
collection_title                                                                                   575
dataset_id                                                                                         575
date on which the interview/genetic test/sampling/imaging/biospecimen was completed. mm/dd/yyyy    575
hand and finger and other complex mannerisms                                                       575
imagination/creativity                                                                             575
immediate echolalia                                                      

In [14]:
# List every column label currently present in the frame.
data.columns

Index(['ados diagnosis classification',
       'age in months at the time of the interview/test/sampling/imaging.',
       'anxiety', 'collection_id', 'collection_title', 'dataset_id',
       'date on which the interview/genetic test/sampling/imaging/biospecimen was completed. mm/dd/yyyy',
       'hand and finger and other complex mannerisms',
       'imagination/creativity', 'immediate echolalia',
       'overall ados diagnosis', 'promoted_subjectkey',
       'quality of social overtures', 'self-injurious behavior',
       'sex of the subject', 'shared enjoyment in interaction',
       'stereotyped/idiosyncratic use of words or phrases',
       'subject id how it's defined in lab/project',
       'tantrums, aggression, negative or disruptive behavior',
       'the ndar global unique identifier (guid) for research subject',
       'unusual eye contact', 'diagnosis', 'isMale'],
      dtype='object')