In [1]:
import pandas as pd
from os import listdir

In [2]:
# Relative path of the folder holding the ADOS data files. The trailing slash is
# required: file names are appended to it by plain string concatenation.
directory = 'ados_datasets/'

#### Preprocessing
- We don't need the identifiers of the collections / patients to do the prediction
- Keep only the features that can have predictive power
- Remove all rows with NaNs
- Update the values, in accordance with: http://www.beginningwitha.com/downloads/ADOS-2%20Presentation.pdf
- Make the diagnosis a categorical data:
    - 1 for all patients on the spectrum
    - 0 otherwise

#### Predicting
- Compare different algorithms
- K-fold cross validation (K=5) - at least

In [3]:
# Read the raw tab-separated export from the data folder into a DataFrame.
filename = 'current_data.txt'
txtname = "{}{}".format(directory, filename)
data = pd.read_csv(txtname, sep="\t")

In [4]:
# Columns kept for the prediction task. Order matters: a later cell slices this
# list with [2:] to skip the first two entries.
potentiallyUseful = [
    # left out of the [2:] slice used for recoding
    "age in months at the time of the interview/test/sampling/imaging.",
    "anxiety",
    # remaining item columns, plus subject sex
    "hand and finger and other complex mannerisms",
    "imagination/creativity",
    "immediate echolalia",
    "quality of social overtures",
    "self-injurious behavior",
    "sex of the subject",
    "shared enjoyment in interaction",
    "tantrums, aggression, negative or disruptive behavior",
    "unusual eye contact",
    # prediction target
    "diagnosis",
]

In [5]:
# Remove the rows with NaN values

# Flag each row that is missing at least one value, report how many there are,
# then keep only the complete rows.
has_nan = data.isnull().any(axis=1)
print ('Number of rows with at least 1 NaN features')
print (int(has_nan.sum()))
data = data[~has_nan]
print ('{} rows left'.format(data.shape[0]))

Number of rows with at least 1 NaN features
6392
15459 rows left


In [6]:
# remove the rows with unknown (9.) variables
# and make the 8. (not applicable) 0 (typical)
# and make the 3. (severe) 2 (not that severe)

# The first two entries of potentiallyUseful (age, anxiety) are left untouched.
coded_columns = potentiallyUseful[2:]

# Drop every row that has code 9 in any of these columns. The explicit copy makes
# `data` an independent frame, so the column assignment below is not a write into
# a filtered slice (the pattern pandas flags with SettingWithCopyWarning).
has_unknown = (data[coded_columns] == 9).any(axis=1)
data = data.loc[~has_unknown].copy()

# Recode 8 -> 0 and 3 -> 2 in one vectorised pass instead of two row-wise applies
# per column. Values other than the numbers 8 and 3 (text labels included) are
# left as they are.
data[coded_columns] = data[coded_columns].replace({8: 0, 3: 2})
print (str(data.shape[0]) + ' rows left')

14042 rows left


In [7]:
# Make the diagnosis column categorical

# Raw labels are strings; only purely numeric ones can be read as a code. Every
# other label becomes NaN here and its row is dropped below.
is_numeric_label = data['diagnosis'].apply(lambda x: x.isnumeric())
diagnosis_code = pd.to_numeric(data['diagnosis'].where(is_numeric_label))

# Code 2 is merged into 1; a row is kept only if its code is then 0 or 1.
diagnosis_code = diagnosis_code.replace(2, 1)
is_binary = diagnosis_code.isin([0, 1])

# Filter first, take an explicit copy, then use plain column assignment. This
# replaces the chained `data['diagnosis'][mask] = -1` write, which pandas reports
# with SettingWithCopyWarning because it may modify a temporary object.
data = data.loc[is_binary].copy()
data['diagnosis'] = diagnosis_code[is_binary]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [8]:
# Create a isMale column

# 1 where the recorded sex is 'M', 0 for every other value.
data['isMale'] = (data['sex of the subject'] == 'M').astype('int64')

# Swap the raw sex column for the derived flag in the feature list. The membership
# checks keep this cell safe to re-run: an unconditional remove() raises ValueError
# the second time, and an unconditional append() would add 'isMale' twice.
if 'sex of the subject' in potentiallyUseful:
    potentiallyUseful.remove('sex of the subject')
if 'isMale' not in potentiallyUseful:
    potentiallyUseful.append('isMale')

In [9]:
# Print how often each value occurs, one selected feature at a time.
selected = data[potentiallyUseful]
for col in selected.columns:
    counts = selected[col].value_counts()
    print("COLUMN:", col)
    print(counts)

COLUMN: age in months at the time of the interview/test/sampling/imaging.
36.0     469
24.0     368
37.0     284
25.0     256
38.0     188
84.0     129
72.0     123
110.0    118
40.0     117
34.0     113
60.0     109
48.0     108
96.0     107
41.0     106
26.0     102
39.0     100
50.0      99
49.0      96
126.0     95
42.0      93
35.0      92
33.0      89
75.0      87
103.0     84
99.0      83
62.0      83
134.0     82
94.0      81
32.0      81
83.0      79
        ... 
308.0      1
316.0      1
336.0      1
289.0      1
378.0      1
365.0      1
475.0      1
642.0      1
440.0      1
370.0      1
487.0      1
408.0      1
285.0      1
431.0      1
324.0      1
447.0      1
346.0      1
329.0      1
342.0      1
356.0      1
613.0      1
499.0      1
357.0      1
424.0      1
296.0      1
257.0      1
367.0      1
362.0      1
284.0      1
349.0      1
Name: age in months at the time of the interview/test/sampling/imaging., Length: 417, dtype: int64
COLUMN: anxiety
0.0    9673
1.0   

In [10]:
def update_columns_labels(data, new_columns):
    """Return a renamed copy of ``data``; the input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column labels should be replaced.
    new_columns : dict
        Mapping from current column label to new label. Columns that are not
        keys of the mapping keep their label.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns. Its index labels are converted to
        strings, because ``str`` is passed as the index mapper.
    """
    return data.rename(columns=new_columns, index=str)

In [11]:
# Short snake_case labels for the exported file, keyed by the original column label.
new_columns = {
    'age in months at the time of the interview/test/sampling/imaging.': 'age_months',
    'anxiety': 'anxiety',
    'hand and finger and other complex mannerisms': 'hand_finger_mannerisms',
    'imagination/creativity': 'imagination_creativity',
    'immediate echolalia': 'immediate_echolalia',
    'quality of social overtures': 'quality_social_overtures',
    'self-injurious behavior': 'self_injurious_behavior',
    'shared enjoyment in interaction': 'shared_enjoyment_interaction',
    'tantrums, aggression, negative or disruptive behavior': 'tantrums_aggression_disruptive_behavior',
    'unusual eye contact': 'unusual_eye_contact',
    'diagnosis': 'diagnosis',
    'isMale': 'is_male',
}

# Export only the selected features, under their short labels, as a
# semicolon-separated file without the index column.
selected_features = data[potentiallyUseful]
export_frame = update_columns_labels(selected_features, new_columns)
export_frame.to_csv('ados_datasets/data.csv', sep=';', index=False)

In [12]:
# Pairwise correlation matrix of the selected features (diagnosis included).
data.loc[:, potentiallyUseful].corr()

Unnamed: 0,age in months at the time of the interview/test/sampling/imaging.,anxiety,hand and finger and other complex mannerisms,imagination/creativity,immediate echolalia,quality of social overtures,self-injurious behavior,shared enjoyment in interaction,"tantrums, aggression, negative or disruptive behavior",unusual eye contact,diagnosis,isMale
age in months at the time of the interview/test/sampling/imaging.,1.0,0.056746,-0.154821,-0.117888,-0.203015,0.028345,-0.0306,0.106497,-0.238769,0.057719,0.167637,0.048749
anxiety,0.056746,1.0,0.006214,0.097814,0.032011,0.056955,0.039881,0.077582,0.093,0.026523,0.066082,-0.010841
hand and finger and other complex mannerisms,-0.154821,0.006214,1.0,0.312081,0.19833,0.372907,0.158569,0.206366,0.148109,0.287556,0.201572,0.059204
imagination/creativity,-0.117888,0.097814,0.312081,1.0,0.166527,0.526446,0.151514,0.380416,0.267802,0.320025,0.252941,0.12582
immediate echolalia,-0.203015,0.032011,0.19833,0.166527,1.0,0.200621,0.04259,0.035387,0.141997,0.165302,0.111972,0.025785
quality of social overtures,0.028345,0.056955,0.372907,0.526446,0.200621,1.0,0.165565,0.52755,0.275338,0.586259,0.516583,0.140522
self-injurious behavior,-0.0306,0.039881,0.158569,0.151514,0.04259,0.165565,1.0,0.108376,0.220866,0.113599,0.065916,0.033753
shared enjoyment in interaction,0.106497,0.077582,0.206366,0.380416,0.035387,0.52755,0.108376,1.0,0.181887,0.386919,0.314767,0.117742
"tantrums, aggression, negative or disruptive behavior",-0.238769,0.093,0.148109,0.267802,0.141997,0.275338,0.220866,0.181887,1.0,0.142278,0.101401,0.061314
unusual eye contact,0.057719,0.026523,0.287556,0.320025,0.165302,0.586259,0.113599,0.386919,0.142278,1.0,0.536389,0.144237


In [15]:
# Per-column count of non-null values for rows whose recorded age is 24 months or less.
age_column = 'age in months at the time of the interview/test/sampling/imaging.'
at_most_24_months = data[age_column] <= 24
data[at_most_24_months].count()

ados diagnosis classification                                                                      575
age in months at the time of the interview/test/sampling/imaging.                                  575
anxiety                                                                                            575
collection_id                                                                                      575
collection_title                                                                                   575
dataset_id                                                                                         575
date on which the interview/genetic test/sampling/imaging/biospecimen was completed. mm/dd/yyyy    575
hand and finger and other complex mannerisms                                                       575
imagination/creativity                                                                             575
immediate echolalia                                                      

In [16]:
# Show the labels of all columns currently in `data`.
data.columns

Index(['ados diagnosis classification',
       'age in months at the time of the interview/test/sampling/imaging.',
       'anxiety', 'collection_id', 'collection_title', 'dataset_id',
       'date on which the interview/genetic test/sampling/imaging/biospecimen was completed. mm/dd/yyyy',
       'hand and finger and other complex mannerisms',
       'imagination/creativity', 'immediate echolalia',
       'overall ados diagnosis', 'overall level of non-echoed language',
       'promoted_subjectkey', 'quality of social overtures',
       'self-injurious behavior', 'sex of the subject',
       'shared enjoyment in interaction',
       'stereotyped/idiosyncratic use of words or phrases',
       'subject id how it's defined in lab/project',
       'tantrums, aggression, negative or disruptive behavior',
       'the ndar global unique identifier (guid) for research subject',
       'unusual eye contact', 'diagnosis', 'isMale'],
      dtype='object')