In [1]:
import pandas as pd
from os import listdir

In [2]:
# Folder that holds the ADOS data files. The trailing slash matters because
# the load cell builds the file path by plain string concatenation.
directory = "ados_datasets/"

In [3]:
# Load the tab-separated export into a DataFrame named `data`.
filename='current_data.txt'
# `directory` already ends with '/', so concatenation yields a valid path.
txtname = directory+filename
data = pd.read_csv(txtname, sep="\t")

In [4]:
# Display the column labels of the freshly loaded frame.
data.columns

Index(['ados diagnosis classification',
       'age in months at the time of the interview/test/sampling/imaging.',
       'anxiety', 'collection_id', 'collection_title', 'dataset_id',
       'date on which the interview/genetic test/sampling/imaging/biospecimen was completed. mm/dd/yyyy',
       'hand and finger and other complex mannerisms',
       'imagination/creativity', 'immediate echolalia',
       'overall ados diagnosis', 'promoted_subjectkey',
       'quality of social overtures', 'self-injurious behavior',
       'sex of the subject', 'shared enjoyment in interaction',
       'subject id how it's defined in lab/project',
       'tantrums, aggression, negative or disruptive behavior',
       'the ndar global unique identifier (guid) for research subject',
       'unusual eye contact', 'diagnosis'],
      dtype='object')

#### Preprocessing
- We don't need the identifiers of the collections / patients to do the prediction
- Keep only the features that can have predictive power
- Remove all rows with NaNs
- Update the values, in accordance with: http://www.beginningwitha.com/downloads/ADOS-2%20Presentation.pdf
- Make the diagnosis a binary categorical variable:
    - 1 for all patients on the spectrum
    - 0 otherwise

#### Predicting
- Compare different algorithms
- Use K-fold cross-validation (at least K=5)

In [5]:
# Columns kept for modelling: the age column first, then the item scores,
# the subject's sex, and finally the 'diagnosis' target.
# The order matters: a later cell slices this list by position.
potentiallyUseful = [
    'age in months at the time of the interview/test/sampling/imaging.',
    'anxiety',
    'hand and finger and other complex mannerisms',
    'imagination/creativity',
    'immediate echolalia',
    'quality of social overtures',
    'self-injurious behavior',
    'sex of the subject',
    'shared enjoyment in interaction',
    'tantrums, aggression, negative or disruptive behavior',
    'unusual eye contact',
    'diagnosis',
]

In [6]:
# Drop every row that has a missing value in any column, and report how
# many rows were discarded and how many remain.
has_nan = data.isnull().any(axis=1)

print('Number of rows with at least 1 NaN features')
print(int(has_nan.sum()))
data = data[~has_nan]
print(str(len(data)) + ' rows left')

Number of rows with at least 1 NaN features
1339
2733 rows left


In [7]:
# Recode the item scores column by column:
#   9 (unknown)        -> the whole row is dropped
#   8 (not applicable) -> 0 (typical)
#   3 (severe)         -> 2 (merged with the next-lower severity level)
#
# NOTE(review): the slice starts at index 2, so the first two entries of
# potentiallyUseful (the age column and 'anxiety') are not recoded, while
# 'sex of the subject' and the 'diagnosis' target ARE processed. Confirm that
# skipping 'anxiety' and recoding 'diagnosis' are both intended.
for col in potentiallyUseful[2:]:
    # Take an explicit copy of the filtered rows: assigning into the result of
    # a boolean selection is chained assignment and raises
    # SettingWithCopyWarning.
    data = data[data[col] != 9].copy()
    # Single vectorised pass; the two mappings cannot chain into each other
    # because neither target value (0, 2) is itself a key.
    data[col] = data[col].replace({8: 0, 3: 2})

In [8]:
# Make the diagnosis column categorical:
# fold label 2 into label 1, then keep only rows whose label is non-negative.
data['diagnosis'] = data['diagnosis'].replace(2, 1)
is_labelled = data['diagnosis'] >= 0
data = data[is_labelled]

In [9]:
# Create an isMale column: 1 where the recorded sex is 'M', 0 for anything else.
# assign() returns a new frame, so no column is written into a filtered slice
# of an earlier frame (which would raise SettingWithCopyWarning).
data = data.assign(isMale=(data['sex of the subject'] == 'M').astype('int64'))

# Swap the raw sex column for the encoded one in the feature list.
# Both edits are guarded so the cell can be re-run safely: an unguarded
# remove() raises ValueError on the second run, and an unguarded append()
# would add 'isMale' twice.
if 'sex of the subject' in potentiallyUseful:
    potentiallyUseful.remove('sex of the subject')
if 'isMale' not in potentiallyUseful:
    potentiallyUseful.append('isMale')

In [10]:
# Print the frequency table of every selected column to sanity-check the
# recoded values.
for column_name in potentiallyUseful:
    print("COLUMN:", column_name)
    counts = data[column_name].value_counts()
    print(counts)

COLUMN: age in months at the time of the interview/test/sampling/imaging.
24     271
36     195
25     155
31     127
38     118
33     112
37     100
34      98
32      95
26      87
35      82
40      74
30      68
41      53
39      40
42      40
28      39
29      38
27      32
49      28
45      27
46      26
44      24
23      24
18      23
50      21
43      19
47      19
61      18
48      17
      ... 
105      1
109      1
125      1
127      1
145      1
100      1
147      1
151      1
157      1
159      1
203      1
215      1
232      1
220      1
219      1
208      1
206      1
194      1
186      1
184      1
180      1
178      1
176      1
148      1
146      1
142      1
132      1
122      1
120      1
14       1
Name: age in months at the time of the interview/test/sampling/imaging., Length: 139, dtype: int64
COLUMN: anxiety
0.0    2159
1.0     449
2.0      34
Name: anxiety, dtype: int64
COLUMN: hand and finger and other complex mannerisms
0.0    1533
2.0     644

In [17]:
def update_columns_labels(data, new_columns):
    """Return a relabelled copy of ``data``; the input frame is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column labels should be renamed.
    new_columns : dict
        Mapping of existing column label -> new column label. Labels that are
        not present in the mapping are kept as they are.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed columns. Its index labels are converted to
        strings.
    """
    relabelled = data.rename(columns=new_columns)
    # Index labels are passed through str(), as a separate rename step.
    return relabelled.rename(index=str)

In [18]:
# Short snake_case names for the verbose source column labels; used only for
# the exported CSV, the in-memory frame keeps its original labels.
new_columns = {
    "age in months at the time of the interview/test/sampling/imaging.": "age_months",
    "anxiety": "anxiety",
    "hand and finger and other complex mannerisms": "hand_finger_mannerisms",
    "imagination/creativity": "imagination_creativity",
    "immediate echolalia": "immediate_echolalia",
    "quality of social overtures": "quality_social_overtures",
    "self-injurious behavior": "self_injurious_behavior",
    "shared enjoyment in interaction": "shared_enjoyment_interaction",
    "tantrums, aggression, negative or disruptive behavior": "tantrums_aggression_disruptive_behavior",
    "unusual eye contact": "unusual_eye_contact",
    "diagnosis": "diagnosis",
    "isMale": "is_male",
}

# Select the modelling columns, relabel them, and write a ';'-separated file
# without the index column.
selected_columns = data[potentiallyUseful]
export_frame = update_columns_labels(selected_columns, new_columns)
export_frame.to_csv('ados_datasets/finalDataV1.csv', sep=';', index=False)

In [177]:
# Pairwise correlations among the columns listed in potentiallyUseful
# (features plus the diagnosis label).
data[potentiallyUseful].corr()

Unnamed: 0,age in months at the time of the interview/test/sampling/imaging.,anxiety,hand and finger and other complex mannerisms,imagination/creativity,immediate echolalia,quality of social overtures,self-injurious behavior,shared enjoyment in interaction,"tantrums, aggression, negative or disruptive behavior",unusual eye contact,diagnosis,isMale
age in months at the time of the interview/test/sampling/imaging.,1.0,-0.090634,0.169771,0.039551,0.014768,0.23648,0.140257,0.233195,-0.07496,0.251066,-0.043004,0.066696
anxiety,-0.090634,1.0,0.00206,0.105404,0.071787,0.030005,-0.00302,0.049894,0.062059,-0.008694,0.066537,-0.055797
hand and finger and other complex mannerisms,0.169771,0.00206,1.0,0.39622,0.179522,0.544921,0.190004,0.383812,0.11953,0.488809,0.28644,0.101577
imagination/creativity,0.039551,0.105404,0.39622,1.0,0.11738,0.531777,0.14877,0.465312,0.206934,0.433428,0.27263,0.160758
immediate echolalia,0.014768,0.071787,0.179522,0.11738,1.0,0.236666,-0.000703,0.12813,0.066119,0.258457,0.272843,0.049982
quality of social overtures,0.23648,0.030005,0.544921,0.531777,0.236666,1.0,0.209271,0.623203,0.25678,0.709134,0.532987,0.168133
self-injurious behavior,0.140257,-0.00302,0.190004,0.14877,-0.000703,0.209271,1.0,0.163843,0.149724,0.177077,0.06528,0.06217
shared enjoyment in interaction,0.233195,0.049894,0.383812,0.465312,0.12813,0.623203,0.163843,1.0,0.228244,0.524898,0.331309,0.131159
"tantrums, aggression, negative or disruptive behavior",-0.07496,0.062059,0.11953,0.206934,0.066119,0.25678,0.149724,0.228244,1.0,0.168595,0.181847,0.058802
unusual eye contact,0.251066,-0.008694,0.488809,0.433428,0.258457,0.709134,0.177077,0.524898,0.168595,1.0,0.559827,0.150613


In [178]:
# Summary statistics (count, mean, std, quartiles) of the cleaned frame.
data.describe()

Unnamed: 0,age in months at the time of the interview/test/sampling/imaging.,anxiety,collection_id,dataset_id,hand and finger and other complex mannerisms,imagination/creativity,immediate echolalia,quality of social overtures,self-injurious behavior,shared enjoyment in interaction,"tantrums, aggression, negative or disruptive behavior",unusual eye contact,diagnosis,isMale
count,2642.0,2642.0,2642.0,2642.0,2642.0,2642.0,2642.0,2642.0,2642.0,2642.0,2642.0,2642.0,2642.0,2642.0
mean,42.747918,0.195685,998.984103,10276.702498,0.663512,1.026874,0.466313,0.818319,0.057911,0.487131,0.35617,0.962907,0.342922,0.716503
std,25.140198,0.428019,1108.171422,1894.888643,0.843233,0.747859,0.676736,0.793955,0.275285,0.658749,0.61097,0.999501,0.474776,0.450781
min,7.0,0.0,9.0,8161.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28.0,0.0,9.0,8161.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,36.0,0.0,19.0,11014.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,45.0,0.0,2368.0,12081.0,1.0,2.0,1.0,1.0,0.0,1.0,1.0,2.0,1.0,1.0
max,272.0,2.0,2382.0,13432.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0


In [165]:
# Re-display the column labels after preprocessing; 'isMale' is now present.
data.columns

Index(['ados diagnosis classification',
       'age in months at the time of the interview/test/sampling/imaging.',
       'anxiety', 'collection_id', 'collection_title', 'dataset_id',
       'date on which the interview/genetic test/sampling/imaging/biospecimen was completed. mm/dd/yyyy',
       'hand and finger and other complex mannerisms',
       'imagination/creativity', 'immediate echolalia',
       'overall ados diagnosis', 'promoted_subjectkey',
       'quality of social overtures', 'self-injurious behavior',
       'sex of the subject', 'shared enjoyment in interaction',
       'subject id how it's defined in lab/project',
       'tantrums, aggression, negative or disruptive behavior',
       'the ndar global unique identifier (guid) for research subject',
       'unusual eye contact', 'diagnosis', 'isMale'],
      dtype='object')