In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from scipy import stats

# Preprocessing

In [2]:
# Read in dataset.
dataset = pd.read_csv('allSurveys-7-18-22-NMV-class-ASD-only-newSample.csv')

# Read in question mapping.
question_mapping_df = pd.read_csv('allSurveys-7-18-22-NMV-class-ASD-only-mapping.csv')

In [3]:
# Convert question mapping to dict.

question_mapping = {}
for index, row in question_mapping_df.iterrows():
  question_mapping['f' + str(row['fNumber'])] = str(row['fText'])

In [4]:
# Make sure dataset is balanced.
dataset['class'].value_counts()

0    265
1    109
Name: class, dtype: int64

In [5]:
# Basic preprocessing (convert everything into numbers).
processed_dataset = dataset.replace(to_replace=['male', 'female', 'other', 'yes', 'no'], value=[0, 1, -1, 1, 0])
processed_dataset

Unnamed: 0,childId,class,f1,f2,f3,f4,f5,f6,f7,f8,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
0,40,0,13,2,3,3,0,5,3,2,...,4,4,4,4,3,2,2,5,4,5
1,58,0,19,2,3,4,4,5,4,5,...,4,5,4,5,3,2,3,4,4,4
2,59,1,17,5,5,3,4,5,2,2,...,3,5,5,3,2,2,2,2,2,2
3,60,0,9,4,3,4,4,4,3,3,...,5,5,5,5,5,4,4,4,5,4
4,124,0,17,3,5,2,2,2,5,3,...,2,0,4,5,3,4,5,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,8156,0,8,2,2,2,3,3,3,2,...,5,4,5,5,5,4,4,5,4,4
370,8157,0,3,4,4,4,4,5,4,3,...,5,5,5,5,5,1,2,5,3,5
371,8158,0,3,1,3,3,3,3,1,1,...,3,5,5,4,1,2,2,3,2,3
372,8159,0,17,3,3,3,2,3,3,3,...,1,4,3,1,1,3,2,2,2,3


In [6]:
# Balance dataset.

from sklearn.utils import resample

df_majority = processed_dataset[processed_dataset['class'] == 0]
df_minority = processed_dataset[processed_dataset['class'] == 1]

print(df_majority)

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=265,    # to match majority class
                                 random_state=42)  # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_upsampled['class'].value_counts()

     childId  class  f1  f2  f3  f4  f5  f6  f7  f8  ...  f109  f110  f111  \
0         40      0  13   2   3   3   0   5   3   2  ...     4     4     4   
1         58      0  19   2   3   4   4   5   4   5  ...     4     5     4   
3         60      0   9   4   3   4   4   4   3   3  ...     5     5     5   
4        124      0  17   3   5   2   2   2   5   3  ...     2     0     4   
5        128      0   7   3   2   3   2   3   3   4  ...     5     5     4   
..       ...    ...  ..  ..  ..  ..  ..  ..  ..  ..  ...   ...   ...   ...   
368     8149      0   9   4   4   3   3   2   4   3  ...     5     5     5   
369     8156      0   8   2   2   2   3   3   3   2  ...     5     4     5   
370     8157      0   3   4   4   4   4   5   4   3  ...     5     5     5   
371     8158      0   3   1   3   3   3   3   1   1  ...     3     5     5   
372     8159      0  17   3   3   3   2   3   3   3  ...     1     4     3   

     f112  f113  f114  f115  f116  f117  f118  
0       4     3

0    265
1    265
Name: class, dtype: int64

In [7]:
# Get X and y matrices for machine learning.

X_column_names = processed_dataset.columns.to_list()
X_column_names.remove('class')
X_column_names.remove('childId')
X = processed_dataset[X_column_names].values

y = processed_dataset['class'].values

In [8]:
# Baseline SVM classifier.

scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
      
clf = SVC()
cv_results = cross_validate(clf, X, y, cv=3, scoring=scoring)

print('Accuracy: ', stats.describe(cv_results['test_accuracy']))
print('Precision: ', stats.describe(cv_results['test_precision']))
print('Recall: ', stats.describe(cv_results['test_recall']))
print('F1 Score: ', stats.describe(cv_results['test_f1_score']))

Accuracy:  DescribeResult(nobs=3, minmax=(0.72, 0.7338709677419355), mean=0.7272903225806452, variance=4.847866805411058e-05, skewness=-0.18530497351766292, kurtosis=-1.499999999999995)
Precision:  DescribeResult(nobs=3, minmax=(0.75, 1.0), mean=0.8333333333333334, variance=0.020833333333333336, skewness=0.7071067811865465, kurtosis=-1.500000000000001)
Recall:  DescribeResult(nobs=3, minmax=(0.08108108108108109, 0.08333333333333333), mean=0.08258258258258257, variance=1.6908800692584338e-06, skewness=-0.7071067811865084, kurtosis=-1.5000000000000366)
F1 Score:  DescribeResult(nobs=3, minmax=(0.14634146341463414, 0.15384615384615385), mean=0.15006253908692932, variance=1.4083027971281365e-05, skewness=0.03060693032775119, kurtosis=-1.4999999999999996)


# Tree-based Feature Selection and Predictive Model

In [9]:
# Tree-based feature selection.

clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(X, y)
feature_importances = clf.feature_importances_  

# Sort by most important features.
feature_and_importances = [(X_column_names[i], feature_importances[i]) for i in range(len(X_column_names))]
sorted_features_and_importances = sorted(feature_and_importances, key=lambda x: x[1], reverse=True)

# Print out most important features.
for x in sorted_features_and_importances:
  if x[0] in question_mapping:
    print(x[1], '=', question_mapping[x[0]])
  else:
    print(x[1], '=', x[0])

# print(sorted_features_and_importances)

0.020742335106266937 = SCQ=Has childName ever had any objects (other than a soft toy or comfort blanket) that she/he had to carry around?
0.017307097521120245 = SSP=Has a weak grasp
0.017179254828104833 = SSP=Poor endurance/tires easily
0.017025496941819748 = SSP=Can't work with background noise (for example, fan, refrigerator)
0.01609955890713636 = SSP=Seems to have weak muscles
0.015374577692467244 = SCQ=Has childName ever got her/his pronouns mixed up (e.g., saying you or she/he for I)?
0.012565472091865672 = SSP=Can't lift heavy objects (for example, weak in comparison to same age children)
0.012241789838506014 = ASDQ=Become preoccupied with visual patterns or sounds? (for example, fascination with the way something looks or moves, excessive focus on certain sounds or videos)
0.012021724754788752 = SSP=Covers eyes or squints to protect eyes from light
0.011858957357147926 = SSP=Will only eat certain tastes
0.01180370870782059 = ASDQ=Flap or move their hands in an unusual way? (for 

In [10]:
# Train SVM with top-5 features.
# sorted_features_and_importances
# X = processed_dataset[['q33', 'q11', 'q21', 'q9', 'q4']].values
# y = processed_dataset['ASD'].values

# scoring = {'accuracy' : make_scorer(accuracy_score), 
#            'precision' : make_scorer(precision_score),
#            'recall' : make_scorer(recall_score), 
#            'f1_score' : make_scorer(f1_score)}
      
# clf = SVC()
# cv_results = cross_validate(clf, X, y, cv=3, scoring=scoring)

# print('Accuracy: ', stats.describe(cv_results['test_accuracy']))
# print('Precision: ', stats.describe(cv_results['test_precision']))
# print('Recall: ', stats.describe(cv_results['test_recall']))
# print('F1 Score: ', stats.describe(cv_results['test_f1_score']))

# Recursive Feature Elimination-based Feature Selection and Predictive Model

In [11]:
# RFE-based feature selection.

estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, y)
selector.support_, selector.ranking_

# Get selected features.
selected_features = []
for i in range(len(selector.support_)):
  if selector.support_[i] == True:
    selected_features.append(X_column_names[i])

# Print out most important features.
for feat in selected_features:
  if feat in question_mapping:
    print(question_mapping[feat])
  else:
    print(feat)

selected_features

ASDQ=Communicate clearly so that other people know how they feel?
SCQ=Has childName ever got her/his pronouns mixed up (e.g., saying you or she/he for I)?
SCQ=Has childName ever had any objects (other than a soft toy or comfort blanket) that she/he had to carry around?
SCQ=When she/he was 4 to 5, did childName ever spontaneously point at things around her/him just to show you things (not because she/he wanted them)?
SCQ=When she/he was 4 to 5, did childName ever offer to share things other than food with you?


['f7', 'f45', 'f58', 'f62', 'f69']

In [12]:
# Train SVM with top-5 features.
# sorted_features_and_importances
# X = processed_dataset[['q9', 'q11', 'q19', 'q22', 'q33']].values
# y = processed_dataset['ASD'].values

# scoring = {'accuracy' : make_scorer(accuracy_score), 
#            'precision' : make_scorer(precision_score),
#            'recall' : make_scorer(recall_score), 
#            'f1_score' : make_scorer(f1_score)}
      
# clf = SVC()
# cv_results = cross_validate(clf, X, y, cv=3, scoring=scoring)

# print('Accuracy: ', stats.describe(cv_results['test_accuracy']))
# print('Precision: ', stats.describe(cv_results['test_precision']))
# print('Recall: ', stats.describe(cv_results['test_recall']))
# print('F1 Score: ', stats.describe(cv_results['test_f1_score']))