# Autism Diagnosis Prediction Model

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

In [16]:
# Show every column when a DataFrame is displayed (None removes the column limit),
# so wide frames are not elided with "...".
pd.set_option("display.max_columns", None)

In [17]:
# Load the dataset from a relative path and preview the first 10 rows.
data = pd.read_csv("../dataset/autism.csv")
data.head(10)

Unnamed: 0,ID,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,0,1,0,1,0,1,0,1,1,38.172746,f,?,no,no,Austria,no,6.351166,18 and more,Self,0
1,2,0,0,0,0,0,0,0,0,0,0,47.750517,m,?,no,no,India,no,2.255185,18 and more,Self,0
2,3,1,1,1,1,1,1,1,1,1,1,7.380373,m,White-European,no,yes,United States,no,14.851484,18 and more,Self,1
3,4,0,0,0,0,0,0,0,0,0,0,23.561927,f,?,no,no,United States,no,2.276617,18 and more,Self,0
4,5,0,0,0,0,0,0,0,0,0,0,43.20579,m,?,no,no,South Africa,no,-4.777286,18 and more,Self,0
5,6,1,0,0,0,0,1,0,0,1,1,31.527964,m,Middle Eastern,no,no,Jordan,no,9.562117,18 and more,Self,0
6,7,1,0,0,0,0,0,1,1,1,0,28.427971,f,Pasifika,no,no,United Kingdom,no,7.984569,18 and more,Self,0
7,8,1,1,1,1,1,1,1,0,1,1,26.484494,m,Black,no,yes,United States,no,13.237898,18 and more,Self,1
8,9,1,1,1,1,0,0,0,1,1,1,48.203459,m,White-European,no,no,Brazil,no,-1.755774,18 and more,Self,0
9,10,0,0,0,0,0,0,0,1,0,1,24.167945,f,Others,yes,no,New Zealand,no,14.92257,18 and more,Self,0


In [18]:
data.shape

(800, 22)

In [19]:
data.columns.tolist()

['ID',
 'A1_Score',
 'A2_Score',
 'A3_Score',
 'A4_Score',
 'A5_Score',
 'A6_Score',
 'A7_Score',
 'A8_Score',
 'A9_Score',
 'A10_Score',
 'age',
 'gender',
 'ethnicity',
 'jaundice',
 'austim',
 'contry_of_res',
 'used_app_before',
 'result',
 'age_desc',
 'relation',
 'Class/ASD']

In [20]:
data.isnull().sum()

ID                 0
A1_Score           0
A2_Score           0
A3_Score           0
A4_Score           0
A5_Score           0
A6_Score           0
A7_Score           0
A8_Score           0
A9_Score           0
A10_Score          0
age                0
gender             0
ethnicity          0
jaundice           0
austim             0
contry_of_res      0
used_app_before    0
result             0
age_desc           0
relation           0
Class/ASD          0
dtype: int64

In [21]:
data.duplicated().sum()

np.int64(0)

In [22]:
data.sample(5)

Unnamed: 0,ID,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
680,681,0,0,1,0,0,0,0,1,1,1,13.242807,m,?,no,no,Afghanistan,no,8.695166,18 and more,Self,0
628,629,0,0,0,0,0,0,0,1,0,0,11.346899,f,Asian,no,no,New Zealand,no,8.285317,18 and more,Self,0
147,148,0,1,0,0,0,0,0,0,1,0,19.453729,m,?,no,no,Serbia,no,-1.388358,18 and more,Self,0
228,229,1,1,1,0,1,0,1,1,1,1,17.512796,m,White-European,no,no,China,no,9.339451,18 and more,Self,0
495,496,1,0,1,1,0,0,0,1,1,1,22.468683,m,?,yes,no,India,no,4.130121,18 and more,Self,0


In [23]:
# Count rows per Class/ASD label to check how balanced the classes are
# (presumably the prediction target — confirm against the modelling cells).
data["Class/ASD"].value_counts()

Class/ASD
0    639
1    161
Name: count, dtype: int64

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               800 non-null    int64  
 1   A1_Score         800 non-null    int64  
 2   A2_Score         800 non-null    int64  
 3   A3_Score         800 non-null    int64  
 4   A4_Score         800 non-null    int64  
 5   A5_Score         800 non-null    int64  
 6   A6_Score         800 non-null    int64  
 7   A7_Score         800 non-null    int64  
 8   A8_Score         800 non-null    int64  
 9   A9_Score         800 non-null    int64  
 10  A10_Score        800 non-null    int64  
 11  age              800 non-null    float64
 12  gender           800 non-null    object 
 13  ethnicity        800 non-null    object 
 14  jaundice         800 non-null    object 
 15  austim           800 non-null    object 
 16  contry_of_res    800 non-null    object 
 17  used_app_before 

In [25]:
data.describe()

Unnamed: 0,ID,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,result,Class/ASD
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,400.5,0.56,0.53,0.45,0.415,0.395,0.30375,0.3975,0.50875,0.495,0.6175,28.452118,8.537303,0.20125
std,231.0844,0.496697,0.499411,0.497805,0.49303,0.489157,0.460164,0.489687,0.500236,0.500288,0.486302,16.310966,4.807676,0.401185
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.71855,-6.137748,0.0
25%,200.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.198153,5.306575,0.0
50%,400.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,24.84835,9.605299,0.0
75%,600.25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,35.865429,12.514484,0.0
max,800.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,89.461718,15.853126,1.0


In [26]:
# Convert age from float to int.
# NOTE: astype(int) truncates the fractional part (e.g. 38.17 -> 38); it does not round.
data["age"] = data["age"].astype(int)
# Spot-check a few random rows; no random_state is set, so the rows shown differ between runs.
data.sample(5)

Unnamed: 0,ID,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
80,81,0,1,0,0,0,0,0,0,0,1,49,f,Middle Eastern,no,no,India,no,1.832433,18 and more,Self,0
474,475,1,0,0,0,0,0,1,0,1,0,54,m,?,no,no,Afghanistan,no,4.589913,18 and more,Self,0
409,410,0,0,0,0,0,0,0,1,0,0,38,f,South Asian,no,no,United Arab Emirates,no,1.464522,18 and more,?,0
378,379,1,0,0,0,1,0,1,1,0,1,21,m,White-European,no,no,Spain,no,5.309044,18 and more,?,0
259,260,1,0,0,0,0,0,0,0,0,1,33,f,Middle Eastern,yes,no,India,no,7.352908,18 and more,Self,0


In [27]:
# Print the distinct values of every column except the ones listed below, to
# spot inconsistent labels (stray whitespace, mixed casing, "?" placeholders).
# The exclusion list does not change between iterations, so it is built once
# before the loop instead of being re-created for every column.
numerical_features = ["ID", "age", "result"]
for column in data.columns:
    if column not in numerical_features:
        print(column, data[column].unique())
        print("-"*50)

A1_Score [1 0]
--------------------------------------------------
A2_Score [0 1]
--------------------------------------------------
A3_Score [1 0]
--------------------------------------------------
A4_Score [0 1]
--------------------------------------------------
A5_Score [1 0]
--------------------------------------------------
A6_Score [0 1]
--------------------------------------------------
A7_Score [1 0]
--------------------------------------------------
A8_Score [0 1]
--------------------------------------------------
A9_Score [1 0]
--------------------------------------------------
A10_Score [1 0]
--------------------------------------------------
gender ['f' 'm']
--------------------------------------------------
ethnicity ['?' 'White-European' 'Middle Eastern ' 'Pasifika' 'Black' 'Others'
 'Hispanic' 'Asian' 'Turkish' 'South Asian' 'Latino' 'others']
--------------------------------------------------
jaundice ['no' 'yes']
--------------------------------------------------
austim

In [28]:
data.sample(5)

Unnamed: 0,ID,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
197,198,0,0,0,0,0,0,0,1,0,0,26,f,?,no,no,Jordan,no,6.00926,18 and more,Parent,0
316,317,0,0,0,0,0,0,0,0,0,0,28,m,?,no,no,France,no,5.021849,18 and more,Self,0
144,145,0,0,0,0,0,0,0,0,0,0,21,m,Middle Eastern,no,no,Australia,no,8.17246,18 and more,Self,0
96,97,0,0,0,0,0,0,0,0,0,0,26,m,?,no,no,Ireland,no,10.421883,18 and more,Self,0
244,245,1,0,0,0,0,0,1,0,0,0,48,m,Middle Eastern,no,no,Spain,no,3.778064,18 and more,Relative,0


In [29]:
# Drop the ID column (a sequential row identifier) and age_desc (shows the same
# value in every previewed row — confirm it is constant across the dataset).
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
data = data.drop(columns=["ID", "age_desc"], errors="ignore")
data.shape

(800, 20)

In [30]:
data.sample(5)

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,contry_of_res,used_app_before,result,relation,Class/ASD
525,1,0,0,1,0,0,1,0,1,1,23,m,Middle Eastern,no,no,Australia,no,-0.148231,Self,0
128,1,0,0,0,0,0,0,0,0,0,8,m,Asian,no,no,New Zealand,no,11.514359,Self,0
293,1,1,0,0,0,0,0,1,0,1,10,f,Latino,no,no,Japan,no,11.508588,Self,0
343,1,1,0,0,0,0,0,0,1,1,28,f,Middle Eastern,no,yes,India,no,0.977525,Self,0
252,0,1,1,0,0,0,0,0,0,1,56,m,?,no,no,Afghanistan,no,9.631875,Self,0
