#### Dataset - Autism-Child-Data.arff

In [None]:
# Initial Dataset Examination
#import packages
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
from scipy.io import arff
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [None]:
#Import arff file
# NOTE(review): this is an absolute, machine-specific Windows path - adjust it before running elsewhere.
# Only element [0] of the loadarff result (the records) is used below.
data = arff.loadarff(r"C:\Users\Helen\Desktop\MoA\COMP809\Autism-Child-Data.arff")

#Convert to DataFrame
autism_df= pd.DataFrame(data[0])
# Preview the raw frame (nominal values are still byte strings at this point)
autism_df.head()

In [None]:
#Change character encodings (to get rid of b(s))
def apply_decode(df_name):
    """Decode byte-string cells of a DataFrame to ``str``, in place.

    Every column whose dtype is not ``float64`` is scanned and each ``bytes``
    value is decoded as UTF-8. Values that are not ``bytes`` (integers,
    already-decoded strings, missing values) are left untouched, so the
    function is safe to call on mixed frames and safe to re-run.

    Side effect: sets the pandas display option ``display.max_columns`` to 50.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame to decode; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col in df_name.columns:
        if df_name[col].dtype != 'float64':
            # Guard on the value type: calling .decode on a non-bytes value
            # (int, str, None/NaN) would raise AttributeError.
            df_name[col] = df_name[col].apply(
                lambda val: val.decode('utf-8') if isinstance(val, bytes) else val
            )
    pd.set_option('display.max_columns',50)
    return df_name

# apply_decode returns the frame it was given, so `autism` and `autism_df` refer to the same object here.
autism = apply_decode(autism_df)

In [None]:
# Preview the first rows after decoding
autism.head()

In [None]:
# Column dtypes and non-null counts
autism.info()

In [None]:
# (rows, columns) of the dataset
autism.shape

In [None]:
# Column names as loaded (checked for typos in the next cell)
autism.columns

In [None]:
# Fix column name typos #
# All three misspelled headers are corrected in a single rename:
#   'austim'        -> 'autism'
#   'jundice'       -> 'jaundice'
#   'contry_of_res' -> 'country_of_res'
autism = autism.rename(
    columns={
        'austim': 'autism',
        'jundice': 'jaundice',
        'contry_of_res': 'country_of_res',
    }
)

In [None]:
# Drop Columns #
# Columns removed before modelling, and why:
#   'relation'        - not relevant for prediction
#   'used_app_before' - not relevant for prediction
#   'age_desc'        - only contains one value describing the age range of child subjects
#   'result'          - it is an aggregate column of the A1-A10 scores
autism.drop(
    columns=['relation', 'used_app_before', 'age_desc', 'result'],
    inplace=True,
)

#Check columns after dropping
autism.columns

In [None]:
#Bar plot visualization of feature variables' relationship with target variable
fig, axarr = plt.subplots(2, 3, figsize=(17, 10), dpi=300, facecolor='w', edgecolor='k')
sns.set(style="white")

# (feature column, panel title) pairs, listed in row-major order of the 2x3 grid
panels = [
    ('age', 'Distribution of Age'),
    ('gender', 'Distribution of Gender'),
    ('ethnicity', 'Distribution of ethnicity'),
    ('jaundice', 'Distribution of jaundice'),
    ('autism', 'Distribution of autism'),
    ('country_of_res', 'Distribution of country_of_res'),
]

# One count plot per feature, split by the target class
for ax, (feature, title) in zip(axarr.ravel(), panels):
    sns.countplot(x=feature, hue='Class/ASD', data=autism, ax=ax, palette="coolwarm")
    ax.set_title(title)

fig.suptitle('Distribution of Feature Variables vs Target Variable', fontsize=16);
plt.show()

In [None]:
#Explore whether there are null data
# Per-column count of missing (NaN/None) values
autism.isnull().sum()

In [None]:
#Explore 'age' entries with null
# Boolean-mask selection: shows only the rows whose 'age' is missing
autism[autism['age'].isnull()]

In [None]:
#Replace the 4 records in 'Age' with null values with '0'
# NOTE(review): 0 becomes a real age value for these rows in everything downstream
# (it is kept by the later int cast) - confirm this is preferable to median imputation or dropping the rows.
autism['age']=autism['age'].fillna(value=0)
autism[autism['age'] == 0] #Check

In [None]:
#Check again for null values to see if we have indeed succeeded.
# Every column should now report 0 missing values
autism.isnull().sum()

In [None]:
#Convert 'Age' column's datatype from FLOAT to INT.
# The nulls were filled in an earlier cell; an integer cast cannot represent NaN.
autism['age']=autism['age'].astype('int')
#Check it has been converted
autism['age'].dtype

In [None]:
#Convert all 'A()_Score' columns types to INT.
def scores(df_name, cols_lst):
    """Cast the listed columns of a DataFrame to integer dtype, in place.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame whose columns are converted; it is modified in place.
    cols_lst : iterable of str
        Names of the columns to cast with ``astype('int')``.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.
    """
    for score_col in cols_lst:
        as_int = df_name[score_col].astype('int')
        df_name[score_col] = as_int
    return df_name

# Build the ten screening-question column names ('A1_Score' ... 'A10_Score') and cast them in place
scores(autism, ['A{}_Score'.format(question) for question in range(1, 11)])
autism['A10_Score'].dtype #check

In [None]:
# Spot-check another score column's dtype after the conversion
autism['A1_Score'].dtype

In [None]:
#Convert 'gender' column - Change 'm'(male) to '1', 'f'(female) to '0'
# print() is needed: a bare expression that is not the last line of a cell is not displayed.
print(autism['gender'].value_counts()) #first check 'gender' values
# NOTE: Series.map turns every value that is not a key of the dict into NaN.
autism['gender'] = autism['gender'].map({'m':1,'f':0})
# dropna=False so that any unmapped value (now NaN) shows up in the check
autism['gender'].value_counts(dropna=False) #Check 'gender' values after replacement

In [None]:
#Convert 'jaundice' column - Change 'yes' to '1', 'no' to '0'
# print() is needed: a bare expression that is not the last line of a cell is not displayed.
print(autism['jaundice'].value_counts()) #first check 'jaundice' values
# NOTE: Series.map turns every value that is not a key of the dict into NaN.
autism['jaundice'] = autism['jaundice'].map({'yes':1,'no':0})
# dropna=False so that any unmapped value (now NaN) shows up in the check
autism['jaundice'].value_counts(dropna=False) #Check 'jaundice' values after replacement

In [None]:
#Convert 'Class/ASD' column - Change 'YES' to '1', 'NO' to '0'
# print() is needed: a bare expression that is not the last line of a cell is not displayed.
print(autism['Class/ASD'].value_counts()) #first check 'Class/ASD' values
# NOTE: Series.map turns every value that is not a key of the dict into NaN.
autism['Class/ASD'] = autism['Class/ASD'].map({'YES':1,'NO':0})
# dropna=False so that any unmapped value (now NaN) shows up in the check
autism['Class/ASD'].value_counts(dropna=False) #Check 'Class/ASD' values after replacement

In [None]:
#Convert 'autism' column - Change 'yes' to '1', 'no' to '0'
# print() is needed: a bare expression that is not the last line of a cell is not displayed.
print(autism['autism'].value_counts()) #first check 'autism' values
# NOTE: Series.map turns every value that is not a key of the dict into NaN.
autism['autism'] = autism['autism'].map({'yes':1,'no':0})
# dropna=False so that any unmapped value (now NaN) shows up in the check
autism['autism'].value_counts(dropna=False) #Check 'autism' values after replacement

In [None]:
#Check data types to make sure all necessary conversions have been done.
# verbose=True requests the full per-column summary
autism.info(verbose=True)

In [None]:
#Find Nulls recorded as '?'
# print() is needed: a bare expression that is not the last line of a cell is not displayed.
print(autism.shape)
# Per-column count of cells that hold the literal '?' placeholder
autism.isin(['?']).sum()

In [None]:
# 43 records in 'ethnicity' identified - not suitable to drop, because that is 43/292 entries and
# dropping them would contribute to a large data loss. Instead replace '?' with 'Unknown' as its own
# label, to be encoded later.
# Assign the result back instead of calling replace(..., inplace=True) on the column selection:
# the chained in-place form is deprecated and does not update the frame under pandas Copy-on-Write.
autism['ethnicity'] = autism['ethnicity'].replace({"?": "Unknown"})

In [None]:
# Encoding categorical data to numeric with LabelEncoder
# A separate fitted encoder is kept for each column (in `encoders`) so the integer codes can
# later be translated back with encoders[col].inverse_transform(...). Re-fitting one shared
# encoder would keep only the mapping of the last column.
CatCols = ['ethnicity', 'country_of_res']
encoders = {}
for i in CatCols:
    le = preprocessing.LabelEncoder()
    autism[i] = le.fit_transform(autism[i].values)
    encoders[i] = le

In [None]:
# Preview the fully numeric frame after encoding
autism.head()

End of Data pre-processing

#### Top 5 feature selection

In [None]:
#split dataset
# Target variable: the 'Class/ASD' label column
y = autism['Class/ASD']
# Feature variables: every remaining column
X = autism.drop(columns=['Class/ASD'])

In [None]:
#Use SelectKBest to identify top 5 best features
# chi2 scores each feature against the target; k=5 keeps the five highest-scoring ones
KBest = SelectKBest(score_func=chi2, k=5)
fit = KBest.fit(X,y)

#Join results for visualization
# Built directly as a two-column frame. The intermediate frames are no longer bound to the
# names `scores`/`columns`, because `scores` would overwrite the scores() helper defined
# earlier in this notebook and break a re-run of that cell.
featureScores = pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})
print(featureScores.nlargest(5,'Score'))

In [None]:
#Use Correlation Matrix & Heatmap to verify top 5 best features
autism_corr= autism.corr()
plt.figure(figsize = (20,15))
# Diverging palette for the signed correlations
cmap = sns.diverging_palette(240, 10, n=9,as_cmap=True)
# Pass the palette to the heatmap; previously it was created but never used
sns.heatmap(autism_corr,annot=True,cmap=cmap)
plt.title('ASD Dataset Correlation Matrix Heatmap')
plt.show()

In [None]:
#See feature variable correlations with 'Class/ASD' in descending order
# Correlation of every feature with the target (the target's own row is dropped)
autism_corr_y =pd.DataFrame(autism_corr['Class/ASD'].drop('Class/ASD'))
# Take absolute values BEFORE sorting, so the frame really is ordered by correlation strength
# (sorting the signed values first leaves strong negative correlations at the bottom).
autism_corr_final=autism_corr_y.abs().sort_values(by='Class/ASD', ascending = False)
print(autism_corr_final.nlargest(5,'Class/ASD'))

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

#split dataset to perform RFE with logistic regression selection on A_scores only
# Select the ten A*_Score columns explicitly. The previous drop-list left 'jaundice' in the
# feature set, so the selection was not restricted to the A scores as intended.
a_score_cols = ['A{}_Score'.format(question) for question in range(1, 11)]
X_rfe= autism[a_score_cols] #feature variables
y_rfe= autism['Class/ASD']    #target variable

# Recursive feature elimination around a logistic-regression model, keeping 5 features
model = LogisticRegression()
rfe = RFE(model,n_features_to_select=5)
rfe_fit = rfe.fit(X_rfe,y_rfe)
print("Num Features: %s" % (rfe_fit.n_features_))
print("Selected Features: %s" % (rfe_fit.support_))
print("Feature Ranking: %s" % (rfe_fit.ranking_))

In [None]:
#Join results for visualization
# One row per candidate feature: whether RFE kept it (support_) and its elimination rank
# (ranking_). The boolean mask is no longer labelled 'Score', and nothing is bound to the
# name `scores`, which would overwrite the scores() helper defined earlier in this notebook.
featureScores = pd.DataFrame({
    'Feature': X_rfe.columns,
    'Selected': rfe_fit.support_,
    'Rank': rfe_fit.ranking_,
})
# Show only the features RFE selected
print(featureScores[featureScores['Selected']])

**Run the Naïve Bayes algorithm with the GaussianNB implementation for the selected features**

In [None]:
#Create Training and Testing data
#Set Features (based on best 5 correlations from previous step)
X= autism[['A4_Score','A9_Score','A8_Score','A1_Score','A10_Score']]

#Set Target
y= autism['Class/ASD']

#Prepare Training and Testing Data (20% test data)
# random_state fixes the shuffle so the split is reproducible
X_train,X_test,y_train,y_test= train_test_split(X,y, test_size=0.2, random_state=42)

#No need to apply scaler since data are all 1 and 0

#Display Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
# Label corrected: this line reports the TEST labels (it previously said 'training label')
print('Shape of testing label:', y_test.shape)

In [None]:
# Preview the feature matrix currently bound to X
X.head()

In [None]:
#Build GaussianNB Model
# Flatten the training labels to a 1-D array, fit the classifier, then predict the test split
train_labels = np.ravel(y_train, order='C')
gnb = GaussianNB()
gnb.fit(X_train, train_labels)
predictions = gnb.predict(X_test)

In [None]:
#Compute 10-fold Cross-Validation Score
# Cross-validate on the TRAINING split. The held-out test split is reserved for the final
# evaluation, and at 20% of the data it is too small for stable 10-fold estimates.
# cross_val_score fits clones of the estimator, so the fitted `gnb` is left untouched.
cv = cross_val_score(gnb,X_train,y_train, cv=10)
print("Average 10-Fold CV Score - GaussianNB:{}".format(np.mean(cv)))

In [None]:
#Compute Accuracy Score
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted class equals the true class
gnb_accuracy = accuracy_score(y_test, predictions)
print("Accuracy score of Gaussian Naive Bayes Model:", gnb_accuracy)

In [None]:
# Confusion matrix on the test split; labels=[0, 1] fixes the row/column order to class 0, then class 1.
# In scikit-learn's convention rows are the true classes and columns the predicted classes.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
print(cm)

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# ConfusionMatrixDisplay draws the same figure and exists in every version that shipped
# plot_confusion_matrix.
from sklearn.metrics import ConfusionMatrixDisplay

# normalize='all' expresses each cell as a fraction of all test samples
cm_normalized = confusion_matrix(
    y_test, gnb.predict(X_test), labels=gnb.classes_, normalize='all'
)
ConfusionMatrixDisplay(
    confusion_matrix=cm_normalized, display_labels=gnb.classes_
).plot()
plt.show()

In [None]:
#Classification Report
# Text report of per-class metrics on the test split; target_names labels the two classes '0' and '1'
report = classification_report(y_test, predictions, target_names=['0', '1'])
print(report)

In [None]:
#Run Decision Tree Classifier
#Use original data before feature selection
# NOTE: this head() call is not the last expression of the cell, so its output is not displayed.
autism.head()

#Set Features
Xtree= autism.drop(['Class/ASD'],axis=1) 
#Set Target
ytree= autism['Class/ASD']

#Prepare Training and Testing Data (20% test data)
# NOTE: this rebinds X_train/X_test/y_train/y_test, replacing the Naive Bayes split above;
# re-running the Naive Bayes evaluation cells after this cell would use the all-feature split.
X_train,X_test,y_train,y_test= train_test_split(Xtree,ytree, test_size=0.2, random_state=42)


In [None]:
#Create and fit Decision Tree Classifier to data before feature selection
# random_state makes the fitted tree, and the feature importances plotted from it,
# reproducible across runs; 42 matches the seed used for the train/test split.
tree= DecisionTreeClassifier(random_state=42)
tree = tree.fit(X_train, y_train)
predictions= tree.predict(X_test)

In [None]:
#Plot Decision Tree Feature Importance
# Pair each importance with its feature name, then order from most to least important
Feature_Importance = pd.Series(
    tree.feature_importances_, index=Xtree.columns
).sort_values(ascending=False)

# Bar chart of the ranked importances
Feature_Importance.plot.bar()
plt.title("Decision Tree Feature Importance by Rank")
plt.show()