# CS 133 PROJECT: MACHINE LEARNING MODEL
--------
### Team: Jonathan Manzano & Henry Pham  
Dataset: Autism in children  

[link to Proj Description](https://sjsu.instructure.com/courses/1580083/files/76444276?module_item_id=15357781)

Notes: 
1. Problem type: 
    - Classification

2. Testing Models:
    - Random Forest Classifier
    - Logistic Regression
    - Stochastic Gradient Descent
    - Support Vector Classifier

3. Conclusions:
    - Confusion Matrix
    - Best Testing Metric


In [3]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#import sklean
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


In [8]:
#read the data
def get_data():
    data="https://raw.githubusercontent.com/csbfx/cs133/main/autism_child.csv"
    df = pd.read_csv(data, sep=',', na_values = '?')
    return df

df = get_data()
df

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,country_of_res,used_app_before,total_score,age_desc,relation,ASD
0,1,1,0,0,1,1,0,1,0,0,...,m,Others,no,no,Jordan,no,5,4-11 years,Parent,NO
1,1,1,0,0,1,1,0,1,0,0,...,m,Middle Eastern,no,no,Jordan,no,5,4-11 years,Parent,NO
2,1,1,0,0,0,1,1,1,0,0,...,m,,no,no,Jordan,yes,5,4-11 years,,NO
3,0,1,0,0,1,1,0,0,0,1,...,f,,yes,no,Jordan,no,4,4-11 years,,NO
4,1,1,1,1,1,1,1,1,1,1,...,m,Others,yes,no,United States,no,10,4-11 years,Parent,YES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,1,1,1,1,1,1,1,1,1,1,...,f,White-European,yes,yes,United Kingdom,no,10,4-11 years,Parent,YES
288,1,0,0,0,1,0,1,0,0,1,...,f,White-European,yes,yes,Australia,no,4,4-11 years,Parent,NO
289,1,0,1,1,1,1,1,0,0,1,...,m,Latino,no,no,Brazil,no,7,4-11 years,Parent,YES
290,1,1,1,0,1,1,1,1,1,1,...,m,South Asian,no,no,India,no,9,4-11 years,Parent,YES


In [9]:
# data preprocessing using StandardScaler only

# Handling Missing Values
df['age'].fillna(df['age'].median(), inplace=True)
df['ethnicity'].fillna('Unknown', inplace=True)
df['relation'].fillna('Unknown', inplace=True)

# Encoding Categorical Variables
df_encoded = pd.get_dummies(df, columns=['gender', 'jundice', 'austim', 'country_of_res', 'used_app_before', 'age_desc', 'ASD'])

# Scaling Numerical Variables
scaler = StandardScaler()
numerical_cols = ['age', 'total_score']
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

df_encoded;


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ethnicity'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [10]:
# get the numerical columns & categorical columns
num_attribs = [cols for cols in df.columns if df[cols].dtype != "object"]
cat_attribs = [cols for cols in df.columns if df[cols].dtype == "object"]

#fill any missing values in numerical columns with median value
imputer = SimpleImputer(strategy="median")
num_prepped = df[num_attribs]
imputer.fit(num_prepped)

# make object/string/categorical columns machine learning model readable
cat_prepped = df[cat_attribs]
cat_encoder = OneHotEncoder()
hot_cat_prepped = cat_encoder.fit_transform(cat_prepped)

#i think this dataset would benefit from having combined attributes, my brain is too much rn so will work on it later
# make a combined attributes later

num_pipline = Pipeline(
    [
    ('imputer', SimpleImputer(strategy = "medium")),
    ('attribs_adder', CombinedAttributesAdder()),
    ("std_scaler", StandardSCaler()),
    ]
)





In [None]:
# # get the numerical columns & cat columns
# num_attribs = [cols for cols in df.columns if df[cols].dtype != "object"]
# cat_attribs = [cols for cols in df.columns if df[cols].dtype == "object"]

# imputer = SimpleImputer(strategy="median")
# num_df = df[num_attribs]
# imputer.fit(num_df)

# cat_df = df[cat_attribs]
# cat_encoder = OneHotEncoder()
# hot_cat_df = cat_encoder.fit_transform(cat_df)

# # create pipeline for numerical data
# num_pipeline = Pipeline(
#     [
#         ("imputer", SimpleImputer(strategy="median")),
#         # ('attribs_adder', CombinedAttributesAdder()),  # lmao i have no idea whats going on in that part of the lecture
#         ("std_scaler", StandardScaler()),
#     ]
# )

# # fit and transform the numerical data
# tr_num_df = num_pipeline.fit_transform(num_df) # <-- num_df from imputer cell

# print(num_attribs) # numerical columns
# print(cat_attribs) # categorical columns

# # create a full pipeline
# full_pipeline = ColumnTransformer([
#         ("nums", num_pipeline, num_attribs),
#         ("cat", OneHotEncoder(), cat_attribs)
#     ])

# # fit and transform the data
# df_prepared = full_pipeline.fit_transform(df)

# df_prepared.shape