# MACHINE LEARNING PROJECT

**Team**: Jonathan Manzano & Henry Pham

**Dataset**: Autism in children

In [50]:
import pandas as pd
import os
import plotly.express as px
import plotly.graph_objects as go

In [51]:
# Alternative: load the dataset directly from GitHub instead of the local file
# def get_data():
#     data="https://raw.githubusercontent.com/csbfx/cs133/main/autism_child.csv"
#     df = pd.read_csv(data, sep=',', na_values = '?')
#     return df
#
# autism_df = get_data()

In [52]:
# Build the path to the CSV starting from the current working directory
autism_relative_path = "./data/autism_child.csv"
autism_full_path = os.path.join(os.getcwd(), autism_relative_path)

# Read the CSV; na_values="?" makes pandas treat "?" entries as missing (NaN)
read_options = {"sep": ",", "na_values": "?"}
autism_df = pd.read_csv(autism_full_path, **read_options)
autism_df

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,country_of_res,used_app_before,total_score,age_desc,relation,ASD
0,1,1,0,0,1,1,0,1,0,0,...,m,Others,no,no,Jordan,no,5,4-11 years,Parent,NO
1,1,1,0,0,1,1,0,1,0,0,...,m,Middle Eastern,no,no,Jordan,no,5,4-11 years,Parent,NO
2,1,1,0,0,0,1,1,1,0,0,...,m,,no,no,Jordan,yes,5,4-11 years,,NO
3,0,1,0,0,1,1,0,0,0,1,...,f,,yes,no,Jordan,no,4,4-11 years,,NO
4,1,1,1,1,1,1,1,1,1,1,...,m,Others,yes,no,United States,no,10,4-11 years,Parent,YES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,1,1,1,1,1,1,1,1,1,1,...,f,White-European,yes,yes,United Kingdom,no,10,4-11 years,Parent,YES
288,1,0,0,0,1,0,1,0,0,1,...,f,White-European,yes,yes,Australia,no,4,4-11 years,Parent,NO
289,1,0,1,1,1,1,1,0,0,1,...,m,Latino,no,no,Brazil,no,7,4-11 years,Parent,YES
290,1,1,1,0,1,1,1,1,1,1,...,m,South Asian,no,no,India,no,9,4-11 years,Parent,YES


In [63]:
# Inspect the dtype pandas assigned to each column
# (int64 for numeric columns, object for text columns)
autism_df.dtypes

A1_Score            int64
A2_Score            int64
A3_Score            int64
A4_Score            int64
A5_Score            int64
A6_Score            int64
A7_Score            int64
A8_Score            int64
A9_Score            int64
A10_Score           int64
age                 int64
gender             object
ethnicity          object
jundice            object
austim             object
country_of_res     object
used_app_before    object
total_score         int64
age_desc           object
relation           object
ASD                object
dtype: object

In [64]:
# Count the missing (NaN) entries in every column
autism_df.isnull().sum()

A1_Score            0
A2_Score            0
A3_Score            0
A4_Score            0
A5_Score            0
A6_Score            0
A7_Score            0
A8_Score            0
A9_Score            0
A10_Score           0
age                 0
gender              0
ethnicity          43
jundice             0
austim              0
country_of_res      0
used_app_before     0
total_score         0
age_desc            0
relation           43
ASD                 0
dtype: int64

In [65]:
# Mean of the known ages; the int64 cast drops the fractional part
known_ages = autism_df["age"].dropna()
age_autism_mean = known_ages.mean().astype("int64")

# Fill missing ages with that mean, then store the column as integers
age_filled = autism_df["age"].fillna(age_autism_mean)
autism_df["age"] = age_filled.astype("int64")
autism_df["age"].dtype

dtype('int64')

In [56]:
# Count the missing (NaN) entries remaining in the age column
autism_df["age"].isnull().sum()

0

In [57]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes
autism_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   A1_Score         292 non-null    int64 
 1   A2_Score         292 non-null    int64 
 2   A3_Score         292 non-null    int64 
 3   A4_Score         292 non-null    int64 
 4   A5_Score         292 non-null    int64 
 5   A6_Score         292 non-null    int64 
 6   A7_Score         292 non-null    int64 
 7   A8_Score         292 non-null    int64 
 8   A9_Score         292 non-null    int64 
 9   A10_Score        292 non-null    int64 
 10  age              292 non-null    int64 
 11  gender           292 non-null    object
 12  ethnicity        249 non-null    object
 13  jundice          292 non-null    object
 14  austim           292 non-null    object
 15  country_of_res   292 non-null    object
 16  used_app_before  292 non-null    object
 17  total_score      292 non-null    in

In [58]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
autism_df.describe()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,total_score
count,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0
mean,0.633562,0.534247,0.743151,0.55137,0.743151,0.712329,0.606164,0.496575,0.493151,0.726027,6.349315,6.239726
std,0.482658,0.499682,0.437646,0.498208,0.437646,0.453454,0.489438,0.500847,0.500811,0.446761,2.349504,2.284882
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,6.0,6.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,8.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.0,10.0


In [59]:
# (number of rows, number of columns)
autism_df.shape

(292, 21)

In [24]:
# Inspect the value distribution of the relation column; dropna=False keeps NaN as its own row
autism_df["relation"].value_counts(dropna=False)

# NOTE: "Self" appears under two spellings ("Self" and "self"), so that
# category is split across two rows of the counts.
# TODO: this feature seems to carry little information; either drop it or
# replace the NaN values with "Parent".

relation
Parent                      214
NaN                          43
Relative                     17
Health care professional     13
Self                          4
self                          1
Name: count, dtype: int64

In [7]:
# Inspect the value distribution of the ethnicity column; dropna=False keeps NaN as its own row
autism_df["ethnicity"].value_counts(dropna=False)
# TODO: decide how to handle the NaN values in this column (not yet resolved)

ethnicity
White-European     108
Asian               46
NaN                 43
Middle Eastern      27
South Asian         21
Others              14
Black               14
Latino               8
Hispanic             7
Pasifika             2
Turkish              2
Name: count, dtype: int64

## What is the distribution of ASD cases across different countries?

In [42]:
# Normalise the ASD labels to upper case so the "YES" comparison below is reliable
autism_df["ASD"] = autism_df["ASD"].str.upper()

# Number of ASD-positive rows per country of residence
is_asd_positive = autism_df["ASD"] == "YES"
positive_by_country = autism_df[is_asd_positive].groupby("country_of_res")
as_cases = positive_by_country.size().reset_index(name="count")

# World map shaded by the per-country count; countries are matched by name
choropleth_options = {
    "locations": "country_of_res",
    "locationmode": "country names",
    "color": "count",
    "color_continuous_scale": "Viridis",
    "labels": {"count": "Number of ASD Cases"},
    "title": "Distribution of ASD Cases Across Different Countries",
}
fig = px.choropleth(as_cases, **choropleth_options)

fig.show()

## How does the total ASD screening score vary by gender?

In [40]:
# Box plot of the total screening score, one box per gender.
# points="all" draws every individual observation next to its box,
# not only the outliers.
box_labels = {"total_score": "Total ASD Screening Score", "gender": "Gender"}
fig = px.box(
    autism_df,
    x="gender",
    y="total_score",
    color="gender",  # one colour per gender so the boxes are easy to tell apart
    points="all",
    labels=box_labels,
    title="Distribution of Total ASD Screening Scores by Gender",
)

# Explicit axis and legend titles
layout_titles = {
    "xaxis_title": "Gender",
    "yaxis_title": "Total ASD Screening Score",
    "legend_title": "Gender",
}
fig.update_layout(**layout_titles)
fig.show()

## Is there a relationship between family history of autism and the ASD screening outcomes?

In [41]:
# Derive a readable family-history label from the "austim" column (the column
# name is spelled that way in the dataset): "yes" -> "With Family History",
# any other value -> "No Family History".
autism_df["family_history"] = autism_df["austim"].apply(
    lambda x: "With Family History" if x == "yes" else "No Family History"
)

# Aggregate to one row per (family history, ASD outcome) pair before plotting.
# px.bar draws one rectangle per input row, so passing the raw frame builds
# each bar out of unit-height slices whose hover text reads "count=1" rather
# than the group total.
family_history_counts = (
    autism_df.groupby(["family_history", "ASD"]).size().reset_index(name="count")
)

# Stacked bar chart: one bar per family-history group, split by ASD outcome
fig = px.bar(
    family_history_counts,
    x="family_history",
    y="count",
    color="ASD",
    title="Relationship Between Family History of Autism and ASD Outcomes",
    labels={
        "family_history": "Family History of Autism",
        "ASD": "ASD Screening Outcome",
        "count": "Count of Screening Outcomes",
    },
    barmode="stack",
)

fig.update_layout(
    xaxis_title="Family History of Autism",
    yaxis_title="Count of Screening Outcomes",
    legend_title="ASD Outcome",
)
fig.show()

## How does the presence of jaundice at birth relate to ASD diagnosis?

In [42]:
# Aggregate to one row per (jaundice, ASD outcome) pair before plotting.
# px.bar draws one rectangle per input row, so passing the raw frame builds
# each bar out of unit-height slices whose hover text reads "count=1" rather
# than the group total.
jaundice_counts = (
    autism_df.groupby(["jundice", "ASD"]).size().reset_index(name="count")
)

# Grouped bar chart: for each jaundice value, one bar per ASD outcome.
# The jaundice column is spelled "jundice" in the dataset.
fig = px.bar(
    jaundice_counts,
    x="jundice",
    y="count",
    color="ASD",
    barmode="group",
    title="Relationship Between Jaundice at Birth and ASD Diagnosis",
    labels={
        "jundice": "Jaundice at Birth",
        "ASD": "ASD Diagnosis Outcome",
        "count": "Count of ASD Outcomes",
    },
    # Fix the category order so "yes"/"YES" always come first
    category_orders={"jundice": ["yes", "no"], "ASD": ["YES", "NO"]},
)

fig.update_layout(
    xaxis_title="Jaundice at Birth",
    yaxis_title="Count of ASD Outcomes",
    legend_title="ASD Diagnosis Outcome",
)
fig.show()

## What are the common traits (based on A1-A10 scores) among children diagnosed with ASD compared to those not diagnosed?

In [43]:
# The ten screening-question columns: A1_Score ... A10_Score
score_columns = [f"A{number}_Score" for number in range(1, 11)]

# Average answer to each question within each ASD group
scores_by_asd = autism_df.groupby("ASD")[score_columns]
mean_scores = scores_by_asd.mean().reset_index()

# Reshape to long form (one row per ASD group and question) for plotting
melted_data = pd.melt(
    mean_scores,
    id_vars="ASD",
    var_name="Question",
    value_name="Average Score",
)

# Grouped bars: for each question, one bar per ASD group
chart_title = (
    "Common Traits Based on A1-A10 Scores Among ASD Diagnosed and Non-Diagnosed Children"
)
axis_labels = {"Question": "Question (A1-A10)", "Average Score": "Average Score"}
fig = px.bar(
    melted_data,
    x="Question",
    y="Average Score",
    color="ASD",
    barmode="group",
    title=chart_title,
    labels=axis_labels,
)

layout_titles = {
    "xaxis_title": "Questions (A1 to A10)",
    "yaxis_title": "Average Score",
    "legend_title": "ASD Diagnosis",
}
fig.update_layout(**layout_titles)
fig.show()

## What is the distribution of total scores by age for individuals who have or have not used the app before?

In [68]:
# Scatter plot of total score against age: one panel per used_app_before
# value, points coloured by gender, with a fitted trendline.
# trendline="ols" needs the statsmodels package:
#   pip install statsmodels
fig = px.scatter(
    autism_df,
    x="age",
    y="total_score",
    color="gender",
    facet_col="used_app_before",
    trendline="ols",
)
fig.show()

## What is the probability of each question being answered 'Yes' among individuals diagnosed with ASD?

In [48]:
# Keep only the rows whose ASD outcome is "YES"
asd_yes = autism_df[autism_df["ASD"] == "YES"]
asd_yes_count = len(asd_yes)

# Every column whose name contains "Score" (matching is case-sensitive)
question_columns = [col for col in autism_df.columns if "Score" in col]

# Share of 1-valued answers per question among the ASD-positive rows, in percent
percentages = []
for col in question_columns:
    percentages.append((asd_yes[col].sum() / asd_yes_count) * 100)

# Bar labels: each percentage rendered with two decimal places
formatted_percentages = [f"{p:.2f}%" for p in percentages]

# One bar per question, labelled with its percentage
yes_share_bar = go.Bar(
    x=question_columns,
    y=percentages,
    text=formatted_percentages,
    textposition="auto",
)
fig_bar = go.Figure([yes_share_bar])
fig_bar.update_layout(
    title_text='Percentage of "Yes" Responses for Each Question Among ASD Diagnosed Individuals',
    xaxis_title="Question",
    yaxis_title="Percentage of 'Yes' Responses (%)",
    yaxis=dict(range=[0, 100]),  # fixed 0-100 scale for percentages
)

fig_bar.show()