# Final Project
#### CSCI 608
#### Jacob Dodson

In [116]:
import pandas as pd
import altair as alt
import numpy as np

### **<u>Problem Formulation:</u>**

#### ***1. Descriptive Question:*** What are the overall patterns of academic performance (G3) in Math and Portuguese, and how many students fall into “at-risk” performance categories (G3 < 10)?

#### ***2. Exploratory Question:*** Which external student factors, such as study time, family support, or alcohol use, show the strongest relationships with being academically at risk (G3 < 10)?

#### ***3. Predictive Question:*** Can we build a regression model that predicts whether a student is at risk of failing (G3 < 10) without using G1 or G2, based only on demographic, behavioral, and/or family features?

#### ***4. Inferential Question:*** Do students with strong family support, measured through high family relationship quality (famrel), parental cohabitation (Pstatus), and family educational support (famsup), have significantly higher final grades (G3) compared to students with weaker support, based on bootstrap confidence intervals?

### **<u>Data Collection:</u>**

#### Data was collected from the UCI Machine Learning Repository: [Student Performance](https://archive.ics.uci.edu/dataset/320/student+performance), a Portuguese student performance dataset covering students in secondary education (high school).

#### The datasets combine objective school records, such as grades (G1, G2, G3) and absences, with a range of self-reported questionnaire data, including demographic, behavioral, family, educational support, and social/emotional variables.

### **<u>Data Preparation:</u>**

In [2]:
# Load the math and Portuguese course tables; both CSVs are semicolon-delimited.
mat, por = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ("student-mat.csv", "student-por.csv")
)

In [3]:
# Preview the first rows of the math table to check that the columns parsed as expected.
mat.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


#### Math dataset has 395 student performance observations

In [5]:
# Preview the first rows of the Portuguese table to check that the columns parsed as expected.
por.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


#### Portuguese dataset has 649 student performance observations

#### Both datasets are tidy: each row corresponds to one student in a specific course and each column represents a single variable, so there is exactly one observation per row.

### **<u>Analysis:</u>**

#### ***1. Descriptive and Exploratory (data visualizations):***

In [6]:
# Average final grade (G3) per course.
mat_g3_mean, por_g3_mean = (frame["G3"].mean() for frame in (mat, por))

In [7]:
# Median final grade (G3) per course.
mat_g3_median, por_g3_median = (frame["G3"].median() for frame in (mat, por))

In [8]:
# How many students received each final grade, ordered by grade value.
mat_g3_counts, por_g3_counts = (
    frame["G3"].value_counts().sort_index() for frame in (mat, por)
)

In [22]:
# Students flagged as at-risk (G3 < 10) in each course.
mat_at_risk_count = mat["G3"].lt(10).sum()
por_at_risk_count = por["G3"].lt(10).sum()

In [23]:
# Row count (number of student records) per course.
mat_total, por_total = mat.shape[0], por.shape[0]

In [24]:
# At-risk share of each course, expressed as a percentage.
mat_at_risk_pct, por_at_risk_pct = (
    at_risk / total * 100
    for at_risk, total in (
        (mat_at_risk_count, mat_total),
        (por_at_risk_count, por_total),
    )
)

**Math (G3) Performance Stats**

In [25]:
# Report the mean math final grade to two decimals (stray comma removed from the label).
print(f"Mean G3: {mat_g3_mean:.2f}")

Mean G3:, 10.42


In [26]:
# Report the median math final grade to two decimals (stray comma removed from the label).
print(f"Median G3: {mat_g3_median:.2f}")

Median G3:, 11.00


In [27]:
# Print a heading, then let the math grade-frequency table render as the cell's result.
print("Distribution of G3")
mat_g3_counts

Distribution of G3


G3
0     38
4      1
5      7
6     15
7      9
8     32
9     28
10    56
11    47
12    31
13    31
14    27
15    33
16    16
17     6
18    12
19     5
20     1
Name: count, dtype: int64

In [30]:
# Summarise the math at-risk group as "count/total (percent)".
print(
    f"At-risk math students (G3 < 10): {mat_at_risk_count}/{mat_total} "
    f"({mat_at_risk_pct:.1f}%)"
)

At-risk math students (G3 < 10): 130/395 (32.9%)


**Portuguese (G3) Performance Stats**

In [31]:
# Report the mean Portuguese final grade to two decimals (stray comma removed from the label).
print(f"Mean G3: {por_g3_mean:.2f}")

Mean G3:, 11.91


In [32]:
# Report the median Portuguese final grade to two decimals (stray comma removed from the label).
print(f"Median G3: {por_g3_median:.2f}")

Median G3:, 12.00


In [33]:
# Print a heading, then let the Portuguese grade-frequency table render as the cell's result.
print("Distribution of G3")
por_g3_counts

Distribution of G3


G3
0      15
1       1
5       1
6       3
7      10
8      35
9      35
10     97
11    104
12     72
13     82
14     63
15     49
16     36
17     29
18     15
19      2
Name: count, dtype: int64

In [76]:
# Summarise the Portuguese at-risk group as "count/total (percent)".
print(
    f"At-risk Portuguese students (G3 < 10): {por_at_risk_count}/{por_total} "
    f"({por_at_risk_pct:.1f}%)"
)

At-risk Portuguese students (G3 < 10): 100/649 (15.4%)


In [77]:
# Tag every row with its course so the two tables stay distinguishable once combined.
for course_frame, course_name in ((mat, 'Math'), (por, 'Portuguese')):
    course_frame['subject'] = course_name

In [104]:
# Pool both courses into one frame; the 'subject' column identifies the course.
df_plot = pd.concat([mat, por], ignore_index=True)

# One bin specification shared by the x-axis and the tooltip so both describe the same bars.
g3_bins = alt.Bin(maxbins=20)

overlap_hist = alt.Chart(df_plot).mark_bar(opacity=0.5).encode(
    x=alt.X("G3:Q", bin=g3_bins, title="Final Grade (G3)"),
    # stack=None draws both subjects from a common zero baseline. Altair's default
    # stacks colour-encoded bars, which contradicts the "Overlapped" title and makes
    # the upper series unreadable as a distribution.
    y=alt.Y("count()", stack=None, title="Count"),
    color=alt.Color("subject:N", title="Subject"),
    # Tooltip reports the bar's bin and its count instead of a raw, un-aggregated G3 field.
    tooltip=[
        alt.Tooltip("subject:N", title="Subject"),
        alt.Tooltip("G3:Q", bin=g3_bins, title="Final Grade (G3)"),
        alt.Tooltip("count()", title="Count"),
    ],
).properties(
    width=800,
    height=600,
    title="Overlapped Distribution of Final Grades (G3): Math vs Portuguese"
)

# Dashed vertical rule at the at-risk cutoff used throughout the notebook (G3 < 10).
cutoff_df = pd.DataFrame({"cutoff": [10]})

cutoff_line = alt.Chart(cutoff_df).mark_rule(
    color="black",
    strokeWidth=4,
    strokeDash=[6,4]
).encode(
    x="cutoff:Q"
)

# Layer the rule over the histogram and display the result.
hist_chart = overlap_hist + cutoff_line
hist_chart

In [105]:
# Pooled frame with a boolean at-risk flag (G3 < 10) and an integer copy of it
# that can be averaged to get an at-risk proportion.
df_ar = pd.concat([mat, por], ignore_index=True).assign(
    at_risk=lambda pooled: pooled["G3"] < 10,
    at_risk_int=lambda pooled: pooled["at_risk"].astype(int),
)

In [106]:
# Line chart of the at-risk proportion (mean of the 0/1 flag) at each study-time level.
study_risk = (
    alt.Chart(
        df_ar,
        title='Proportion of At-Risk Students by Study Time',
        width=600,
        height=400,
    )
    .transform_aggregate(
        at_risk_rate='mean(at_risk_int)',
        groupby=['studytime'],
    )
    .mark_line()
    .encode(
        alt.X('studytime:O', title='Study Time Level (1–4)'),
        alt.Y('at_risk_rate:Q', title='Proportion At-Risk', axis=alt.Axis(format='%')),
        tooltip=['studytime:O', alt.Tooltip('at_risk_rate:Q', format='.1%')],
    )
)
study_risk

In [107]:
# At-risk proportion (mean of the 0/1 flag) per study-time level, drawn as a line
# with point markers so each level's value is individually hoverable.
study_time = alt.Chart(df_ar).transform_aggregate(
    at_risk_rate='mean(at_risk_int)',
    groupby=['studytime']
).mark_line(point=True).encode(
    # studytime is a four-level scale; the axis title previously claimed 1–5.
    x=alt.X('studytime:O', title='Study Time (1–4)'),
    y=alt.Y('at_risk_rate:Q', title='Proportion At-Risk', axis=alt.Axis(format='%')),
    tooltip=['studytime:O', alt.Tooltip('at_risk_rate:Q', format='.1%')]
).properties(
    width=600,
    height=400,
    title='Proportion At-Risk by Study Time'
)
study_time

In [108]:
# At-risk proportion (mean of the 0/1 flag) for each family-relationship rating.
famrel_risk = (
    alt.Chart(
        df_ar,
        title='Proportion At-Risk by Family Relationship Quality',
        width=600,
        height=400,
    )
    .transform_aggregate(
        at_risk_rate='mean(at_risk_int)',
        groupby=['famrel'],
    )
    .mark_line(point=True)
    .encode(
        alt.X('famrel:O', title='Family Relationship Quality (1–5)'),
        alt.Y('at_risk_rate:Q', title='Proportion At-Risk', axis=alt.Axis(format='%')),
        tooltip=['famrel:O', alt.Tooltip('at_risk_rate:Q', format='.1%')],
    )
)
famrel_risk

In [109]:
# At-risk proportion (mean of the 0/1 flag) for each workday alcohol-use level.
dalc_risk = (
    alt.Chart(
        df_ar,
        title='Proportion At-Risk by Workday Alcohol Use',
        width=600,
        height=400,
    )
    .transform_aggregate(
        at_risk_rate='mean(at_risk_int)',
        groupby=['Dalc'],
    )
    .mark_line(point=True)
    .encode(
        alt.X('Dalc:O', title='Workday Alcohol Use (1–5)'),
        alt.Y('at_risk_rate:Q', title='Proportion At-Risk', axis=alt.Axis(format='%')),
        tooltip=['Dalc:O', alt.Tooltip('at_risk_rate:Q', format='.1%')],
    )
)
dalc_risk

In [110]:
# At-risk proportion (mean of the 0/1 flag) for each weekend alcohol-use level.
walc_risk = (
    alt.Chart(
        df_ar,
        title='Proportion At-Risk by Weekend Alcohol Use',
        width=600,
        height=400,
    )
    .transform_aggregate(
        at_risk_rate='mean(at_risk_int)',
        groupby=['Walc'],
    )
    .mark_line(point=True)
    .encode(
        alt.X('Walc:O', title='Weekend Alcohol Use (1–5)'),
        alt.Y('at_risk_rate:Q', title='Proportion At-Risk', axis=alt.Axis(format='%')),
        tooltip=['Walc:O', alt.Tooltip('at_risk_rate:Q', format='.1%')],
    )
)
walc_risk

#### ***2. Predictive (Regression):***

#### ***3. Inferential (Bootstrap):***

In [153]:
def bootstrap_mean_diff(group1, group2, boot=5000, random_state=42):
    """Bootstrap the difference in means between two samples.

    Each group is resampled with replacement at its own size, ``boot`` times,
    and ``mean(group1 resample) - mean(group2 resample)`` is recorded for each
    replicate. A 95% percentile interval is taken from those replicates.

    Parameters
    ----------
    group1, group2 : array-like of numbers
        One-dimensional samples to compare (lists, arrays, or pandas Series).
    boot : int, default 5000
        Number of bootstrap replicates; must be at least 1.
    random_state : default 42
        Seed passed to ``np.random.default_rng`` so results are reproducible.

    Returns
    -------
    tuple
        ``(mean_of_replicate_differences, lower, upper)`` where ``lower`` and
        ``upper`` are the 2.5th and 97.5th percentiles of the replicates.

    Raises
    ------
    ValueError
        If either group is empty or ``boot`` is less than 1. Without this
        check an empty group produces NaN statistics, which callers can
        silently misread as "interval contains zero".
    """
    sample1 = np.asarray(group1)
    sample2 = np.asarray(group2)
    if sample1.size == 0 or sample2.size == 0:
        raise ValueError("bootstrap_mean_diff requires two non-empty groups")
    if boot < 1:
        raise ValueError("boot must be at least 1")

    rng = np.random.default_rng(random_state)
    n1, n2 = len(sample1), len(sample2)
    diffs = np.empty(boot, dtype=float)

    for i in range(boot):
        # Draw group1 first, then group2, every iteration: keeping this order
        # means a given seed reproduces the same replicates as before.
        boot1 = rng.choice(sample1, size=n1, replace=True)
        boot2 = rng.choice(sample2, size=n2, replace=True)
        diffs[i] = np.mean(boot1) - np.mean(boot2)

    lower, upper = np.percentile(diffs, [2.5, 97.5])
    return np.mean(diffs), lower, upper

In [154]:
# Pool the math and Portuguese rows into one frame for the bootstrap comparisons.
# NOTE(review): the UCI dataset description reports that a number of students appear
# in both files, so pooled rows may not be independent observations — confirm before
# interpreting the confidence intervals.
df_inf = pd.concat([mat, por], ignore_index=True)

In [174]:
# Each entry maps a display label to two boolean row masks over df_inf.
# "strong" and "weak" are only names for the first and second group in the label
# (e.g. GP vs MS); they do not imply that the first group is expected to do better.
comparisons = {
    "Family Relationship Quality (famrel ≥ 4 vs ≤ 3)": {
        "strong": df_inf["famrel"] >= 4,
        "weak":   df_inf["famrel"] <= 3
    },
    "Parental Cohabitation (Together vs Apart)": {
        "strong": df_inf["Pstatus"] == "T",
        "weak":   df_inf["Pstatus"] == "A"
    },
    "Family Educational Support (yes vs no)": {
        "strong": df_inf["famsup"] == "yes",
        "weak":   df_inf["famsup"] == "no"
    },
    "Going Out (goout ≥ 4 vs ≤ 3)": {
        "strong": df_inf["goout"] >= 4,
        "weak":   df_inf["goout"] <= 3
    },
    "Health Status (health ≥ 4 vs ≤ 3)": {
        "strong": df_inf["health"] >= 4,
        "weak":   df_inf["health"] <= 3
    },
    "Internet Access (yes vs no)": {
        "strong": df_inf["internet"] == "yes",
        "weak":   df_inf["internet"] == "no"
    },
    "Attended Nursery School (yes vs no)": {
        "strong": df_inf["nursery"] == "yes",
        "weak":   df_inf["nursery"] == "no"
    },
    "Address Type (Urban vs Rural)": {
        "strong": df_inf["address"] == "U",
        "weak":   df_inf["address"] == "R"
    },
    "School (GP vs MS)": {
        "strong": df_inf["school"] == "GP",
        "weak":   df_inf["school"] == "MS"
    },
    # Label previously contained a run of three spaces before the parenthesis.
    "Extra Educational Support (yes vs no)": {
        "strong": df_inf["schoolsup"] == "yes",
        "weak":   df_inf["schoolsup"] == "no"
    }
}

In [175]:
def _summarise_comparison(label, rule):
    """Bootstrap the G3 gap between a comparison's two groups and return its table row."""
    g3_strong = df_inf.loc[rule["strong"], "G3"]
    g3_weak = df_inf.loc[rule["weak"], "G3"]

    diff, low, high = bootstrap_mean_diff(g3_strong, g3_weak)

    # Flag the gap when the interval lies entirely above or entirely below zero.
    verdict = "Yes" if low > 0 or high < 0 else "No"
    return {
        "Comparison": label,
        "Mean Difference (Strong - Weak)": round(diff, 2),
        "95% CI Low": round(low, 2),
        "95% CI High": round(high, 2),
        "Significant?": verdict,
    }


# One row per comparison, in the order the comparisons were declared.
results = [_summarise_comparison(label, rule) for label, rule in comparisons.items()]

In [176]:
# Collect the per-comparison rows into a table and display it as the cell's result.
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Comparison,Mean Difference (Strong - Weak),95% CI Low,95% CI High,Significant?
0,Family Relationship Quality (famrel ≥ 4 vs ≤ 3),0.65,0.08,1.23,Yes
1,Parental Cohabitation (Together vs Apart),-0.37,-1.05,0.33,No
2,Family Educational Support (yes vs no),0.11,-0.38,0.6,No
3,Going Out (goout ≥ 4 vs ≤ 3),-0.94,-1.44,-0.45,Yes
4,Health Status (health ≥ 4 vs ≤ 3),-0.32,-0.8,0.15,No
5,Internet Access (yes vs no),1.02,0.44,1.62,Yes
6,Attended Nursery School (yes vs no),0.39,-0.19,0.98,No
7,Address Type (Urban vs Rural),1.02,0.49,1.56,Yes
8,School (GP vs MS),1.11,0.59,1.65,Yes
9,Extra Educational Support (yes vs no),-0.96,-1.51,-0.42,Yes
