### Import Libraries

In [35]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply seaborn's "whitegrid" style to the plots in this notebook.
sns.set_style(style="whitegrid")

In [3]:
### Importing Libraries for Statistical Modeling and Evaluation

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from statsmodels.stats.anova import anova_lm

### Load the dataset

In [11]:
# Locate the raw workbook. The directory can be overridden with the
# SPD_DATA_DIR environment variable so the notebook runs on machines other
# than the original author's; the default keeps the original location.
DATA_DIR = Path(os.environ.get("SPD_DATA_DIR", "/Users/hamdahassan/DATA5100-group-project/data"))
RAW_DATA_PATH = DATA_DIR / "Student_Performance_Data(SPD24).xlsx"

# Fail early with an actionable message instead of a bare path error.
if not RAW_DATA_PATH.exists():
    raise FileNotFoundError(
        f"Could not find {RAW_DATA_PATH}. Set the SPD_DATA_DIR environment "
        "variable to the folder containing Student_Performance_Data(SPD24).xlsx."
    )

student_data = pd.read_excel(RAW_DATA_PATH)

### Explore the contents of the dataset
- Checking for nulls, duplicates, etc.
- Checking for extreme outliers. None exist, which means there is no need to drop or rescale anything.

In [12]:
# Preview the first five rows to sanity-check the load.
student_data.head(n=5)

Unnamed: 0,Student ID,Gender,Age,Grade Level,Attendance Rate,Study Hours,Parental Education Level,Parental Involvement,Extracurricular Activities,Socioeconomic Status,...,Bullying Incidents,Special Education Services,Counseling Services,Learning Disabilities,Behavioral Issues,Attendance of Tutoring Sessions,School Climate,Parental Employment Status,Household Size,Performance Score
0,1,Male,15,12,80.4878,2.764496,Bachelor's,High,Yes,High,...,1,No,Yes,Yes,Yes,No,Negative,Unemployed,3,Low
1,2,Female,17,12,96.242678,4.534785,Bachelor's,High,Yes,Low,...,3,Yes,Yes,No,Yes,No,Negative,Employed,3,Medium
2,3,Male,14,9,84.649681,2.008148,Bachelor's,Medium,Yes,Low,...,0,Yes,No,Yes,No,No,Neutral,Employed,3,High
3,4,Male,14,10,86.158599,3.698293,High School,High,No,Low,...,3,Yes,No,No,Yes,No,Positive,Employed,4,Medium
4,5,Male,15,10,88.487638,3.408604,Associate,Low,No,Middle,...,1,Yes,Yes,No,No,No,Positive,Employed,6,Low


### About the dataset
- We don't have any missing values in any of the rows.
- Data includes both numerical and categorical variables

In [13]:
# Print the frame summary: row count, column names, non-null counts and dtypes.
student_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98000 entries, 0 to 97999
Data columns (total 41 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Student ID                       98000 non-null  int64  
 1   Gender                           98000 non-null  object 
 2   Age                              98000 non-null  int64  
 3   Grade Level                      98000 non-null  int64  
 4   Attendance Rate                  98000 non-null  float64
 5   Study Hours                      98000 non-null  float64
 6   Parental Education Level         98000 non-null  object 
 7   Parental Involvement             98000 non-null  object 
 8   Extracurricular Activities       98000 non-null  object 
 9   Socioeconomic Status             98000 non-null  object 
 10  Previous Academic Performance    98000 non-null  float64
 11  Class Participation              98000 non-null  object 
 12  Health Status     

In [18]:
# Count rows that are exact repeats of an earlier row (the first occurrence is not counted).
duplicate_mask = student_data.duplicated(keep='first')
duplicate_mask.sum()

np.int64(0)

In [20]:
# Summary statistics for the numeric columns, with the quartiles spelled out explicitly.
student_data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Student ID,Age,Grade Level,Attendance Rate,Study Hours,Previous Academic Performance,Hours of Sleep,Homework Completion Rate,Reading Proficiency,Math Proficiency,Science Proficiency,Language Proficiency,Screen Time,Bullying Incidents,Household Size
count,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0,98000.0
mean,49000.5,16.001949,10.498245,87.529836,2.996621,80.055114,7.00339,79.981318,79.939456,79.9447,80.019677,80.004838,2.504328,1.996735,3.993878
std,28290.307527,1.415319,1.116604,7.220848,1.153689,11.531197,1.156349,11.545166,11.540462,11.552735,11.530846,11.543158,0.864795,1.419676,1.413414
min,1.0,14.0,9.0,75.000057,1.000009,60.002122,5.000038,60.000437,60.000427,60.001036,60.000204,60.000327,1.000027,0.0,2.0
25%,24500.75,15.0,10.0,81.279963,1.99528,70.12731,5.999395,69.927298,69.917734,69.942322,70.029975,70.053283,1.759772,1.0,3.0
50%,49000.5,16.0,10.0,87.528388,2.994967,80.055771,7.00693,79.996518,79.932998,79.956681,80.011503,79.959596,2.50817,2.0,4.0
75%,73500.25,17.0,11.0,93.771893,3.995063,90.036651,8.004904,89.977932,89.882934,89.903344,90.015863,89.994592,3.251803,3.0,5.0
max,98000.0,18.0,12.0,99.999482,4.999978,99.999797,8.999964,99.998589,99.999538,99.999266,99.999967,99.999453,3.99993,4.0,6.0


### Exporting the dataset as csv

In [28]:
# Write the frame to CSV in the working directory, leaving out the row index.
student_data.to_csv(path_or_buf='clean_student_data.csv', index=False)

In [None]:
# Reload the exported CSV; the rest of the analysis works from this copy.
df = pd.read_csv(filepath_or_buffer='clean_student_data.csv')

### Exploring the relationship between Attendance Rate and student Performance Score

In [55]:
# Keep only the two columns needed for the attendance-vs-performance comparison.
attendance_columns = ['Attendance Rate', 'Performance Score']
attendance_performance = df[attendance_columns]

In [56]:
# First five rows of the two-column subset.
attendance_performance.head(n=5)

Unnamed: 0,Attendance Rate,Performance Score
0,80.4878,Low
1,96.242678,Medium
2,84.649681,High
3,86.158599,Medium
4,88.487638,Low


In [57]:
# Last five rows of the two-column subset.
attendance_performance.tail(n=5)

Unnamed: 0,Attendance Rate,Performance Score
97995,86.889422,Medium
97996,99.268557,High
97997,84.697214,Medium
97998,81.720266,High
97999,91.174452,Medium


### Relationship between attendance and Performance score
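- The mean and median attendance rates are almost identical across the three performance-score groups (High, Low, Medium), and the standard deviations are similar too. This suggests that attendance rate alone is not a good predictor of student performance.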

In [62]:
# Group attendance by performance score, then summarise each group's
# centre (mean, median) and spread (std) so the groups can be compared.
attendance_by_score = attendance_performance.groupby('Performance Score')['Attendance Rate']
attendance_means = attendance_by_score.agg(['mean', 'median', 'std'])
attendance_means

Unnamed: 0_level_0,mean,median,std
Performance Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,87.542683,87.593746,7.215052
Low,87.523737,87.51413,7.23235
Medium,87.520985,87.468477,7.219274
