In [1]:
from pathlib import Path

import pandas as pd

# Load the raw study-performance table. The path is relative, so it resolves
# against the directory the notebook kernel is running in.
raw_csv_path = '../data/raw/study_performance.csv'
student_df = pd.read_csv(raw_csv_path)

# Bare last expression so the notebook renders the loaded frame.
student_df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [2]:
# Variable to predict: Score
# Collect the per-subject score columns that the target is derived from.
# Matching on the '_score' suffix (rather than plain 'score') keeps the derived
# 'score' column, which a later cell adds to student_df, out of this list if the
# cell is ever re-run after that point.
score_columns = [column for column in student_df.columns if column.endswith('_score')]
score_columns

['math_score', 'reading_score', 'writing_score']

In [3]:
# Build the prediction target: add the subject scores up per row, divide the
# total by 30, and round to zero decimals (the result stays a float column).
# NOTE(review): 30 looks like 3 subjects x 10, i.e. the mean score on a 0-10
# scale — confirm before changing the number of score columns.
student_df['score'] = student_df[score_columns].sum(axis=1).div(30).round()
student_df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,score
0,female,group B,bachelor's degree,standard,none,72,72,74,7.0
1,female,group C,some college,standard,completed,69,90,88,8.0
2,female,group B,master's degree,standard,none,90,95,93,9.0
3,male,group A,associate's degree,free/reduced,none,47,57,44,5.0
4,male,group C,some college,standard,none,76,78,75,8.0
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,9.0
996,male,group C,high school,free/reduced,none,62,55,55,6.0
997,female,group C,high school,free/reduced,completed,59,71,65,6.0
998,female,group D,some college,standard,completed,68,78,77,7.0


In [4]:
# Drop the per-subject columns listed in score_columns; the combined 'score'
# column is the only target kept from here on.
# Reassign the result instead of using inplace=True so the cell reads as a
# plain transformation of student_df.
student_df = student_df.drop(columns=score_columns)
student_df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,score
0,female,group B,bachelor's degree,standard,none,7.0
1,female,group C,some college,standard,completed,8.0
2,female,group B,master's degree,standard,none,9.0
3,male,group A,associate's degree,free/reduced,none,5.0
4,male,group C,some college,standard,none,8.0
...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,9.0
996,male,group C,high school,free/reduced,none,6.0
997,female,group C,high school,free/reduced,completed,6.0
998,female,group D,some college,standard,completed,7.0


## **Categorical Variables**

In [16]:
from sklearn.preprocessing import OneHotEncoder

# Shared one-hot encoder for the nominal columns. sparse_output=False makes
# fit_transform return a dense NumPy array that can be passed to pd.DataFrame.
encoder = OneHotEncoder(sparse_output=False)

In [6]:
# One-hot encode gender. The double brackets select a single-column DataFrame,
# which is what the encoder is fitted on and then transforms in the same call.
gender_frame = student_df[['gender']]
encoded_data = encoder.fit_transform(gender_frame)
encoded_data

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], shape=(1000, 2))

In [7]:
# Wrap the encoded array in a DataFrame whose column names come from the fitted
# encoder (one 'gender_<category>' column per category).
gender_dummy_names = encoder.get_feature_names_out(['gender'])
encoded_df = pd.DataFrame(data=encoded_data, columns=gender_dummy_names)
encoded_df

Unnamed: 0,gender_female,gender_male
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0
...,...,...
995,1.0,0.0
996,0.0,1.0
997,1.0,0.0
998,1.0,0.0


In [8]:
# Swap the raw gender column for its one-hot columns: remove 'gender' from
# student_df, then append encoded_df's columns side by side (axis=1).
student_df = pd.concat([student_df.drop(columns=['gender']), encoded_df], axis=1)
student_df

Unnamed: 0,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,score,gender_female,gender_male
0,group B,bachelor's degree,standard,none,7.0,1.0,0.0
1,group C,some college,standard,completed,8.0,1.0,0.0
2,group B,master's degree,standard,none,9.0,1.0,0.0
3,group A,associate's degree,free/reduced,none,5.0,0.0,1.0
4,group C,some college,standard,none,8.0,0.0,1.0
...,...,...,...,...,...,...,...
995,group E,master's degree,standard,completed,9.0,1.0,0.0
996,group C,high school,free/reduced,none,6.0,0.0,1.0
997,group C,high school,free/reduced,completed,6.0,1.0,0.0
998,group D,some college,standard,completed,7.0,1.0,0.0


In [9]:
# One-hot encode race_ethnicity and swap the raw column for its indicator columns.
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(student_df[['race_ethnicity']])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['race_ethnicity']),
    # Reuse student_df's index: concat(axis=1) aligns rows by index label, not
    # by position, so a fresh RangeIndex here would misalign any re-indexed frame.
    index=student_df.index,
)
# Drop the raw column and append the encoded ones in a single, non-mutating step.
student_df = pd.concat([student_df.drop(columns=['race_ethnicity']), encoded_df], axis=1)
student_df

Unnamed: 0,parental_level_of_education,lunch,test_preparation_course,score,gender_female,gender_male,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E
0,bachelor's degree,standard,none,7.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,some college,standard,completed,8.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,master's degree,standard,none,9.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,associate's degree,free/reduced,none,5.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,some college,standard,none,8.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
995,master's degree,standard,completed,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
996,high school,free/reduced,none,6.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
997,high school,free/reduced,completed,6.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
998,some college,standard,completed,7.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# One-hot encode lunch and swap the raw column for its indicator columns.
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(student_df[['lunch']])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['lunch']),
    # Reuse student_df's index: concat(axis=1) aligns rows by index label, not
    # by position, so a fresh RangeIndex here would misalign any re-indexed frame.
    index=student_df.index,
)
# Drop the raw column and append the encoded ones in a single, non-mutating step.
student_df = pd.concat([student_df.drop(columns=['lunch']), encoded_df], axis=1)
student_df

Unnamed: 0,parental_level_of_education,test_preparation_course,score,gender_female,gender_male,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E,lunch_free/reduced,lunch_standard
0,bachelor's degree,none,7.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,some college,completed,8.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,master's degree,none,9.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,associate's degree,none,5.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,some college,none,8.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,master's degree,completed,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
996,high school,none,6.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
997,high school,completed,6.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
998,some college,completed,7.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [11]:
# One-hot encode test_preparation_course and swap the raw column for its
# indicator columns.
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(student_df[['test_preparation_course']])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['test_preparation_course']),
    # Reuse student_df's index: concat(axis=1) aligns rows by index label, not
    # by position, so a fresh RangeIndex here would misalign any re-indexed frame.
    index=student_df.index,
)
# Drop the raw column and append the encoded ones in a single, non-mutating step.
student_df = pd.concat(
    [student_df.drop(columns=['test_preparation_course']), encoded_df],
    axis=1,
)
student_df

Unnamed: 0,parental_level_of_education,score,gender_female,gender_male,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E,lunch_free/reduced,lunch_standard,test_preparation_course_completed,test_preparation_course_none
0,bachelor's degree,7.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,some college,8.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,master's degree,9.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,associate's degree,5.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,some college,8.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,master's degree,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
996,high school,6.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
997,high school,6.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
998,some college,7.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


## Ordinal Encoding

In [12]:
student_df['parental_level_of_education'].value_counts()

parental_level_of_education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

In [13]:
from sklearn.preprocessing import OrdinalEncoder

# Education levels listed from lowest to highest; OrdinalEncoder assigns codes
# by position in this list, so "some high school" -> 0.0 ... "master's degree" -> 5.0.
ordinal_categories = ["some high school", "high school", "some college", "associate's degree", "bachelor's degree", "master's degree"]
ordinal_encoder = OrdinalEncoder(categories=[ordinal_categories])
encoded_data = ordinal_encoder.fit_transform(student_df[['parental_level_of_education']])
df_encoded = pd.DataFrame(
    encoded_data,
    columns=['parental_level_of_education_encoded'],
    # Reuse student_df's index: concat(axis=1) aligns rows by index label, not
    # by position, so a fresh RangeIndex here would misalign any re-indexed frame.
    index=student_df.index,
)
# Replace the raw text column with its ordinal code in a single, non-mutating step.
student_df = pd.concat(
    [student_df.drop(columns=['parental_level_of_education']), df_encoded],
    axis=1,
)
student_df


Unnamed: 0,score,gender_female,gender_male,race_ethnicity_group A,race_ethnicity_group B,race_ethnicity_group C,race_ethnicity_group D,race_ethnicity_group E,lunch_free/reduced,lunch_standard,test_preparation_course_completed,test_preparation_course_none,parental_level_of_education_encoded
0,7.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,4.0
1,8.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0
2,9.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,5.0
3,5.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0
4,8.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,5.0
996,6.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
997,6.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
998,7.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,2.0


In [14]:
# Save the cleaned data to a new CSV file.
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists first (a no-op when it is already there).
processed_csv_path = Path('../data/processed/study_performance_cleaned.csv')
processed_csv_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the row index out of the written file.
student_df.to_csv(processed_csv_path, index=False)