In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the dataset
data = pd.read_csv('Resources/student-mat.csv', delimiter=';')

In [3]:
# Remove unnecessary columns
columns_to_remove = ["G1", "G2", "Walc", "address", "famrel", "Dalc", "guardian", "famsize"]
data.drop(columns=[col for col in columns_to_remove if col in data.columns], errors='ignore', inplace=True)

In [4]:
# Rename columns
data.rename(columns={"G3": "final_grade", "studytime": "Study_Time_Hours", "Fedu": "Father_Edu", "Medu": "Mother_Edu", "Pstatus": "Parent_status"}, inplace=True)

In [5]:
# Convert 'yes'/'no' to 0's and 1's
binary_columns = ['schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
for col in binary_columns:
    if col in data.columns:
        data[col] = data[col].map({'yes': 1, 'no': 0})

In [6]:
# Handle missing values: Fill with mean for 'absences'
if 'absences' in data.columns:
    data['absences'].fillna(data['absences'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['absences'].fillna(data['absences'].mean(), inplace=True)


In [7]:
# Normalize numerical features (example: scale 'absences' between 0 and 1)
# scaler = MinMaxScaler()
# numerical_columns = ['absences', 'Study_Time_Hours', 'final_grade']
# for col in numerical_columns:
#     if col in data.columns:
#         data[col] = scaler.fit_transform(data[[col]])

In [8]:
# Show the first few rows of the cleaned data
data.head()

Unnamed: 0,school,sex,age,Parent_status,Mother_Edu,Father_Edu,Mjob,Fjob,reason,traveltime,...,activities,nursery,higher,internet,romantic,freetime,goout,health,absences,final_grade
0,GP,F,18,A,4,4,at_home,teacher,course,2,...,0,1,1,0,0,3,4,3,6,6
1,GP,F,17,T,1,1,at_home,other,course,1,...,0,0,1,1,0,3,3,3,4,6
2,GP,F,15,T,1,1,at_home,other,other,1,...,0,1,1,1,0,3,2,3,10,10
3,GP,F,15,T,4,2,health,services,home,1,...,1,1,1,1,1,2,2,5,2,15
4,GP,F,16,T,3,3,other,other,home,1,...,0,1,1,0,0,3,2,5,4,10


In [9]:
# Export the cleaned dataset
data.to_csv('cleaned_student_mat.csv', index=False)
