In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')



In [3]:
# Load the Dataset
# Read the raw CSV (path is relative to the notebook's working directory)
# and echo the frame so the cell shows a preview of what was loaded.
df = pd.read_csv(filepath_or_buffer="Students_Performance_Dataset.csv")
df

Unnamed: 0,Student_ID,Student_Age,Sex,High_School_Type,Scholarship,Additional_Work,Sports_activity,Transportation,Weekly_Study_Hours,Attendance,Reading,Notes,Listening_in_Class,Project_work,Grade
0,STUDENT1,19-22,Male,Other,50%,Yes,No,Private,0,Always,Yes,Yes,No,No,AA
1,STUDENT2,19-22,Male,Other,50%,Yes,No,Private,0,Always,Yes,No,Yes,Yes,AA
2,STUDENT3,19-22,Male,State,50%,No,No,Private,2,Never,No,No,No,Yes,AA
3,STUDENT4,18,Female,Private,50%,Yes,No,Bus,2,Always,No,Yes,No,No,AA
4,STUDENT5,19-22,Male,Private,50%,No,No,Bus,12,Always,Yes,No,Yes,Yes,AA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,STUDENT141,19-22,Female,State,50%,Yes,Yes,Private,0,Always,No,Yes,No,Yes,CC
141,STUDENT142,18,Female,State,75%,No,No,Private,0,Never,No,Yes,Yes,No,CC
142,STUDENT143,18,Female,Private,75%,No,No,Private,0,Always,Yes,No,No,No,AA
143,STUDENT144,19-22,Female,State,75%,Yes,Yes,Bus,12,Sometimes,No,Yes,No,Yes,CB


In [4]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
# info() prints the report itself, so no print()/display wrapper is needed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Student_ID          145 non-null    object
 1   Student_Age         145 non-null    object
 2   Sex                 145 non-null    object
 3   High_School_Type    145 non-null    object
 4   Scholarship         144 non-null    object
 5   Additional_Work     145 non-null    object
 6   Sports_activity     145 non-null    object
 7   Transportation      145 non-null    object
 8   Weekly_Study_Hours  145 non-null    int64 
 9   Attendance          145 non-null    object
 10  Reading             145 non-null    object
 11  Notes               145 non-null    object
 12  Listening_in_Class  145 non-null    object
 13  Project_work        145 non-null    object
 14  Grade               145 non-null    object
dtypes: int64(1), object(14)
memory usage: 17.1+ KB


In [5]:
# 4. Check Total Missing Values
# Count NaN/None entries per column before any cleaning is applied.
missing_per_column = df.isna().sum()
print("Missing values:\n", missing_per_column)



Missing values:
 Student_ID            0
Student_Age           0
Sex                   0
High_School_Type      0
Scholarship           1
Additional_Work       0
Sports_activity       0
Transportation        0
Weekly_Study_Hours    0
Attendance            0
Reading               0
Notes                 0
Listening_in_Class    0
Project_work          0
Grade                 0
dtype: int64


In [6]:
# 5. Fix Data Types for Specific Columns
# Normalise the text columns to Python strings while keeping missing entries
# as NaN. A plain .astype(str) can turn NaN into the literal string 'nan',
# which isnull()/fillna() no longer recognise as missing, so the value would
# skip imputation and later be label-encoded as if it were a real category.
for text_col in ['Student_Age', 'Scholarship']:
    # where() keeps the stringified value only where the original was present;
    # everything else falls back to NaN.
    df[text_col] = df[text_col].astype(str).where(df[text_col].notna())

# Force study hours to numeric; unparseable entries become NaN so the
# median imputation in the next step can handle them.
df['Weekly_Study_Hours'] = pd.to_numeric(df['Weekly_Study_Hours'], errors='coerce')


In [7]:
# 6. Handle Missing Values
# Categorical columns are imputed with their most frequent value.
cat_cols = [
    'Student_Age', 'Sex', 'High_School_Type', 'Scholarship', 'Additional_Work',
    'Sports_activity', 'Transportation', 'Attendance', 'Reading', 'Notes',
    'Listening_in_Class', 'Project_work', 'Grade',
]

for column in cat_cols:
    # mode() returns every tied value; position 0 is taken as the fill value.
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

# The single numerical column is imputed with its median.
study_hours_median = df['Weekly_Study_Hours'].median()
df['Weekly_Study_Hours'] = df['Weekly_Study_Hours'].fillna(study_hours_median)



In [8]:
# 7. Drop any rows with critical missing values if still present
# Safety net after imputation: remove every row that still has at least one
# missing value in any column. The frame is modified in place.
df.dropna(axis=0, how='any', inplace=True)


In [9]:
# 8. Check Again
# Recount missing entries per column to confirm the cleaning steps worked.
remaining_missing = df.isna().sum()
print("Missing values after cleaning:\n", remaining_missing)


Missing values after cleaning:
 Student_ID            0
Student_Age           0
Sex                   0
High_School_Type      0
Scholarship           0
Additional_Work       0
Sports_activity       0
Transportation        0
Weekly_Study_Hours    0
Attendance            0
Reading               0
Notes                 0
Listening_in_Class    0
Project_work          0
Grade                 0
dtype: int64


In [10]:
# 9. Convert Categorical Variables to Encoded Values
from sklearn.preprocessing import LabelEncoder

label_cols = [
    'Student_Age', 'Sex', 'High_School_Type', 'Scholarship',
    'Additional_Work', 'Sports_activity', 'Transportation',
    'Attendance', 'Reading', 'Notes', 'Listening_in_Class',
    'Project_work', 'Grade',
]

# Fit one encoder per column and keep it, keyed by the original column name,
# so the integer codes can be mapped back with inverse_transform later.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes
# carry no ordinal meaning (e.g. for 'Grade' or 'Attendance') - confirm this
# is acceptable for whatever consumes the encoded columns.
label_encoders = {name: LabelEncoder().fit(df[name]) for name in label_cols}

# Write the codes to a new '<column>_encoded' column beside each original.
for name, encoder in label_encoders.items():
    df[name + '_encoded'] = encoder.transform(df[name])



In [11]:
# 10. Drop original categorical columns (optional, keep if needed for interpretation)
# Only the '<column>_encoded' versions remain afterwards. The drop is in place,
# so re-running this cell without re-running the encoding cell raises KeyError.
df.drop(labels=label_cols, axis=1, inplace=True)


In [12]:
# 11. Check for Duplicates
# Count rows that exactly repeat an earlier row across all remaining columns.
duplicate_count = df.duplicated().sum()
print("Duplicate records:", duplicate_count)



Duplicate records: 0


In [15]:
# 12. Final Check
# info() prints its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
df.info()

# Show a short preview as the cell's rich output instead of printing every
# row of the frame as plain text.
df.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Student_ID                  145 non-null    object
 1   Weekly_Study_Hours          145 non-null    int64 
 2   Student_Age_encoded         145 non-null    int64 
 3   Sex_encoded                 145 non-null    int64 
 4   High_School_Type_encoded    145 non-null    int64 
 5   Scholarship_encoded         145 non-null    int64 
 6   Additional_Work_encoded     145 non-null    int64 
 7   Sports_activity_encoded     145 non-null    int64 
 8   Transportation_encoded      145 non-null    int64 
 9   Attendance_encoded          145 non-null    int64 
 10  Reading_encoded             145 non-null    int64 
 11  Notes_encoded               145 non-null    int64 
 12  Listening_in_Class_encoded  145 non-null    int64 
 13  Project_work_encoded        145 non-null    int64 

In [14]:
# Schema summary of the cleaned frame. info() prints directly and returns
# None, so it is called bare; print(df.info()) would add a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Student_ID                  145 non-null    object
 1   Weekly_Study_Hours          145 non-null    int64 
 2   Student_Age_encoded         145 non-null    int64 
 3   Sex_encoded                 145 non-null    int64 
 4   High_School_Type_encoded    145 non-null    int64 
 5   Scholarship_encoded         145 non-null    int64 
 6   Additional_Work_encoded     145 non-null    int64 
 7   Sports_activity_encoded     145 non-null    int64 
 8   Transportation_encoded      145 non-null    int64 
 9   Attendance_encoded          145 non-null    int64 
 10  Reading_encoded             145 non-null    int64 
 11  Notes_encoded               145 non-null    int64 
 12  Listening_in_Class_encoded  145 non-null    int64 
 13  Project_work_encoded        145 non-null    int64 

In [13]:
# 13. Save the Cleaned Data
# Write the cleaned, encoded frame next to the notebook. index=False keeps
# the row index out of the file so no extra unnamed column appears on reload.
df.to_csv(path_or_buf="academic_achievement_cleaned.csv", index=False)
