### Import & Load the Dataset

In [27]:
import pandas as pd
import numpy as np

In [28]:
# Read the raw student-performance records from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer="StudentsPerformance.csv")

### Data Understanding & Cleaning

In [29]:
# Preview the first five rows to sanity-check column names and value formats.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [30]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [31]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 8)

In [32]:
# The dtype pandas inferred for every column when reading the CSV.
df.dtypes

gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [34]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [35]:
# Discard rows that are exact repeats of an earlier row (the first occurrence is
# kept), then report the resulting dimensions so any removal is visible.
df = df.drop_duplicates(keep="first")
print("new shape:", df.shape)

new shape: (1000, 8)


In [36]:
# Show the distinct levels of every categorical column, one array per line,
# to spot typos or inconsistent labels before analysing them.
categorical_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
for column in categorical_columns:
    print(df[column].unique())

['female' 'male']
['group B' 'group C' 'group A' 'group D' 'group E']
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
['standard' 'free/reduced']
['none' 'completed']


### Feature Engineering

In [37]:
# Combined score: the plain sum of the three subject scores.
df['Total_score'] = df['math score'].add(df['writing score']).add(df['reading score'])

In [38]:
# Express the combined score as a percentage of 300
# (presumably three subjects scored out of 100 each - the describe() maxima are 100).
df['percentage'] = df['Total_score'].div(300).mul(100)

In [39]:

# Bucket every student by overall percentage:
#   High   -> 70 and above
#   Medium -> 50 up to (but excluding) 70
#   Low    -> below 50
conditions = [
    df['percentage'] >= 70,
    (df['percentage'] >= 50) & (df['percentage'] < 70),
    df['percentage'] < 50
]

categories = ['High', 'Medium', 'Low']

# np.select uses `default` for rows where no condition matches (e.g. a NaN
# percentage). Its built-in default is the integer 0, which has no common dtype
# with the string labels: recent NumPy raises TypeError, older NumPy silently
# writes the string '0'. An explicit string default keeps the column textual
# and makes unmatched rows easy to spot.
df['performance_category'] = np.select(conditions, categories, default='Unknown')


In [40]:
# Preview the first five rows again to confirm the three engineered columns were added.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,Total_score,percentage,performance_category
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667,High
1,female,group C,some college,standard,completed,69,90,88,247,82.333333,High
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667,High
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333,Low
4,male,group C,some college,standard,none,76,78,75,229,76.333333,High


### Exploratory Data Analysis

#### Which parental education level is linked with the highest average math score?

In [41]:
# Average math score for each parental education level, best-performing level first.
math_by_parent_education = df.groupby('parental level of education')['math score'].mean()
math_by_parent_education.sort_values(ascending=False)

parental level of education
master's degree       69.745763
bachelor's degree     69.389831
associate's degree    67.882883
some college          67.128319
some high school      63.497207
high school           62.137755
Name: math score, dtype: float64

#### Is there a significant score difference between males and females across all subjects?

In [42]:
# Mean score per subject for each gender. This compares averages only; it is
# not a statistical significance test.
subject_columns = ['math score', 'reading score', 'writing score']
df.groupby('gender')[subject_columns].mean()

Unnamed: 0_level_0,math score,reading score,writing score
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,63.633205,72.608108,72.467181
male,68.728216,65.473029,63.311203


#### How much does completing the test preparation course improve performance in each subject?

In [43]:
# Mean score per subject for students who completed the preparation course
# versus those who did not; the gap between the two rows is the observed uplift.
exam_columns = ['math score', 'reading score', 'writing score']
df.groupby('test preparation course')[exam_columns].mean()

Unnamed: 0_level_0,math score,reading score,writing score
test preparation course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
completed,69.695531,73.893855,74.418994
none,64.077882,66.534268,64.504673


#### Which combination of gender, lunch type, and test preparation status produces the top 10% of scores?

In [44]:
# Keep students at or above the 90th percentile of overall percentage, then count
# how many fall into each (test prep, gender, lunch) combination, largest first.
# These are raw head-counts, so larger groups naturally rank higher.
top_decile_cutoff = df['percentage'].quantile(0.90)
top_decile = df[df['percentage'] >= top_decile_cutoff]
(
    top_decile
    .groupby(['test preparation course', 'gender', 'lunch'])
    .size()
    .sort_values(ascending=False)
)


test preparation course  gender  lunch       
none                     female  standard        31
completed                female  standard        29
                         male    standard        20
none                     male    standard         9
completed                female  free/reduced     6
                         male    free/reduced     3
none                     female  free/reduced     2
                         male    free/reduced     2
dtype: int64

#### Does lunch type have a uniform impact across all race/ethnicity groups, or does its effect vary?

In [45]:
# Mean subject scores for every race/ethnicity x lunch pairing, rounded to one
# decimal so the lunch gap can be compared across groups at a glance.
by_race_and_lunch = df.groupby(['race/ethnicity', 'lunch'])
by_race_and_lunch[['math score', 'reading score', 'writing score']].mean().round(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,math score,reading score,writing score
race/ethnicity,lunch,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
group A,free/reduced,55.2,60.6,57.2
group A,standard,66.0,67.5,66.4
group B,free/reduced,57.4,64.0,61.5
group B,standard,66.9,69.3,67.9
group C,free/reduced,56.4,63.4,61.4
group C,standard,68.9,72.3,71.4
group D,free/reduced,61.1,66.4,66.5
group D,standard,70.9,72.1,72.2
group E,free/reduced,66.6,68.7,67.2
group E,standard,76.8,74.8,73.2


#### What is the correlation between reading and writing scores? Is it stronger than math and writing?

In [46]:
# Pairwise correlation matrix of the three subject scores (pandas' default method).
subject_scores = df[['reading score', 'writing score', 'math score']]
subject_scores.corr()


Unnamed: 0,reading score,writing score,math score
reading score,1.0,0.954598,0.81758
writing score,0.954598,1.0,0.802642
math score,0.81758,0.802642,1.0


#### Identify the top 5% performing students and analyze their demographic profiles. What patterns emerge?

In [47]:
# Students whose overall percentage is at or above the 95th percentile.
top_5_cutoff = df['percentage'].quantile(0.95)
top_5_percent = df[df['percentage'] >= top_5_cutoff]

In [48]:
# Head-count of top performers for every demographic combination that occurs.
profile_columns = ['gender', 'race/ethnicity', 'lunch', 'parental level of education']
top_5_percent.groupby(profile_columns).size()

gender  race/ethnicity  lunch         parental level of education
female  group A         standard      some high school               1
        group B         standard      associate's degree             2
                                      bachelor's degree              1
                                      master's degree                1
                                      some college                   1
        group C         standard      associate's degree             2
                                      bachelor's degree              3
                                      some college                   2
        group D         free/reduced  bachelor's degree              1
                                      master's degree                1
                        standard      associate's degree             1
                                      bachelor's degree              1
                                      high school                    1
           

#### Can we cluster students into performance categories (e.g., low, medium, high performers) using just Pandas logic? If yes, how?

In [49]:
# Number of students in each performance bucket, most common bucket first.
df['performance_category'].value_counts()

performance_category
High      459
Medium    438
Low       103
Name: count, dtype: int64

In [50]:
# Save the de-duplicated frame, including the engineered columns, without the
# row index so the file round-trips cleanly through read_csv.
df.to_csv(path_or_buf='cleaned_student_data.csv', index=False)