In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("student_performance_data.csv")

In [32]:
df.head(5)

Unnamed: 0,studentid,age,gender,ethnicity,parentaleducation,studytimeweekly,absences,tutoring,parentalsupport,extracurricular,sports,music,volunteering,gpa,gradeclass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [10]:
df.(5)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
2387,3388,18,1,0,3,10.680555,2,0,4,1,0,0,0,3.455509,0.0
2388,3389,17,0,0,1,7.583217,4,1,4,0,1,0,0,3.27915,4.0
2389,3390,16,1,0,2,6.8055,20,0,2,0,0,0,1,1.142333,2.0
2390,3391,16,1,1,0,12.416653,17,0,2,0,1,1,0,1.803297,1.0
2391,3392,16,1,0,2,17.819907,13,0,2,0,0,0,1,2.140014,1.0


## Feature Type Identification

The columns were manually inspected by analysing column names and sample values. based on their nature, features are classified into numerical ,categorical ,ordinal and binary types

 | Column Name       | Feature Type             | Reason                                  |
|------------------|--------------------------|-----------------------------------------|
| StudentID        | Categorical (Identifier) | Unique ID, not useful for ML prediction |
| Age              | Numerical                | Integer age values                      |
| Gender           | Categorical              | Labels like Male/Female                 |
| Ethnicity        | Categorical              | Nominal group labels                    |
| ParentalEducation| Ordinal                  | Education levels have order             |
| StudyTimeWeekly  | Numerical                | Continuous numeric value                |
| Absences         | Numerical                | Count-based numeric                     |
| Tutoring         | Binary                   | Yes / No                                |
| ParentalSupport  | Ordinal                  | Levels like Low, Medium, High           |
| Extracurricular  | Binary                   | Yes / No                                |
| Sports           | Binary                   | Yes / No                                |
| Music            | Binary                   | Yes / No                                |
| Volunteering     | Binary                   | Yes / No                                |
| GPA              | Numerical                | Continuous academic score               |
| GradeClass       | Ordinal (Target)         | Ordered performance levels              |


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


In [None]:
# there is no missing value in any column 

In [15]:
df.describe() #Statistical summary of numerical features

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
count,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0
mean,2196.5,16.468645,0.51087,0.877508,1.746237,9.771992,14.541388,0.301421,2.122074,0.383361,0.303512,0.196906,0.157191,1.906186,2.983696
std,690.655244,1.123798,0.499986,1.028476,1.000411,5.652774,8.467417,0.458971,1.122813,0.486307,0.45987,0.397744,0.364057,0.915156,1.233908
min,1001.0,15.0,0.0,0.0,0.0,0.001057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1598.75,15.0,0.0,0.0,1.0,5.043079,7.0,0.0,1.0,0.0,0.0,0.0,0.0,1.174803,2.0
50%,2196.5,16.0,1.0,0.0,2.0,9.705363,15.0,0.0,2.0,0.0,0.0,0.0,0.0,1.893393,4.0
75%,2794.25,17.0,1.0,2.0,2.0,14.40841,22.0,1.0,3.0,1.0,1.0,0.0,0.0,2.622216,4.0
max,3392.0,18.0,1.0,3.0,4.0,19.978094,29.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,4.0


In [29]:
df.columns = df.columns.str.lower()

In [30]:
df.columns

Index(['studentid', 'age', 'gender', 'ethnicity', 'parentaleducation',
       'studytimeweekly', 'absences', 'tutoring', 'parentalsupport',
       'extracurricular', 'sports', 'music', 'volunteering', 'gpa',
       'gradeclass'],
      dtype='object')

# Inspect categorical data 
## Why this metters 

1. Decide encoding strategy letter 
2. understand the diversity of categories
3. detect class imbalance 

In [43]:
cat_columns = ['gender','ethnicity','parentaleducation','gradeclass']
for col in cat_columns:
    print(df[col])
    print(df[col].value_counts())
    print()


0       1
1       0
2       0
3       1
4       1
       ..
2387    1
2388    0
2389    1
2390    1
2391    1
Name: gender, Length: 2392, dtype: int64
gender
1    1222
0    1170
Name: count, dtype: int64

0       0
1       0
2       2
3       0
4       0
       ..
2387    0
2388    0
2389    0
2390    1
2391    0
Name: ethnicity, Length: 2392, dtype: int64
ethnicity
0    1207
1     493
2     470
3     222
Name: count, dtype: int64

0       2
1       1
2       3
3       3
4       2
       ..
2387    3
2388    1
2389    2
2390    0
2391    2
Name: parentaleducation, Length: 2392, dtype: int64
parentaleducation
2    934
1    728
3    367
0    243
4    120
Name: count, dtype: int64

0       2.0
1       1.0
2       4.0
3       3.0
4       4.0
       ... 
2387    0.0
2388    4.0
2389    2.0
2390    1.0
2391    1.0
Name: gradeclass, Length: 2392, dtype: float64
gradeclass
4.0    1211
3.0     414
2.0     391
1.0     269
0.0     107
Name: count, dtype: int64



## Identify Target Variable & Input Features
# Target Variable
GradeClass

Reason:

1. Represents student performance category
2. Suitable for classification models

Input Features

All columns except:
1. StudentID (identifier)
2. GradeClass (target)

In [46]:
df.shape

(2392, 15)

# Discussion

1. dataset has sufficient rows and features
2. mix of numerical and categorical variables
3. suitable for
   1. classification models
   2. feature engineering
   3. educational performance analysis

# Observations 

1. dataset has no duplicate and null values
2. binary columns are well defined
3. ordinal features required label encoding
4. categorical features required onehotencoding
5. GPA distribution should be checked for outliers
6. Possible class imbalance in GradeClass