# Step 1 : Required Import Libraries

In [1]:
import pandas as pd
import numpy as np

# Step 2 : Create a Student Performance Dataset

In [2]:
data = {
    'StudentID':[1,2,3,4,5],
    'Age':[18,19,np.nan,18, 19],
    'Gender':['Male', 'Female', 'Male', 'Female', 'Male'],
    'Grade Level':[12,11,13,10,11],
    'Test Score':[85,92,78,88,np.nan],
    'Absent Day':[2, np.nan,3,0,2],
    'Study Hours':[4,5,3,6,4],
    'GPA':[3.7, 3.9, 3.2,np.nan,3.8]
}

type(data)

dict

In [3]:
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,StudentID,Age,Gender,Grade Level,Test Score,Absent Day,Study Hours,GPA
0,1,18.0,Male,12,85.0,2.0,4,3.7
1,2,19.0,Female,11,92.0,,5,3.9
2,3,,Male,13,78.0,3.0,3,3.2
3,4,18.0,Female,10,88.0,0.0,6,
4,5,19.0,Male,11,,2.0,4,3.8


# Step 3 : Data Preprocessing

In [4]:
df.shape

(5, 8)

In [5]:
df.dtypes

StudentID        int64
Age            float64
Gender          object
Grade Level      int64
Test Score     float64
Absent Day     float64
Study Hours      int64
GPA            float64
dtype: object

In [6]:
df.isnull().sum()

StudentID      0
Age            1
Gender         0
Grade Level    0
Test Score     1
Absent Day     1
Study Hours    0
GPA            1
dtype: int64

In [7]:
df['Age'] = df['Age'].interpolate()
df['Test Score'] = df['Test Score'].interpolate()
df['GPA'] = df['GPA'].interpolate()
df

Unnamed: 0,StudentID,Age,Gender,Grade Level,Test Score,Absent Day,Study Hours,GPA
0,1,18.0,Male,12,85.0,2.0,4,3.7
1,2,19.0,Female,11,92.0,,5,3.9
2,3,18.5,Male,13,78.0,3.0,3,3.2
3,4,18.0,Female,10,88.0,0.0,6,3.5
4,5,19.0,Male,11,88.0,2.0,4,3.8


In [8]:
df['Absent Day'] = df['Absent Day'].fillna(method='ffill')
df

Unnamed: 0,StudentID,Age,Gender,Grade Level,Test Score,Absent Day,Study Hours,GPA
0,1,18.0,Male,12,85.0,2.0,4,3.7
1,2,19.0,Female,11,92.0,2.0,5,3.9
2,3,18.5,Male,13,78.0,3.0,3,3.2
3,4,18.0,Female,10,88.0,0.0,6,3.5
4,5,19.0,Male,11,88.0,2.0,4,3.8


In [9]:
cols = ['Age', 'Test Score', 'Absent Day', 'GPA']

for i in cols:
    df[i] = df[i].astype('int64')
    
df.dtypes

StudentID       int64
Age             int64
Gender         object
Grade Level     int64
Test Score      int64
Absent Day      int64
Study Hours     int64
GPA             int64
dtype: object

# Step 4 : Converting Categorical variables into Quantitative variables

In [10]:
quant_data = pd.get_dummies(df.Gender, prefix='Gender')
quant_data

Unnamed: 0,Gender_Female,Gender_Male
0,0,1
1,1,0
2,0,1
3,1,0
4,0,1


In [11]:
df = df.join(quant_data)
df.drop(['Gender'], axis=1, inplace=True)
df

Unnamed: 0,StudentID,Age,Grade Level,Test Score,Absent Day,Study Hours,GPA,Gender_Female,Gender_Male
0,1,18,12,85,2,4,3,0,1
1,2,19,11,92,2,5,3,1,0
2,3,18,13,78,3,3,3,0,1
3,4,18,10,88,0,6,3,1,0
4,5,19,11,88,2,4,3,0,1


In [12]:
df['Gender_Female'] = df['Gender_Female'].astype('int64')
df['Gender_Male'] = df['Gender_Male'].astype('int64')
df

Unnamed: 0,StudentID,Age,Grade Level,Test Score,Absent Day,Study Hours,GPA,Gender_Female,Gender_Male
0,1,18,12,85,2,4,3,0,1
1,2,19,11,92,2,5,3,1,0
2,3,18,13,78,3,3,3,0,1
3,4,18,10,88,0,6,3,1,0
4,5,19,11,88,2,4,3,0,1


In [13]:
df.dtypes

StudentID        int64
Age              int64
Grade Level      int64
Test Score       int64
Absent Day       int64
Study Hours      int64
GPA              int64
Gender_Female    int64
Gender_Male      int64
dtype: object

In [14]:
df.isnull().sum()

StudentID        0
Age              0
Grade Level      0
Test Score       0
Absent Day       0
Study Hours      0
GPA              0
Gender_Female    0
Gender_Male      0
dtype: int64