In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
data = pd.read_csv('performance.csv')

In [107]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 34 columns):
 #   Column                                                                      Non-Null Count  Dtype  
---  ------                                                                      --------------  -----  
 0   Unnamed: 0                                                                  1009 non-null   int64  
 1   Adm_Year                                                                    1009 non-null   float64
 2   Gender                                                                      1009 non-null   object 
 3   Age                                                                         1009 non-null   float64
 4   HSC_Pass_Year                                                               1009 non-null   float64
 5   Program                                                                     1009 non-null   object 
 6   Merit_Scholarship                               

In [108]:
#summary statistic numerical columns
data.describe()

Unnamed: 0.1,Unnamed: 0,Adm_Year,Age,HSC_Pass_Year,Curr_Sem1,Study_Hours1,Study_Sessions1,SM_Hours1,Avg_Attendance1,Skill_Dev_Hours1,Prev_SGPA1,Curr_CGPA1,Completed_Credits1,Family_Income1
count,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0,1009.0
mean,673.108028,2040.32111,21.368285,2019.251734,43.000991,3.334616,2.066898,3.439296,88.111001,2.224975,2.756482,3.211343,76.936571,63495.76
std,311.377223,629.677177,1.614943,1.346681,266.874155,2.096762,1.034492,2.439363,16.079094,1.473957,0.858012,0.731698,47.733885,79276.58
min,7.0,2013.0,18.0,2012.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2530.0
25%,410.0,2020.0,20.0,2019.0,3.0,2.0,1.0,2.0,80.0,1.0,2.11,2.88,24.0,30000.0
50%,685.0,2021.0,21.0,2020.0,8.0,3.0,2.0,3.0,95.0,2.0,2.77,3.39,85.0,50000.0
75%,941.0,2022.0,22.0,2020.0,10.0,4.0,2.0,4.0,100.0,3.0,3.48,3.71,122.0,77000.0
max,1193.0,22022.0,26.0,2028.0,2022.0,30.0,10.0,20.0,100.0,20.0,5.0,4.67,147.0,2000000.0


In [109]:
#first few rows
data.head()

Unnamed: 0.1,Unnamed: 0,Adm_Year,Gender,Age,HSC_Pass_Year,Program,Merit_Scholarship,Uni_Transport,Learn_Mode,Smartphone,...,SM_Hours1,Avg_Attendance1,Skills1,Skill_Dev_Hours1,Interest_Area1,Prev_SGPA1,Curr_CGPA1,Completed_Credits1,Diploma,Family_Income1
0,7,2021.0,Female,22.0,2019.0,BCSE,Yes,No,Offline,Yes,...,2.0,100.0,Web development skill(Frontend),1.0,Networking,3.8,3.64,35.0,False,32500.0
1,11,2021.0,Male,22.0,2019.0,BCSE,No,No,Offline,Yes,...,2.0,90.0,Programming,1.0,Data Science,3.4,3.53,35.0,False,20000.0
2,15,2021.0,Male,20.0,2020.0,BCSE,Yes,Yes,Offline,Yes,...,1.0,95.0,Programming,3.0,Machine Learning / Deep Learning,3.93,3.89,35.0,False,30000.0
3,18,2021.0,Male,21.0,2020.0,BCSE,Yes,No,Online,Yes,...,3.0,95.0,Programming,1.0,Artificial Intelligence,3.1,3.5,35.0,False,25000.0
4,20,2021.0,Female,21.0,2019.0,BCSE,Yes,No,Offline,Yes,...,2.0,96.0,Web development skill(Frontend),1.0,Web Development,3.81,3.65,34.0,False,30000.0


In [110]:
#check for missing values
data.isnull().sum()

Unnamed: 0                                                                    0
Adm_Year                                                                      0
Gender                                                                        0
Age                                                                           0
HSC_Pass_Year                                                                 0
Program                                                                       0
Merit_Scholarship                                                             0
Uni_Transport                                                                 0
Learn_Mode                                                                    0
Smartphone                                                                    0
PC                                                                            0
Eng_Proficiency                                                               0
Probation                               

In [111]:
#handle missing values
categorical_cols = data.select_dtypes(include=['object']).columns

for col in categorical_cols:
    data[col].fillna(data[col].mode()[0],inplace =True)

data.isnull().sum()



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0],inplace =True)


Unnamed: 0                                                                    0
Adm_Year                                                                      0
Gender                                                                        0
Age                                                                           0
HSC_Pass_Year                                                                 0
Program                                                                       0
Merit_Scholarship                                                             0
Uni_Transport                                                                 0
Learn_Mode                                                                    0
Smartphone                                                                    0
PC                                                                            0
Eng_Proficiency                                                               0
Probation                               

In [112]:
#target distribution
data['Target'].value_counts()

Target
Poor         503
Average      272
Good         178
Excellent     56
Name: count, dtype: int64

In [113]:
#encoding target varibale
label_encoder = LabelEncoder()
data['Target'] = label_encoder.fit_transform(data['Target'])

print(data['Target'])


0       2
1       2
2       2
3       0
4       2
       ..
1004    3
1005    3
1006    3
1007    3
1008    3
Name: Target, Length: 1009, dtype: int32


In [114]:
#Standardize numerical features
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns

standard_scaler = StandardScaler()
data[numerical_cols] = standard_scaler.fit_transform(data[numerical_cols])

print(numerical_cols)

Index(['Unnamed: 0', 'Adm_Year', 'Age', 'HSC_Pass_Year', 'Curr_Sem1',
       'Study_Hours1', 'Study_Sessions1', 'SM_Hours1', 'Avg_Attendance1',
       'Skill_Dev_Hours1', 'Prev_SGPA1', 'Curr_CGPA1', 'Completed_Credits1',
       'Family_Income1'],
      dtype='object')


In [None]:
#split data into 70% training and 30% testing

x = data.drop(columns=['Target']) 
y = data['Target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)