In [4]:
#@title Import Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer

In [5]:
#@title Load Dataset
# Read the raw CSV into a DataFrame; the path is relative to the working directory.
dataset_path = "dataset.csv"
data = pd.read_csv(dataset_path)

In [6]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Gender,Scholarship holder,Age at enrollment,International,Unemployment rate,Inflation rate,GDP,Target,Unnamed: 25,Unnamed: 26
0,1.0,17.0,5.0,171.0,1.0,1.0,122.0,1.0,19.0,12.0,...,1.0,0.0,,0.0,10.8,1.4,1.74,Dropout,,
1,1.0,15.0,1.0,9254.0,1.0,1.0,160.0,1.0,1.0,3.0,...,1.0,0.0,19.0,0.0,13.9,-0.3,0.79,Graduate,,
2,1.0,,5.0,9070.0,1.0,1.0,122.0,,37.0,37.0,...,1.0,,19.0,0.0,10.8,1.4,1.74,Dropout,,
3,1.0,17.0,2.0,9773.0,1.0,1.0,122.0,1.0,38.0,37.0,...,,0.0,20.0,0.0,9.4,-0.8,-3.12,Graduate,,
4,2.0,39.0,1.0,8014.0,0.0,1.0,100.0,1.0,37.0,,...,0.0,0.0,,0.0,13.9,-0.3,0.79,Graduate,,


In [7]:
# Summarize each column's dtype and non-null count to gauge how much data is missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Marital status                  4001 non-null   float64
 1   Application mode                3941 non-null   float64
 2   Application order               3998 non-null   float64
 3   Course                          3959 non-null   float64
 4   Daytime/evening attendance	     3984 non-null   float64
 5   Previous qualification          3990 non-null   float64
 6   Previous qualification (grade)  3952 non-null   float64
 7   Nacionality                     3978 non-null   float64
 8   Mother's qualification          4010 non-null   float64
 9   Father's qualification          3974 non-null   float64
 10  Mother's occupation             3988 non-null   float64
 11  Father's occupation             3999 non-null   float64
 12  Admission grade                 39

In [8]:
# Remove the two trailing "Unnamed" columns, which hold only NaN in the preview above.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 25', 'Unnamed: 26'], errors='ignore')

In [9]:
# Count the missing values in every remaining column.
data.isna().sum()

Marital status                    423
Application mode                  483
Application order                 426
Course                            465
Daytime/evening attendance\t      440
Previous qualification            434
Previous qualification (grade)    472
Nacionality                       446
Mother's qualification            414
Father's qualification            450
Mother's occupation               436
Father's occupation               425
Admission grade                   462
Displaced                         442
Educational special needs         448
Debtor                            452
Tuition fees up to date           426
Gender                            437
Scholarship holder                450
Age at enrollment                 444
International                     428
Unemployment rate                 429
Inflation rate                    422
GDP                               456
Target                            453
dtype: int64

In [10]:
#@title Remove Null Targets Column
# Despite the title, this removes ROWS (not a column): every record whose
# 'Target' label is missing is discarded, since the label cannot be imputed.
target_col = 'Target'
data = data.dropna(axis=0, subset=[target_col])

In [11]:
#@title Numerical Feature List and imputation

# Columns treated as numeric; these are the ones filled with the column median
# in the imputation cell. The order is kept fixed because it determines the
# column order of the previews and of the imputer's input.
num_feats = [
    # student-level measurements
    'Admission grade', 'Age at enrollment',
    # macro-economic indicators
    'Unemployment rate', 'Inflation rate', 'GDP',
    # prior-education grade and application preference order
    'Previous qualification (grade)', 'Application order',
]

In [12]:
#@title Before Imputation
# Show the numeric columns while they still contain NaN, for comparison afterwards.
data[num_feats].head(n=5)

Unnamed: 0,Admission grade,Age at enrollment,Unemployment rate,Inflation rate,GDP,Previous qualification (grade),Application order
0,127.3,,10.8,1.4,1.74,122.0,5.0
1,142.5,19.0,13.9,-0.3,0.79,160.0,1.0
2,124.8,19.0,10.8,1.4,1.74,122.0,5.0
3,119.6,20.0,9.4,-0.8,-3.12,122.0,2.0
4,141.5,,13.9,-0.3,0.79,100.0,1.0


In [13]:
#@title Imputation
# Fill missing numeric values with each column's median.
# NOTE(review): the imputer is fitted on every row of `data`. If a train/test
# split is performed later in the notebook, these medians will include test
# rows; consider fitting on the training split only - TODO confirm.
num_imp = SimpleImputer(strategy='median')
num_values_imputed = num_imp.fit_transform(data[num_feats])
data[num_feats] = num_values_imputed

In [14]:
#@title After Imputation
# Same preview as before imputation; the NaN entries should now be filled.
data[num_feats].head(n=5)

Unnamed: 0,Admission grade,Age at enrollment,Unemployment rate,Inflation rate,GDP,Previous qualification (grade),Application order
0,127.3,20.0,10.8,1.4,1.74,122.0,5.0
1,142.5,19.0,13.9,-0.3,0.79,160.0,1.0
2,124.8,19.0,10.8,1.4,1.74,122.0,5.0
3,119.6,20.0,9.4,-0.8,-3.12,122.0,2.0
4,141.5,20.0,13.9,-0.3,0.79,100.0,1.0


In [15]:
#@title Categorical Features
# Everything that is neither a numeric feature nor the label is treated as
# categorical. The exclusion set is built once; the frame's column order is kept.
# NOTE(review): 'Daytime/evening attendance\t' carries a trailing tab from the
# CSV header, so any later reference must include it verbatim.
non_categorical = set(num_feats) | {'Target'}
cat_feats = [name for name in data.columns if name not in non_categorical]
cat_feats

['Marital status',
 'Application mode',
 'Course',
 'Daytime/evening attendance\t',
 'Previous qualification',
 'Nacionality',
 "Mother's qualification",
 "Father's qualification",
 "Mother's occupation",
 "Father's occupation",
 'Displaced',
 'Educational special needs',
 'Debtor',
 'Tuition fees up to date',
 'Gender',
 'Scholarship holder',
 'International']

In [16]:
#@title Before
# Categorical columns prior to imputation (NaN entries still present).
data[cat_feats].head(n=5)

Unnamed: 0,Marital status,Application mode,Course,Daytime/evening attendance\t,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,International
0,1.0,17.0,171.0,1.0,1.0,1.0,19.0,12.0,5.0,9.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1.0,15.0,9254.0,1.0,1.0,1.0,1.0,3.0,3.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,,9070.0,1.0,1.0,,37.0,37.0,,9.0,1.0,0.0,0.0,0.0,1.0,,0.0
3,1.0,17.0,9773.0,1.0,1.0,1.0,38.0,37.0,5.0,,1.0,,0.0,1.0,,0.0,0.0
4,2.0,39.0,8014.0,0.0,1.0,1.0,37.0,,9.0,9.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [17]:
#@title Impute missing values with the most frequent category
# Replace missing categorical codes with each column's most frequent value.
# NOTE(review): fitted on every row of `data`; if a train/test split follows
# later, fit on the training split only to avoid leakage - TODO confirm.
cat_imp = SimpleImputer(strategy='most_frequent')
cat_values_imputed = cat_imp.fit_transform(data[cat_feats])
data[cat_feats] = cat_values_imputed

In [18]:
#@title After
# Categorical columns after imputation; compare with the "Before" preview.
data[cat_feats].head(n=5)

Unnamed: 0,Marital status,Application mode,Course,Daytime/evening attendance\t,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,International
0,1.0,17.0,171.0,1.0,1.0,1.0,19.0,12.0,5.0,9.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1.0,15.0,9254.0,1.0,1.0,1.0,1.0,3.0,3.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,1.0,9070.0,1.0,1.0,1.0,37.0,37.0,9.0,9.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,17.0,9773.0,1.0,1.0,1.0,38.0,37.0,5.0,9.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,2.0,39.0,8014.0,0.0,1.0,1.0,37.0,37.0,9.0,9.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [19]:
# Final check: every column should now report zero missing values.
data.isna().sum()

Marital status                    0
Application mode                  0
Application order                 0
Course                            0
Daytime/evening attendance\t      0
Previous qualification            0
Previous qualification (grade)    0
Nacionality                       0
Mother's qualification            0
Father's qualification            0
Mother's occupation               0
Father's occupation               0
Admission grade                   0
Displaced                         0
Educational special needs         0
Debtor                            0
Tuition fees up to date           0
Gender                            0
Scholarship holder                0
Age at enrollment                 0
International                     0
Unemployment rate                 0
Inflation rate                    0
GDP                               0
Target                            0
dtype: int64