# LOADING DATA

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('mod1_encoded_sem1.csv')

In [3]:
df = df.drop('DEPENDENTS',axis=1)
df = df.drop('YEAR',axis=1)
df = df.drop('REGION',axis=1)
df = df.drop('SCHOOL',axis=1)
df = df.drop('LEARNING STYLE',axis=1)
df = df.drop('GENDER',axis=1)

In [4]:
df.head()

Unnamed: 0,PROGRAM,RACE,RELIGION,STATE,INCOME,SPM ADDMATH,SPM PHY,SPM CHEM,SPM BIO,SEM1 CGPA
0,1,3,2,1,6,6,5,5,3,1
1,1,3,2,1,1,5,4,6,5,1
2,1,3,2,1,7,7,2,4,5,1
3,1,3,2,8,2,5,3,6,4,2
4,1,3,2,1,3,3,4,6,5,1


In [5]:
df.shape

(1241, 10)

In [6]:
X = df.drop('SEM1 CGPA',axis=1)
y = df['SEM1 CGPA']

# STANDARDIZING THE VARIABLE

Because the KNN classifier predicts the class of a given test observation by identifying the observations that are nearest to it, the scale of the variables matters. Any variables that are on a large scale will have a much larger effect on the distance between the observations, and hence on the KNN classifier, than variables that are on a small scale.

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
scaler = StandardScaler()
scaler.fit(df.drop('SEM1 CGPA',axis=1))

StandardScaler()

In [9]:
scaled_features = scaler.transform(df.drop('SEM1 CGPA',axis=1))

In [10]:
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_feat.head()

Unnamed: 0,PROGRAM,RACE,RELIGION,STATE,INCOME,SPM ADDMATH,SPM PHY,SPM CHEM,SPM BIO
0,0.987985,0.380774,0.268071,-0.48464,1.337215,0.835125,1.609341,0.862124,0.338659
1,0.987985,0.380774,0.268071,-0.48464,-0.841157,0.403116,0.929098,1.355846,1.664452
2,0.987985,0.380774,0.268071,-0.48464,1.77289,1.267134,-0.431387,0.368402,1.664452
3,0.987985,0.380774,0.268071,1.660072,-0.405483,0.403116,0.248856,1.355846,1.001556
4,0.987985,0.380774,0.268071,-0.48464,0.030192,-0.460903,0.929098,1.355846,1.664452


# HOLD OUT VALIDATION (TRAIN TEST SPLIT)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train,X_test,y_train,y_test = train_test_split(df_feat,df['SEM1 CGPA'],test_size=0.2,random_state=0)

In [13]:
X_train.head()

Unnamed: 0,PROGRAM,RACE,RELIGION,STATE,INCOME,SPM ADDMATH,SPM PHY,SPM CHEM,SPM BIO
552,0.987985,0.380774,0.268071,-0.48464,-0.405483,-0.028893,1.609341,1.355846,1.664452
1116,-1.012161,0.380774,0.268071,-0.48464,-0.841157,-0.460903,0.929098,-0.12532,-0.324237
492,0.987985,0.380774,0.268071,-0.48464,-1.276832,0.835125,0.248856,0.862124,1.001556
947,-1.012161,0.380774,0.268071,-0.48464,-1.276832,0.403116,0.248856,0.368402,-0.324237
1162,-1.012161,0.380774,0.268071,-0.48464,-0.405483,-0.028893,0.929098,0.368402,0.338659


In [14]:
y_train

552     1
1116    2
492     0
947     2
1162    1
       ..
763     3
835     1
1216    0
559     1
684     1
Name: SEM1 CGPA, Length: 992, dtype: int64

# MODEL FITTING

In [15]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import  confusion_matrix
from sklearn.metrics import classification_report

import warnings # to hide unnecesary warning
warnings.filterwarnings('ignore')

In [16]:
svc = SVC()
svc.fit(X_train,y_train)

SVC()

In [17]:
y_pred = svc.predict(X_test)
accuracy_score(y_test,y_pred)

0.5060240963855421

# EVALUATION 1

In [18]:
print(confusion_matrix(y_test,y_pred))

[[76 44  3  0]
 [30 48  0  0]
 [ 8 33  2  0]
 [ 2  3  0  0]]


In [19]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.66      0.62      0.64       123
           1       0.38      0.62      0.47        78
           2       0.40      0.05      0.08        43
           3       0.00      0.00      0.00         5

    accuracy                           0.51       249
   macro avg       0.36      0.32      0.30       249
weighted avg       0.51      0.51      0.47       249



# HYPERPARAMETER TUNING (GRID SEARCH CV)

In [20]:
from sklearn.model_selection import GridSearchCV

In [None]:
params = [{'C':[0.001,0.01,0.1,1,10,100],'kernel':['linear','poly','rbf','sigmoid'],
           'gamma':[0.001,0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,10,100]}]
           
grid_search = GridSearchCV(estimator=svc,param_grid=params,scoring='accuracy',cv=10,n_jobs=-1)

grid_search = grid_search.fit(X_train,y_train)

In [None]:
print(grid_search.best_params_)

# EVALUATION 2

In [None]:
svc = SVC(kernel='sigmoid',C=10, gamma=0.01)
svc.fit(X_train,y_train)

In [None]:
y_pred = svc.predict(X_test)
accuracy_score(y_test,y_pred)

In [None]:
print(confusion_matrix(y_test,y_pred))

In [None]:
print(classification_report(y_test,y_pred))