# LOADING DATA

In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
df = pd.read_csv('mod1_encoded_sem2.csv')

In [26]:
df = df.drop('DEPENDENTS',axis=1)
df = df.drop('YEAR',axis=1)
df = df.drop('REGION',axis=1)
df = df.drop('SCHOOL',axis=1)
df = df.drop('LEARNING STYLE',axis=1)
df = df.drop('GENDER',axis=1)

In [27]:
df.head()

Unnamed: 0,PROGRAM,RACE,RELIGION,STATE,INCOME,SPM ADDMATH,SPM PHY,SPM CHEM,SPM BIO,SEM1 CGPA,SEM1 MATH,SEM1 PHY,SEM1 CHEM,SEM1 BIO,SEM2 CGPA
0,1,3,2,1,6,6,5,5,3,1,2,1,1,2,1
1,1,3,2,1,1,5,4,6,5,1,1,4,2,5,1
2,1,3,2,1,7,7,2,4,5,1,4,1,3,3,1
3,1,3,2,8,2,5,3,6,4,2,3,5,4,1,2
4,1,3,2,1,3,3,4,6,5,1,1,5,3,2,1


In [28]:
df.shape

(1241, 15)

In [29]:
X = df.drop('SEM2 CGPA',axis=1)
y = df['SEM2 CGPA']

# STANDARDIZING THE VARIABLE

Because the KNN classifier predicts the class of a given test observation by identifying the observations that are nearest to it, the scale of the variables matters. Any variables that are on a large scale will have a much larger effect on the distance between the observations, and hence on the KNN classifier, than variables that are on a small scale.

In [30]:
from sklearn.preprocessing import StandardScaler

In [31]:
scaler = StandardScaler()
scaler.fit(df.drop('SEM2 CGPA',axis=1))

StandardScaler()

In [32]:
scaled_features = scaler.transform(df.drop('SEM2 CGPA',axis=1))

In [33]:
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_feat.head()

Unnamed: 0,PROGRAM,RACE,RELIGION,STATE,INCOME,SPM ADDMATH,SPM PHY,SPM CHEM,SPM BIO,SEM1 CGPA,SEM1 MATH,SEM1 PHY,SEM1 CHEM,SEM1 BIO
0,0.987985,0.380774,0.268071,-0.48464,1.337215,0.835125,1.609341,0.862124,0.338659,0.340897,0.065149,-0.532248,-0.428638,0.232454
1,0.987985,0.380774,0.268071,-0.48464,-0.841157,0.403116,0.929098,1.355846,1.664452,0.340897,-0.581651,1.317948,0.177907,2.217381
2,0.987985,0.380774,0.268071,-0.48464,1.77289,1.267134,-0.431387,0.368402,1.664452,0.340897,1.35875,-0.532248,0.784451,0.894097
3,0.987985,0.380774,0.268071,1.660072,-0.405483,0.403116,0.248856,1.355846,1.001556,1.596246,0.71195,1.93468,1.390996,-0.429188
4,0.987985,0.380774,0.268071,-0.48464,0.030192,-0.460903,0.929098,1.355846,1.664452,0.340897,-0.581651,1.93468,0.784451,0.232454


# HOLD OUT VALIDATION (TRAIN TEST SPLIT)

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
X_train,X_test,y_train,y_test = train_test_split(df_feat,df['SEM2 CGPA'],test_size=0.2,random_state=0)

In [36]:
X_train.head()

Unnamed: 0,PROGRAM,RACE,RELIGION,STATE,INCOME,SPM ADDMATH,SPM PHY,SPM CHEM,SPM BIO,SEM1 CGPA,SEM1 MATH,SEM1 PHY,SEM1 CHEM,SEM1 BIO
552,0.987985,0.380774,0.268071,-0.48464,-0.405483,-0.028893,1.609341,1.355846,1.664452,0.340897,-0.581651,0.084484,0.784451,1.555739
1116,-1.012161,0.380774,0.268071,-0.48464,-0.841157,-0.460903,0.929098,-0.12532,-0.324237,1.596246,1.35875,1.93468,1.390996,0.894097
492,0.987985,0.380774,0.268071,-0.48464,-1.276832,0.835125,0.248856,0.862124,1.001556,-0.914452,-0.581651,-1.14898,-1.035182,0.232454
947,-1.012161,0.380774,0.268071,-0.48464,-1.276832,0.403116,0.248856,0.368402,-0.324237,1.596246,2.652351,1.317948,2.604084,0.232454
1162,-1.012161,0.380774,0.268071,-0.48464,-0.405483,-0.028893,0.929098,0.368402,0.338659,0.340897,0.71195,0.701216,0.177907,0.894097


In [37]:
y_train

552     1
1116    2
492     0
947     2
1162    0
       ..
763     1
835     1
1216    0
559     0
684     0
Name: SEM2 CGPA, Length: 992, dtype: int64

# MODEL FITTING

In [38]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import  confusion_matrix
from sklearn.metrics import classification_report

import warnings # to hide unnecesary warning
warnings.filterwarnings('ignore')

In [39]:
svc = SVC()
svc.fit(X_train,y_train)

SVC()

In [40]:
y_pred = svc.predict(X_test)
accuracy_score(y_test,y_pred)

0.8554216867469879

# EVALUATION 1

In [41]:
print(confusion_matrix(y_test,y_pred))

[[145  11   0   0]
 [ 15  59   3   0]
 [  0   6   9   0]
 [  0   0   1   0]]


In [42]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       156
           1       0.78      0.77      0.77        77
           2       0.69      0.60      0.64        15
           3       0.00      0.00      0.00         1

    accuracy                           0.86       249
   macro avg       0.59      0.57      0.58       249
weighted avg       0.85      0.86      0.85       249



# HYPERPARAMETER TUNING (GRID SEARCH CV)

In [50]:
from sklearn.model_selection import GridSearchCV

In [51]:
params = [{'C':[0.001,0.01,0.1,1,10,100],'kernel':['linear']},
          {'C':[0.001,0.01,0.1,1,10,100],'kernel':['poly','rbf','sigmoid'],
           'gamma':[0.001,0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,10,100]}]
           
grid_search = GridSearchCV(estimator=svc,param_grid=params,scoring='accuracy',cv=10,n_jobs=-1)

grid_search = grid_search.fit(X_train,y_train)

In [52]:
print(grid_search.best_params_)

{'C': 10, 'gamma': 0.01, 'kernel': 'sigmoid'}


# EVALUATION 2

In [53]:
svc = SVC(kernel='sigmoid',C=10, gamma=0.01)
svc.fit(X_train,y_train)

SVC(C=10, gamma=0.01, kernel='sigmoid')

In [54]:
y_pred = svc.predict(X_test)
accuracy_score(y_test,y_pred)

0.8755020080321285

In [57]:
print('SVM WITH C=10, gamma=0.01,kernel=sigmoid (TARGET : SEM2 CGPA)')
print('\n')
print(confusion_matrix(y_test,y_pred))
print('\n')
print(classification_report(y_test,y_pred))

SVM WITH C=10, gamma=0.01,kernel=sigmoid (TARGET : SEM2 CGPA)


[[150   6   0   0]
 [ 14  59   4   0]
 [  0   6   9   0]
 [  0   0   1   0]]


              precision    recall  f1-score   support

           0       0.91      0.96      0.94       156
           1       0.83      0.77      0.80        77
           2       0.64      0.60      0.62        15
           3       0.00      0.00      0.00         1

    accuracy                           0.88       249
   macro avg       0.60      0.58      0.59       249
weighted avg       0.87      0.88      0.87       249

