In [3]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt 

In [4]:
# Read train.csv from the working directory into a DataFrame
# and preview its first five rows.
dataset = pd.read_csv("train.csv")
dataset.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Count the missing values in each column of the dataset.
dataset.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [6]:
# Encode the categorical 'Sex' and 'Embarked' columns as integer codes.
# Each column gets its own LabelEncoder so that both fitted mappings
# (encoder.classes_) stay available, e.g. for inverse_transform; re-fitting a
# single shared encoder would discard the 'Sex' mapping.
# Note: 'Embarked' has more than two categories, so its codes are not binary.

sex_encoder = preprocessing.LabelEncoder()
encoded_sex = sex_encoder.fit_transform(dataset['Sex'])

embarked_encoder = preprocessing.LabelEncoder()
encoded_embarked = embarked_encoder.fit_transform(dataset['Embarked'])

# Backward compatibility: `label_encoder` still ends up fitted on 'Embarked'.
label_encoder = embarked_encoder

In [7]:
# Replace the original text columns with their integer codes.
# The encoded arrays are assigned directly (positionally); wrapping them in a
# DataFrame would trigger index alignment and silently produce NaN if
# `dataset` ever had a non-default index.
dataset['Sex'] = encoded_sex
dataset['Embarked'] = encoded_embarked

dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2


In [8]:
# Splitting the dataset into dependent (target) and independent (feature) columns.
# 'Pclass' is the target; the columns listed below are excluded from the features.
x = dataset.drop(
    columns=["Pclass", "Name", "Ticket", "Cabin", "PassengerId"],
)
y = dataset['Pclass']

x.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,1,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,0,1,35.0,0,0,8.05,2


In [9]:
# Hold out 30% of the rows as the test set; the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2,
)

In [10]:
# Preview the first two rows of the training features.
X_train.head(n=2)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked
396,0,1,46.0,0,0,26.0,2
451,0,1,30.0,0,0,27.75,0


In [11]:
# Preview the first two rows of the test features.
X_test.head(n=2)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Embarked
559,0,1,45.0,0,0,7.75,1
37,0,1,21.0,0,0,8.05,2


In [12]:
# k-nearest-neighbours implementation from scikit-learn.
from sklearn import neighbors

# Start with k = 3 neighbours.
knn = neighbors.KNeighborsClassifier(n_neighbors=3)

# Fit on the training split, then display the accuracy on the test split.
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.8614232209737828

In [13]:
# Predicted class labels for the test split from the fitted k=3 model.
y_pred = knn.predict(X=X_test)

In [14]:
# Fraction of test rows classified correctly (normalize defaults to True).
accuracy_score(y_test, y_pred)

0.8614232209737828

In [15]:
# Confusion matrix of the test-set predictions
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 58,   1,   1],
       [ 15,  44,   8],
       [  1,  11, 128]], dtype=int64)

In [16]:
# Per-class precision, recall and F1 for the test-set predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.78      0.97      0.87        60
           2       0.79      0.66      0.72        67
           3       0.93      0.91      0.92       140

    accuracy                           0.86       267
   macro avg       0.83      0.85      0.84       267
weighted avg       0.86      0.86      0.86       267



## Assignment - Finding accuracy score for different k values

In [17]:
# Sweep k over 1..266 and record the test-set accuracy for each value.
# accuracy_sc[j] holds the accuracy obtained with k = j + 1.
# NOTE(review): the upper bound 267 equals the number of test rows; k only has
# to stay within the number of training rows — confirm the intended range.
accuracy_sc = []

for i in range(1,267):
    knn_a = neighbors.KNeighborsClassifier(n_neighbors=i)
    knn_a.fit(X_train,y_train)
    # Loop-local name: reusing `y_pred` here would overwrite the k=3
    # predictions that the accuracy / confusion-matrix / report cells read.
    y_pred_k = knn_a.predict(X_test)
    # Compute the accuracy once and reuse it for both the log line and the list
    # (the previously discarded .score() call repeated the prediction work).
    ascore = accuracy_score(y_test,y_pred_k,normalize=True)
    print('Kvalue = ',i ,'Score = ',ascore)
    accuracy_sc.append(ascore)

Kvalue =  1 Score =  0.8764044943820225
Kvalue =  2 Score =  0.850187265917603
Kvalue =  3 Score =  0.8614232209737828
Kvalue =  4 Score =  0.8389513108614233
Kvalue =  5 Score =  0.850187265917603
Kvalue =  6 Score =  0.8239700374531835
Kvalue =  7 Score =  0.8389513108614233
Kvalue =  8 Score =  0.8314606741573034
Kvalue =  9 Score =  0.8277153558052435
Kvalue =  10 Score =  0.8277153558052435
Kvalue =  11 Score =  0.8164794007490637
Kvalue =  12 Score =  0.8202247191011236
Kvalue =  13 Score =  0.8164794007490637
Kvalue =  14 Score =  0.8089887640449438
Kvalue =  15 Score =  0.7940074906367042
Kvalue =  16 Score =  0.8052434456928839
Kvalue =  17 Score =  0.8014981273408239
Kvalue =  18 Score =  0.797752808988764
Kvalue =  19 Score =  0.7790262172284644
Kvalue =  20 Score =  0.7902621722846442
Kvalue =  21 Score =  0.7865168539325843
Kvalue =  22 Score =  0.7902621722846442
Kvalue =  23 Score =  0.7827715355805244
Kvalue =  24 Score =  0.7827715355805244
Kvalue =  25 Score =  0.7790

Kvalue =  199 Score =  0.651685393258427
Kvalue =  200 Score =  0.651685393258427
Kvalue =  201 Score =  0.651685393258427
Kvalue =  202 Score =  0.651685393258427
Kvalue =  203 Score =  0.651685393258427
Kvalue =  204 Score =  0.651685393258427
Kvalue =  205 Score =  0.651685393258427
Kvalue =  206 Score =  0.651685393258427
Kvalue =  207 Score =  0.651685393258427
Kvalue =  208 Score =  0.651685393258427
Kvalue =  209 Score =  0.6554307116104869
Kvalue =  210 Score =  0.6554307116104869
Kvalue =  211 Score =  0.6554307116104869
Kvalue =  212 Score =  0.6554307116104869
Kvalue =  213 Score =  0.6554307116104869
Kvalue =  214 Score =  0.6554307116104869
Kvalue =  215 Score =  0.651685393258427
Kvalue =  216 Score =  0.6554307116104869
Kvalue =  217 Score =  0.6554307116104869
Kvalue =  218 Score =  0.6554307116104869
Kvalue =  219 Score =  0.651685393258427
Kvalue =  220 Score =  0.651685393258427
Kvalue =  221 Score =  0.651685393258427
Kvalue =  222 Score =  0.651685393258427
Kvalue 