In [None]:
# To work with dataframes
import pandas as pd 

# To perform numerical operations
import numpy as np

# To visualize data
import seaborn as sns

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides diagnostics raised by pandas and scikit-learn;
# consider narrowing the filter to the specific categories that are noisy.
import warnings
warnings.simplefilter("ignore")

In [None]:
# Load the income dataset
# =============================================================================
# Cells whose text is " ?" (leading space included) are read as missing (NaN).
data = pd.read_csv(
    'income.csv',
    na_values=[" ?"],
)

In [None]:
# Display the loaded DataFrame as this cell's output
data

In [None]:
# =============================================================================
# Data pre-processing
# =============================================================================

# Number of missing (NaN) entries in each column
data.isna().sum()


### Points to note:
1. Missing values in JobType    = 1809
2. Missing values in occupation = 1816
3. There are 1809 rows in which two specific columns,
   occupation and JobType, are both missing.
4. (1816 - 1809) = 7 => occupation is still unfilled in the remaining
   7 rows, because their JobType is "Never worked".

In [None]:
# Rows that contain at least one missing value. axis="columns" (the same as
# axis=1) makes any() look across the columns of each row.
missing = data.loc[data.isna().any(axis="columns")]

In [None]:
# Display the rows that have at least one missing value
missing

In [None]:
# Keep only the complete rows: every row holding at least one NaN is dropped
data2 = data.dropna(axis="index", how="any")

In [None]:
# Display the frame produced by dropna(); showing `data` here would only repeat
# the untouched input and never reveal the effect of removing incomplete rows.
data2

In [None]:
# Re-encode the salary status labels as 0/1.
# The labels are always read from the untouched `data` frame (restricted to the
# rows kept in data2), so re-running this cell yields the same result instead of
# mapping the already-encoded 0/1 values to NaN. Building the column with
# assign() also avoids writing into the frame returned by dropna(), which makes
# pandas raise a SettingWithCopyWarning.
salstat_codes = {' less than or equal to 50,000': 0, ' greater than 50,000': 1}
data2 = data2.assign(SalStat=data.loc[data2.index, 'SalStat'].map(salstat_codes))

# Series.map() turns every label missing from the dictionary into NaN; fail
# here with a clear message rather than later inside the model fitting.
if data2['SalStat'].isna().any():
    raise ValueError("SalStat contains labels that are not covered by the 0/1 mapping")
print(data2['SalStat'])

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each variable so the indicator columns are not redundant.
# dtype=int keeps the indicators numeric: pandas >= 2.0 would otherwise create
# bool columns, and mixing those with numeric columns turns the array returned
# by `.values` into dtype=object.
new_data = pd.get_dummies(data2, drop_first=True, dtype=int)

In [None]:
# Names of all encoded columns as a plain Python list
columns_list = new_data.columns.tolist()
print(columns_list)

In [None]:
# Input feature names: every column except the target 'SalStat'.
# The list comprehension keeps the DataFrame's column order. A set difference
# would return the names in an arbitrary order that can change from one kernel
# session to the next (string hashing is randomized), so the column order of
# the feature matrix would not be reproducible.
features = [column for column in columns_list if column != 'SalStat']
print(features)

In [None]:
# Target vector: the SalStat column as a NumPy array
y = new_data['SalStat'].to_numpy()
print(y)

In [None]:
# Feature matrix: the selected input columns as a 2-D NumPy array
x = new_data.loc[:, features].to_numpy()
print(x)

In [None]:
# Utility that partitions arrays into train and test subsets
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
(train_x, test_x,
 train_y, test_y) = train_test_split(x, y, random_state=0, test_size=0.3)

In [None]:
# =============================================================================
# KNN
# =============================================================================
# K-nearest-neighbours classifier from scikit-learn
from sklearn.neighbors import KNeighborsClassifier

# Build a classifier that votes among the 5 nearest neighbours and train it on
# the training split; fit() returns the estimator itself, so both steps chain.
KNN_classifier = KNeighborsClassifier(n_neighbors=5).fit(train_x, train_y)

# Predicted class for every sample of the test split
prediction = KNN_classifier.predict(test_x)


In [61]:
# Importing performance metrics - accuracy score & confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the true test labels against the predicted labels
confusionMmatrix = confusion_matrix(test_y, prediction)
print(confusionMmatrix)

# Fraction of test samples that were classified correctly.
# The value gets its own name: binding it to `accuracy_score` would replace the
# imported function with a float, so any later call to accuracy_score() in the
# session would fail with "'float' object is not callable".
accuracy = accuracy_score(test_y, prediction)
print(accuracy)

print('Misclassified samples: %d' % (test_y != prediction).sum())

"""
Effect of K value on classifier
"""
# Number of misclassified test samples for each K.
# range(1, 20) covers K = 1 .. 19 (the end value 20 is excluded), so the list
# ends up with 19 entries, one per K in increasing order.
Misclassified_sample = []
for k in range(1, 20):
    # Train a fresh classifier with k neighbours on the training split
    knn = KNeighborsClassifier(n_neighbors=k).fit(train_x, train_y)
    # Boolean array marking the test samples whose prediction is wrong
    wrong_k = test_y != knn.predict(test_x)
    Misclassified_sample.append(wrong_k.sum())

print(Misclassified_sample)
# =============================================================================
# END OF SCRIPT
# =============================================================================

[1766, 1516, 1515, 1436, 1493, 1438, 1451, 1432, 1458, 1436, 1441, 1447, 1451, 1423, 1413, 1390, 1424, 1396, 1434]
