# Model 1: kNN

## Import the libraries and clean the dataset

In [1]:
### Importing the libraries
import numpy as np
import pandas as pd


#plotting lib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


#Sklearn Lib metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, KFold

# Pipelines : 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config


from sklearn.neighbors import KNeighborsClassifier


#Missing values : 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
import missingno as msno

#Dummy
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn. preprocessing import StandardScaler


# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [2]:
# Read the training and test splits from the local Data/ folder.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("Data/train.csv", "Data/test.csv")
)

# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,id,occ_code_level2,age,stock_dividends,mig_chg_msa,tax_filer_stat,det_hh_summ,mig_prev_sunbelt,hisp_origin,education,wage_per_hour,capital_losses,vet_question,own_or_self,country_self,mig_move_reg,high_income,hs_college,class_worker,mig_same,unemp_reason,state_prev_res,ind_code_level2,race,country_mother,capital_gains,sex,ind_code_level1,citizenship,union_member,fam_under_18,marital_stat,region_prev_res,mig_chg_reg,country_father,occ_code_level1,full_or_part_emp,weeks_worked,det_hh_fam_stat,num_emp,vet_benefits
0,1,0,42.0,0.0,,Nonfiler,Householder,,All other,11th grade,0.0,,Not in universe,0,United-States,,0,Not in universe,Not in universe,Not in universe under 1 year old,Not in universe,Not in universe,0,Black,United-States,0.0,Female,Not in universe or children,Native- Born in the United States,Not in universe,Not in universe,,Not in universe,,United-States,Not in universe,Not in labor force,0.0,Householder,0,2
1,2,18,56.0,,,,Householder,,All other,High school graduate,0.0,,Not in universe,2,United-States,,1,Not in universe,Self-employed-incorporated,Not in universe under 1 year old,,Not in universe,32,,United-States,,Male,Wholesale trade,Native- Born in the United States,Not in universe,Not in universe,Married-civilian spouse present,Not in universe,,United-States,Sales,Full-time schedules,,Householder,1,2
2,3,26,26.0,,,Joint both under 65,Householder,,All other,High school graduate,0.0,,Not in universe,0,Haiti,,0,Not in universe,Private,Not in universe under 1 year old,,Not in universe,41,,Haiti,,,Hospital services,Foreign born- Not a citizen of U S,Not in universe,Not in universe,,Not in universe,,Haiti,Adm support including clerical,Full-time schedules,,Householder,3,2
3,4,0,67.0,,MSA to MSA,Joint one under 65 & one 65+,Householder,No,All other,,0.0,,No,0,United-States,Same county,0,Not in universe,Not in universe,No,,North Carolina,0,Black,United-States,0.0,,Not in universe or children,Native- Born in the United States,Not in universe,Not in universe,,Northeast,Same county,United-States,Not in universe,Children or Armed Forces,0.0,Householder,0,1
4,5,0,,,Nonmover,Nonfiler,Child under 18 never married,Not in universe,All other,Children,0.0,0.0,Not in universe,0,United-States,Nonmover,0,Not in universe,Not in universe,Yes,Not in universe,Not in universe,0,White,United-States,0.0,Female,Not in universe or children,Native- Born in the United States,Not in universe,Mother only present,,Not in universe,Nonmover,United-States,Not in universe,Children or Armed Forces,,Child <18 never marr not in subfamily,0,0


In [3]:
# Remove the row identifier from the training frame.
# The frame is rebound instead of mutated in place, and errors="ignore" makes
# the cell safe to re-run: a second execution is a no-op rather than a
# KeyError because "id" is already gone.
# NOTE(review): test_df keeps its "id" column — presumably on purpose; confirm.
train_df = train_df.drop(columns="id", errors="ignore")

In [4]:
# Cast every object-dtype column to pandas' "category" dtype,
# first in the training frame and then in the test frame.
for frame in (train_df, test_df):
    object_columns = [col for col in frame.columns if frame[col].dtypes == "object"]
    for col in object_columns:
        frame[col] = frame[col].astype("category")

In [5]:
# Separate the target column from the predictor columns.
target_column = "high_income"
y = train_df[target_column]
X = train_df.drop(columns=target_column)

In [6]:
# Partition the predictor columns by dtype in a single pass:
# "category" columns go to categorical_features, every other column
# goes to numerical_features. Column order is preserved in both lists.
categorical_features, numerical_features = [], []
for column_name, column_dtype in X.dtypes.items():
    if column_dtype.name == "category":
        categorical_features.append(column_name)
    else:
        numerical_features.append(column_name)

## Grid search over the number of neighbours

In [7]:
# Numeric columns: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer_num", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# OneHotEncoder's dense-output switch was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4.
# Pick whichever keyword the installed version exposes so this cell runs on
# both old and new releases.
dense_output_kwarg = (
    "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"
)

# Categorical columns: replace missing values with the literal level "Missing",
# then one-hot encode. Levels unseen during fit are ignored at transform time
# (handle_unknown="ignore") instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer_cat", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", **{dense_output_kwarg: False})),
    ]
)

# Route each group of columns through its own preprocessing branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full prediction pipeline: preprocessing followed by the kNN classifier.
# The step name "classifier" is what hyper-parameter names such as
# "classifier__n_neighbors" refer to.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", KNeighborsClassifier())]
)

# Render estimators as an interactive diagram, then display the pipeline.
set_config(display="diagram")
clf

In [8]:
# Candidate neighbourhood sizes: the odd values 1, 3, ..., 49.
hyper_param = {"classifier__n_neighbors": np.arange(1, 50, 2)}

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=1)

# Exhaustive search over the grid, scored by accuracy, using all CPU cores.
Knn_pipe_cv = GridSearchCV(
    estimator=clf,
    param_grid=hyper_param,
    scoring="accuracy",
    cv=cv_splitter,
    n_jobs=-1,
    verbose=1,
)

In [9]:
# Run the grid search: every candidate is fitted and scored on every fold.
Knn_pipe_cv.fit(X=X, y=y)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [10]:
# Tabulate the per-candidate cross-validation results (timings, per-fold
# scores, mean/std of the test score and rank) and display the table.
cv_res = pd.DataFrame.from_dict(Knn_pipe_cv.cv_results_)
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.955365,0.036417,58.538998,1.612102,1,{'classifier__n_neighbors': 1},0.793902,0.797656,0.79837,0.794707,0.79196,0.795319,0.002385,25
1,0.824368,0.158401,55.331993,8.339438,3,{'classifier__n_neighbors': 3},0.81789,0.819355,0.819797,0.818973,0.81357,0.817917,0.002263,24
2,0.647973,0.040271,48.961937,0.525642,5,{'classifier__n_neighbors': 5},0.832631,0.828694,0.829503,0.834447,0.826023,0.83026,0.002971,23
3,0.601783,0.090151,43.984841,2.757228,7,{'classifier__n_neighbors': 7},0.836477,0.831441,0.836004,0.836279,0.828312,0.833703,0.00328,22
4,0.558113,0.021283,42.221515,0.747701,9,{'classifier__n_neighbors': 9},0.835378,0.836294,0.835088,0.836279,0.83344,0.835296,0.001045,21
5,0.552386,0.01966,43.432546,1.349194,11,{'classifier__n_neighbors': 11},0.834829,0.835378,0.835912,0.839484,0.832524,0.835625,0.002249,20
6,0.623501,0.127125,48.403224,3.697625,13,{'classifier__n_neighbors': 13},0.836935,0.836477,0.83399,0.840216,0.834173,0.836358,0.002263,19
7,0.572551,0.015096,53.279276,0.812298,15,{'classifier__n_neighbors': 15},0.839315,0.838674,0.837744,0.83985,0.833898,0.837896,0.002119,18
8,0.785192,0.113517,68.424587,2.459225,17,{'classifier__n_neighbors': 17},0.840597,0.83959,0.837561,0.840125,0.83518,0.83861,0.002003,17
9,0.799977,0.254639,69.113404,2.469016,19,{'classifier__n_neighbors': 19},0.839407,0.842062,0.839575,0.841132,0.836553,0.839746,0.001877,15


In [11]:
# One-standard-error selection on the cross-validated accuracies.
mean_scores = Knn_pipe_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)

# Standard error of each mean score: fold-to-fold std / sqrt(number of folds).
# The fold count is read from the fitted search (n_splits_) rather than
# hard-coded, so it cannot drift from the cross-validation configuration.
se_scores = Knn_pipe_cv.cv_results_["std_test_score"] / np.sqrt(Knn_pipe_cv.n_splits_)

# Keep every candidate whose mean score is within one standard error of the best.
one_se_threshold = mean_scores[best_index] - se_scores[best_index]
one_stand_error_data_frame = cv_res[cv_res["mean_test_score"] >= one_se_threshold]

# Among those candidates, show the one(s) with the lowest mean score.
# NOTE(review): the one-standard-error rule is usually stated as choosing the
# simplest qualifying model (for kNN, the largest n_neighbors); this picks the
# lowest-scoring qualifying row instead — confirm which is intended.
one_stand_error_data_frame[
    one_stand_error_data_frame["mean_test_score"]
    == one_stand_error_data_frame["mean_test_score"].min()
]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
13,0.931799,0.162735,93.224938,9.525185,27,{'classifier__n_neighbors': 27},0.841329,0.843527,0.838385,0.843055,0.840399,0.841339,0.001862,6
