# Importing Packages and Data

In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo
from numpy import mean
from numpy import std
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import RepeatedKFold, cross_val_score, RepeatedStratifiedKFold
import statsmodels.discrete.discrete_model as sm
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fetch dataset id 183 from the UCI ML repository via ucimlrepo.
communities_and_crime = fetch_ucirepo(id=183)

# Feature table and target table as exposed by the fetched dataset object.
X, y = communities_and_crime.data.features, communities_and_crime.data.targets

In [None]:
# Put features and target side by side in one frame (index-aligned left join, the default for DataFrame.join).
groups_df = X.join(y, how="left")

In [None]:
# The raw data marks missing entries with '?'; turn them into real NaNs so pandas can detect them.
groups_df = groups_df.replace(to_replace='?', value=np.nan)

In [None]:
# Bin the continuous target into three ordered classes:
#   (-0.1, 0.25] -> Low, (0.25, 0.40] -> Medium, (0.40, 1] -> High.
# pd.cut intervals are right-closed by default, so the lowest edge is set below 0
# to keep a value of exactly 0 inside the first bin.
groups_df["CrimeCategory"] = pd.cut(
    groups_df['ViolentCrimesPerPop'],
    bins=[-0.1, 0.25, 0.40, 1],
    labels=['Low', 'Medium', 'High'],
)

In [None]:
# The attributes are split into predetermined groups based on their real-world relationship to each other.
# Every group keeps "CrimeCategory" as its LAST column so later cells can take the target with iloc[:, -1].
race_make_up = groups_df[["racePctWhite", "racepctblack", "racePctAsian", "racePctHisp", "CrimeCategory"]]

age = groups_df[["agePct12t21", "agePct12t29", "agePct16t24", "agePct65up", "CrimeCategory"]]

urban_factors = groups_df[["numbUrban", "pctUrban", "CrimeCategory"]]

income_factors = groups_df[[
    "medIncome", "pctWWage", "pctWFarmSelf", "pctWInvInc", "pctWSocSec", "pctWPubAsst", "pctWRetire",
    "medFamInc", "perCapInc", "whitePerCap", "blackPerCap", "indianPerCap", "AsianPerCap", "OtherPerCap",
    "HispPerCap", "CrimeCategory",
]]

poverty = groups_df[["NumUnderPov", "PctPopUnderPov", "CrimeCategory"]]

education = groups_df[["PctLess9thGrade", "PctNotHSGrad", "PctBSorMore", "CrimeCategory"]]

employment = groups_df[[
    "PctUnemployed", "PctEmploy", "PctEmplManu", "PctEmplProfServ", "PctOccupManu", "PctOccupMgmtProf",
    "CrimeCategory",
]]

marriage_factors = groups_df[["MalePctDivorce", "MalePctNevMarr", "FemalePctDiv", "TotalPctDiv", "CrimeCategory"]]

domestic_factors = groups_df[[
    "PersPerFam", "PctFam2Par", "PctKids2Par", "PctYoungKids2Par", "PctTeen2Par", "PctWorkMomYoungKids",
    "PctWorkMom", "NumIlleg", "PctIlleg", "CrimeCategory",
]]

immigration = groups_df[[
    "PctImmigRecent", "PctImmigRec5", "PctImmigRec8", "PctImmigRec10", "PctRecentImmig", "PctRecImmig5",
    "PctRecImmig8", "PctRecImmig10", "CrimeCategory",
]]

english_proficiency = groups_df[["PctSpeakEnglOnly", "PctNotSpeakEnglWell", "CrimeCategory"]]

housing_factors = groups_df[[
    "PctLargHouseOccup", "PersPerOccupHous", "PersPerOwnOccHous", "PersPerRentOccHous", "PctPersOwnOccup",
    "PctPersDenseHous", "HousVacant", "PctHousOccup", "PctHousOwnOcc", "PctVacantBoarded", "PctVacMore6Mos",
    "OwnOccLowQuart", "OwnOccMedVal", "OwnOccHiQuart", "PctHousLess3BR", "MedNumBR", "MedYrHousBuilt",
    "PctHousNoPhone", "PctWOFullPlumb", "CrimeCategory",
]]

housing_cost = groups_df[[
    "RentLowQ", "RentMedian", "RentHighQ", "MedRent", "MedRentPctHousInc", "MedOwnCostPctInc",
    "MedOwnCostPctIncNoMtg", "CrimeCategory",
]]

unhoused = groups_df[["NumInShelters", "NumStreet", "CrimeCategory"]]

mobility = groups_df[[
    "PctForeignBorn", "PctBornSameState", "PctSameHouse85", "PctSameCity85", "PctSameState85", "CrimeCategory",
]]

policing = groups_df[[
    "LemasSwornFT", "LemasSwFTPerPop", "LemasSwFTFieldOps", "LemasSwFTFieldPerPop", "LemasTotalReq",
    "LemasTotReqPerPop", "PolicReqPerOffic", "PolicPerPop", "OfficAssgnDrugUnits", "NumKindsDrugsSeiz",
    "PolicAveOTWorked", "PolicCars", "PolicOperBudg", "LemasPctPolicOnPatr", "LemasGangUnitDeploy",
    "LemasPctOfficDrugUn", "PolicBudgPerPop", "RacialMatchCommPol", "PctPolicWhite", "PctPolicBlack",
    "PctPolicHisp", "PctPolicAsian", "PctPolicMinor", "CrimeCategory",
]]

density = groups_df[["LandArea", "PopDens", "CrimeCategory"]]

transit = groups_df[["PctUsePubTrans", "CrimeCategory"]]

# Determining Optimal Number of Neighbors for Each Group in KNN Model Training

In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
race_make_up_features = race_make_up.iloc[:, :-1]
race_make_up_target = race_make_up.iloc[:, -1]
X_train_1, X_test_1, Y_train_1, Y_test_1 = train_test_split(
    race_make_up_features, race_make_up_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_1 = []
MCC_1 = []
for i in range(1, 40):
    neigh_1 = KNeighborsClassifier(n_neighbors=i).fit(X_train_1, Y_train_1)
    yhat_1 = neigh_1.predict(X_test_1)
    f1_1.append(metrics.f1_score(Y_test_1, yhat_1, average='macro'))
    MCC_1.append(matthews_corrcoef(Y_test_1, yhat_1))
print("Maximum f1:-",max(f1_1),"at K =",f1_1.index(max(f1_1))+1)
print("Maximum MCC:-",max(MCC_1),"at K =",MCC_1.index(max(MCC_1))+1)


Maximum f1:- 0.521117446106566 at K = 34
Maximum MCC:- 0.38559050805530404 at K = 15


In [None]:
# The metrics peak at different K (f1 at K=34, MCC at K=15 in the recorded run), so we check
# f1 at the MCC-optimal K and MCC at the f1-optimal K. f1 decreases the least, so we choose K=15.
# Indices are derived from the metric lists (index 0 is K=1) rather than hard-coded.
print(f1_1[MCC_1.index(max(MCC_1))])  # f1 at the MCC-optimal K
print(MCC_1[f1_1.index(max(f1_1))])   # MCC at the f1-optimal K

0.5202506385696041
0.3789819840980904


In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
age_features = age.iloc[:, :-1]
age_target = age.iloc[:, -1]
X_train_2, X_test_2, Y_train_2, Y_test_2 = train_test_split(
    age_features, age_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_2 = []
MCC_2 = []
for i in range(1, 40):
    neigh_2 = KNeighborsClassifier(n_neighbors=i).fit(X_train_2, Y_train_2)
    yhat_2 = neigh_2.predict(X_test_2)
    f1_2.append(metrics.f1_score(Y_test_2, yhat_2, average='macro'))
    MCC_2.append(matthews_corrcoef(Y_test_2, yhat_2))
print("Maximum f1:-",max(f1_2),"at K =",f1_2.index(max(f1_2))+1)
print("Maximum MCC:-",max(MCC_2),"at K =",MCC_2.index(max(MCC_2))+1)


Maximum f1:- 0.4283741175406801 at K = 1
Maximum MCC:- 0.2329429912558172 at K = 15


In [None]:
# Comparing f1 at the MCC-optimal K with MCC at the f1-optimal K: f1 decreased the least at K=15,
# so we choose K=15 (recorded run). Indices are derived from the metric lists (index 0 is K=1).
print(f1_2[MCC_2.index(max(MCC_2))])  # f1 at the MCC-optimal K
print(MCC_2[f1_2.index(max(f1_2))])   # MCC at the f1-optimal K


0.4189576375014616
0.20980974480550008


In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
urban_factors_features = urban_factors.iloc[:, :-1]
urban_factors_target = urban_factors.iloc[:, -1]
X_train_3, X_test_3, Y_train_3, Y_test_3 = train_test_split(
    urban_factors_features, urban_factors_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# The classifier is consistently named neigh_3, matching the other groups' suffix convention.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_3 = []
MCC_3 = []
for i in range(1, 40):
    neigh_3 = KNeighborsClassifier(n_neighbors=i).fit(X_train_3, Y_train_3)
    yhat_3 = neigh_3.predict(X_test_3)
    f1_3.append(metrics.f1_score(Y_test_3, yhat_3, average='macro'))
    MCC_3.append(matthews_corrcoef(Y_test_3, yhat_3))
print("Maximum f1:-",max(f1_3),"at K =",f1_3.index(max(f1_3))+1)
print("Maximum MCC:-",max(MCC_3),"at K =",MCC_3.index(max(MCC_3))+1)

Maximum f1:- 0.43059952790438166 at K = 1
Maximum MCC:- 0.29171309092878245 at K = 20


In [None]:
# Once again, f1 decreases the least, so we choose K=20 (recorded run).
# Indices are derived from the metric lists (index 0 is K=1) rather than hard-coded.
print(f1_3[MCC_3.index(max(MCC_3))])  # f1 at the MCC-optimal K
print(MCC_3[f1_3.index(max(f1_3))])   # MCC at the f1-optimal K

0.42882258170239046
0.19555178465858059


In [None]:
# After the initial model code threw an error, this group was inspected and found to contain
# one missing value that has to be dealt with.
# NOTE(review): the recorded output shows zero missing values, which suggests this cell was
# re-run after the fill below — confirm with Restart & Run All.
income_factors.isna().sum(axis=0)

medIncome        0
pctWWage         0
pctWFarmSelf     0
pctWInvInc       0
pctWSocSec       0
pctWPubAsst      0
pctWRetire       0
medFamInc        0
perCapInc        0
whitePerCap      0
blackPerCap      0
indianPerCap     0
AsianPerCap      0
OtherPerCap      0
HispPerCap       0
CrimeCategory    0
dtype: int64

In [None]:
# Fill the missing value with the most frequent existing value of its column; replacing a single
# value should not have a large impact on model training.
# The result is re-bound instead of using inplace=True: income_factors is a slice of groups_df,
# and in-place modification of it triggers pandas' SettingWithCopyWarning.
income_factors = income_factors.fillna(
    income_factors.select_dtypes(include='object').mode().iloc[0]
)
income_factors.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_factors.fillna(income_factors.select_dtypes(include='object').mode().iloc[0], inplace=True)


medIncome        0
pctWWage         0
pctWFarmSelf     0
pctWInvInc       0
pctWSocSec       0
pctWPubAsst      0
pctWRetire       0
medFamInc        0
perCapInc        0
whitePerCap      0
blackPerCap      0
indianPerCap     0
AsianPerCap      0
OtherPerCap      0
HispPerCap       0
CrimeCategory    0
dtype: int64

In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
income_factors_features = income_factors.iloc[:, :-1]
income_factors_target = income_factors.iloc[:, -1]
X_train_4, X_test_4, Y_train_4, Y_test_4 = train_test_split(
    income_factors_features, income_factors_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_4 = []
MCC_4 = []
for i in range(1, 40):
    neigh_4 = KNeighborsClassifier(n_neighbors=i).fit(X_train_4, Y_train_4)
    yhat_4 = neigh_4.predict(X_test_4)
    f1_4.append(metrics.f1_score(Y_test_4, yhat_4, average='macro'))
    MCC_4.append(matthews_corrcoef(Y_test_4, yhat_4))
print("Maximum f1:-",max(f1_4),"at K =",f1_4.index(max(f1_4))+1)
print("Maximum MCC:-",max(MCC_4),"at K =",MCC_4.index(max(MCC_4))+1)

Maximum f1:- 0.5415680776446296 at K = 3
Maximum MCC:- 0.4083493151020328 at K = 5


In [None]:
# Since MCC changed the least, we choose K=3 (recorded run).
# Indices are derived from the metric lists (index 0 is K=1) rather than hard-coded.
print(f1_4[MCC_4.index(max(MCC_4))])  # f1 at the MCC-optimal K
print(MCC_4[f1_4.index(max(f1_4))])   # MCC at the f1-optimal K

0.5108225108225108
0.3979369589985626


In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
poverty_features = poverty.iloc[:, :-1]
poverty_target = poverty.iloc[:, -1]
X_train_5, X_test_5, Y_train_5, Y_test_5 = train_test_split(
    poverty_features, poverty_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_5 = []
MCC_5 = []
for i in range(1, 40):
    neigh_5 = KNeighborsClassifier(n_neighbors=i).fit(X_train_5, Y_train_5)
    yhat_5 = neigh_5.predict(X_test_5)
    f1_5.append(metrics.f1_score(Y_test_5, yhat_5, average='macro'))
    MCC_5.append(matthews_corrcoef(Y_test_5, yhat_5))
print("Maximum f1:-",max(f1_5),"at K =",f1_5.index(max(f1_5))+1)
print("Maximum MCC:-",max(MCC_5),"at K =",MCC_5.index(max(MCC_5))+1)

Maximum f1:- 0.5068549485940791 at K = 15
Maximum MCC:- 0.40580141205424297 at K = 38


In [None]:
# f1 decreased the least, so we choose K=38 (recorded run).
# Indices are derived from the metric lists (index 0 is K=1) rather than hard-coded.
print(f1_5[MCC_5.index(max(MCC_5))])  # f1 at the MCC-optimal K
print(MCC_5[f1_5.index(max(f1_5))])   # MCC at the f1-optimal K

0.48571069656428617
0.3788618865443606


In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
education_features = education.iloc[:, :-1]
education_target = education.iloc[:, -1]
X_train_6, X_test_6, Y_train_6, Y_test_6 = train_test_split(
    education_features, education_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_6 = []
MCC_6 = []
for i in range(1, 40):
    neigh_6 = KNeighborsClassifier(n_neighbors=i).fit(X_train_6, Y_train_6)
    yhat_6 = neigh_6.predict(X_test_6)
    f1_6.append(metrics.f1_score(Y_test_6, yhat_6, average='macro'))
    MCC_6.append(matthews_corrcoef(Y_test_6, yhat_6))
print("Maximum f1:-",max(f1_6),"at K =",f1_6.index(max(f1_6))+1)
print("Maximum MCC:-",max(MCC_6),"at K =",MCC_6.index(max(MCC_6))+1)

Maximum f1:- 0.4617581063839149 at K = 4
Maximum MCC:- 0.282575488666128 at K = 37


In [None]:
# MCC decreased the least, so we use K=4 (recorded run).
# Indices are derived from the metric lists (index 0 is K=1) rather than hard-coded.
print(f1_6[MCC_6.index(max(MCC_6))])  # f1 at the MCC-optimal K
print(MCC_6[f1_6.index(max(f1_6))])   # MCC at the f1-optimal K

0.42394487510766576
0.2709155728783489


In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
employment_features = employment.iloc[:, :-1]
employment_target = employment.iloc[:, -1]
X_train_7, X_test_7, Y_train_7, Y_test_7 = train_test_split(
    employment_features, employment_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_7 = []
MCC_7 = []
for i in range(1, 40):
    neigh_7 = KNeighborsClassifier(n_neighbors=i).fit(X_train_7, Y_train_7)
    yhat_7 = neigh_7.predict(X_test_7)
    f1_7.append(metrics.f1_score(Y_test_7, yhat_7, average='macro'))
    MCC_7.append(matthews_corrcoef(Y_test_7, yhat_7))
print("Maximum f1:-",max(f1_7),"at K =",f1_7.index(max(f1_7))+1)
print("Maximum MCC:-",max(MCC_7),"at K =",MCC_7.index(max(MCC_7))+1)


Maximum f1:- 0.4448470209339775 at K = 10
Maximum MCC:- 0.29096995493742384 at K = 30


In [None]:
# f1 decreased the least, so we use K=30 (recorded run).
# Indices are derived from the metric lists (index 0 is K=1) rather than hard-coded.
print(f1_7[MCC_7.index(max(MCC_7))])  # f1 at the MCC-optimal K
print(MCC_7[f1_7.index(max(f1_7))])   # MCC at the f1-optimal K

0.4363815959528821
0.24761084693966026


In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
marriage_factors_features = marriage_factors.iloc[:, :-1]
marriage_factors_target = marriage_factors.iloc[:, -1]
X_train_8, X_test_8, Y_train_8, Y_test_8 = train_test_split(
    marriage_factors_features, marriage_factors_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_8 = []
MCC_8 = []
for i in range(1, 40):
    neigh_8 = KNeighborsClassifier(n_neighbors=i).fit(X_train_8, Y_train_8)
    yhat_8 = neigh_8.predict(X_test_8)
    f1_8.append(metrics.f1_score(Y_test_8, yhat_8, average='macro'))
    MCC_8.append(matthews_corrcoef(Y_test_8, yhat_8))
print("Maximum f1:-",max(f1_8),"at K =",f1_8.index(max(f1_8))+1)
print("Maximum MCC:-",max(MCC_8),"at K =",MCC_8.index(max(MCC_8))+1)

Maximum f1:- 0.5073006473253198 at K = 7
Maximum MCC:- 0.37001572975359515 at K = 20


In [None]:
# f1 decreased the least, so we use K=20 (recorded run).
# NOTE(review): this holds for the relative decrease; in absolute terms the two drops are
# nearly equal in the recorded output — confirm which criterion is intended.
# Indices are derived from the metric lists (index 0 is K=1) rather than hard-coded.
print(f1_8[MCC_8.index(max(MCC_8))])  # f1 at the MCC-optimal K
print(MCC_8[f1_8.index(max(f1_8))])   # MCC at the f1-optimal K

0.4834759877721419
0.34829425735456354


In [None]:
# Both evaluation metrics peak at the same K in the recorded run, so we use K=9.
# Features are all columns but the last; the last column (CrimeCategory) is the target.
domestic_factors_features = domestic_factors.iloc[:, :-1]
domestic_factors_target = domestic_factors.iloc[:, -1]
X_train_9, X_test_9, Y_train_9, Y_test_9 = train_test_split(
    domestic_factors_features, domestic_factors_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_9 = []
MCC_9 = []
for i in range(1, 40):
    neigh_9 = KNeighborsClassifier(n_neighbors=i).fit(X_train_9, Y_train_9)
    yhat_9 = neigh_9.predict(X_test_9)
    f1_9.append(metrics.f1_score(Y_test_9, yhat_9, average='macro'))
    MCC_9.append(matthews_corrcoef(Y_test_9, yhat_9))
print("Maximum f1:-",max(f1_9),"at K =",f1_9.index(max(f1_9))+1)
print("Maximum MCC:-",max(MCC_9),"at K =",MCC_9.index(max(MCC_9))+1)

Maximum f1:- 0.5707643237204927 at K = 9
Maximum MCC:- 0.47572260642453373 at K = 9


In [None]:
# Both evaluation metrics peak at the same K in the recorded run, so we use K=3.
# Features are all columns but the last; the last column (CrimeCategory) is the target.
immigration_features = immigration.iloc[:, :-1]
immigration_target = immigration.iloc[:, -1]
X_train_10, X_test_10, Y_train_10, Y_test_10 = train_test_split(
    immigration_features, immigration_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_10 = []
MCC_10 = []
for i in range(1, 40):
    neigh_10 = KNeighborsClassifier(n_neighbors=i).fit(X_train_10, Y_train_10)
    yhat_10 = neigh_10.predict(X_test_10)
    f1_10.append(metrics.f1_score(Y_test_10, yhat_10, average='macro'))
    MCC_10.append(matthews_corrcoef(Y_test_10, yhat_10))
print("Maximum f1:-",max(f1_10),"at K =",f1_10.index(max(f1_10))+1)
print("Maximum MCC:-",max(MCC_10),"at K =",MCC_10.index(max(MCC_10))+1)

Maximum f1:- 0.44034400547332103 at K = 3
Maximum MCC:- 0.23009515476428155 at K = 3


In [None]:
# Both evaluation metrics peak at the same K in the recorded run, so we use K=7.
# Features are all columns but the last; the last column (CrimeCategory) is the target.
english_proficiency_features = english_proficiency.iloc[:, :-1]
english_proficiency_target = english_proficiency.iloc[:, -1]
X_train_11, X_test_11, Y_train_11, Y_test_11 = train_test_split(
    english_proficiency_features, english_proficiency_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_11 = []
MCC_11 = []
for i in range(1, 40):
    neigh_11 = KNeighborsClassifier(n_neighbors=i).fit(X_train_11, Y_train_11)
    yhat_11 = neigh_11.predict(X_test_11)
    f1_11.append(metrics.f1_score(Y_test_11, yhat_11, average='macro'))
    MCC_11.append(matthews_corrcoef(Y_test_11, yhat_11))
print("Maximum f1:-",max(f1_11),"at K =",f1_11.index(max(f1_11))+1)
print("Maximum MCC:-",max(MCC_11),"at K =",MCC_11.index(max(MCC_11))+1)

Maximum f1:- 0.42106876964448176 at K = 7
Maximum MCC:- 0.20002387427440038 at K = 7


In [None]:
# Both evaluation metrics peak at the same K in the recorded run, so we use K=12.
# Features are all columns but the last; the last column (CrimeCategory) is the target.
housing_factors_features = housing_factors.iloc[:, :-1]
housing_factors_target = housing_factors.iloc[:, -1]
X_train_12, X_test_12, Y_train_12, Y_test_12 = train_test_split(
    housing_factors_features, housing_factors_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_12 = []
MCC_12 = []
for i in range(1, 40):
    neigh_12 = KNeighborsClassifier(n_neighbors=i).fit(X_train_12, Y_train_12)
    yhat_12 = neigh_12.predict(X_test_12)
    f1_12.append(metrics.f1_score(Y_test_12, yhat_12, average='macro'))
    MCC_12.append(matthews_corrcoef(Y_test_12, yhat_12))
print("Maximum f1:-",max(f1_12),"at K =",f1_12.index(max(f1_12))+1)
print("Maximum MCC:-",max(MCC_12),"at K =",MCC_12.index(max(MCC_12))+1)

Maximum f1:- 0.5753078756929847 at K = 12
Maximum MCC:- 0.47505750255066576 at K = 12


In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
housing_cost_features = housing_cost.iloc[:, :-1]
housing_cost_target = housing_cost.iloc[:, -1]
X_train_13, X_test_13, Y_train_13, Y_test_13 = train_test_split(
    housing_cost_features, housing_cost_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_13 = []
MCC_13 = []
for i in range(1, 40):
    neigh_13 = KNeighborsClassifier(n_neighbors=i).fit(X_train_13, Y_train_13)
    yhat_13 = neigh_13.predict(X_test_13)
    f1_13.append(metrics.f1_score(Y_test_13, yhat_13, average='macro'))
    MCC_13.append(matthews_corrcoef(Y_test_13, yhat_13))
print("Maximum f1:-",max(f1_13),"at K =",f1_13.index(max(f1_13))+1)
print("Maximum MCC:-",max(MCC_13),"at K =",MCC_13.index(max(MCC_13))+1)

Maximum f1:- 0.4378404989405944 at K = 3
Maximum MCC:- 0.24098284559866462 at K = 21


In [None]:
# Since MCC decreased the least, we use K=3 (recorded run).
# Indices are derived from the metric lists (index 0 is K=1) rather than hard-coded.
print(f1_13[MCC_13.index(max(MCC_13))])  # f1 at the MCC-optimal K
print(MCC_13[f1_13.index(max(f1_13))])   # MCC at the f1-optimal K

0.4041276021654115
0.2346146708032485


In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
unhoused_features = unhoused.iloc[:, :-1]
unhoused_target = unhoused.iloc[:, -1]
X_train_14, X_test_14, Y_train_14, Y_test_14 = train_test_split(
    unhoused_features, unhoused_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_14 = []
MCC_14 = []
for i in range(1, 40):
    neigh_14 = KNeighborsClassifier(n_neighbors=i).fit(X_train_14, Y_train_14)
    yhat_14 = neigh_14.predict(X_test_14)
    f1_14.append(metrics.f1_score(Y_test_14, yhat_14, average='macro'))
    MCC_14.append(matthews_corrcoef(Y_test_14, yhat_14))
print("Maximum f1:-",max(f1_14),"at K =",f1_14.index(max(f1_14))+1)
print("Maximum MCC:-",max(MCC_14),"at K =",MCC_14.index(max(MCC_14))+1)

Maximum f1:- 0.5163675500968486 at K = 9
Maximum MCC:- 0.3892612888492211 at K = 7


In [None]:
# MCC decreased the least, so we use K=9 (recorded run).
# Indices are derived from the metric lists (index 0 is K=1) rather than hard-coded.
print(f1_14[MCC_14.index(max(MCC_14))])  # f1 at the MCC-optimal K
print(MCC_14[f1_14.index(max(f1_14))])   # MCC at the f1-optimal K

0.513540969595364
0.3880183407634416


In [None]:
# Both evaluation metrics peak at the same K in the recorded run, so we use K=3.
# Features are all columns but the last; the last column (CrimeCategory) is the target.
mobility_features = mobility.iloc[:, :-1]
mobility_target = mobility.iloc[:, -1]
X_train_15, X_test_15, Y_train_15, Y_test_15 = train_test_split(
    mobility_features, mobility_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_15 = []
MCC_15 = []
for i in range(1, 40):
    neigh_15 = KNeighborsClassifier(n_neighbors=i).fit(X_train_15, Y_train_15)
    yhat_15 = neigh_15.predict(X_test_15)
    f1_15.append(metrics.f1_score(Y_test_15, yhat_15, average='macro'))
    MCC_15.append(matthews_corrcoef(Y_test_15, yhat_15))
print("Maximum f1:-",max(f1_15),"at K =",f1_15.index(max(f1_15))+1)
print("Maximum MCC:-",max(MCC_15),"at K =",MCC_15.index(max(MCC_15))+1)

Maximum f1:- 0.44243961932989606 at K = 3
Maximum MCC:- 0.23520868734938197 at K = 3


In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
density_features = density.iloc[:, :-1]
density_target = density.iloc[:, -1]
X_train_16, X_test_16, Y_train_16, Y_test_16 = train_test_split(
    density_features, density_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_16 = []
MCC_16 = []
for i in range(1, 40):
    neigh_16 = KNeighborsClassifier(n_neighbors=i).fit(X_train_16, Y_train_16)
    yhat_16 = neigh_16.predict(X_test_16)
    f1_16.append(metrics.f1_score(Y_test_16, yhat_16, average='macro'))
    MCC_16.append(matthews_corrcoef(Y_test_16, yhat_16))
print("Maximum f1:-",max(f1_16),"at K =",f1_16.index(max(f1_16))+1)
print("Maximum MCC:-",max(MCC_16),"at K =",MCC_16.index(max(MCC_16))+1)

Maximum f1:- 0.4396525215252152 at K = 1
Maximum MCC:- 0.2868789244207991 at K = 33


In [None]:
# f1 decreases the least, so we use K=33 (recorded run).
# Indices are derived from the metric lists (index 0 is K=1) rather than hard-coded.
print(f1_16[MCC_16.index(max(MCC_16))])  # f1 at the MCC-optimal K
print(MCC_16[f1_16.index(max(f1_16))])   # MCC at the f1-optimal K

0.42304976277170914
0.1874849417740299


In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
transit_features = transit.iloc[:, :-1]
transit_target = transit.iloc[:, -1]
X_train_17, X_test_17, Y_train_17, Y_test_17 = train_test_split(
    transit_features, transit_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_17 = []
MCC_17 = []
for i in range(1, 40):
    neigh_17 = KNeighborsClassifier(n_neighbors=i).fit(X_train_17, Y_train_17)
    yhat_17 = neigh_17.predict(X_test_17)
    f1_17.append(metrics.f1_score(Y_test_17, yhat_17, average='macro'))
    MCC_17.append(matthews_corrcoef(Y_test_17, yhat_17))
print("Maximum f1:-",max(f1_17),"at K =",f1_17.index(max(f1_17))+1)
print("Maximum MCC:-",max(MCC_17),"at K =",MCC_17.index(max(MCC_17))+1)

Maximum f1:- 0.380228525101797 at K = 1
Maximum MCC:- 0.13768026196224092 at K = 6


In [None]:
# Since f1 decreases the least, we take K=6 (recorded run).
# Indices are derived from the metric lists (index 0 is K=1) rather than hard-coded.
print(f1_17[MCC_17.index(max(MCC_17))])  # f1 at the MCC-optimal K
print(MCC_17[f1_17.index(max(f1_17))])   # MCC at the f1-optimal K

0.3535724026224129
0.07963707499213638


In [None]:
# Count missing values per column in the policing group before cleaning.
policing.isna().sum(axis=0)

LemasSwornFT            1675
LemasSwFTPerPop         1675
LemasSwFTFieldOps       1675
LemasSwFTFieldPerPop    1675
LemasTotalReq           1675
LemasTotReqPerPop       1675
PolicReqPerOffic        1675
PolicPerPop             1675
OfficAssgnDrugUnits     1675
NumKindsDrugsSeiz       1675
PolicAveOTWorked        1675
PolicCars               1675
PolicOperBudg           1675
LemasPctPolicOnPatr     1675
LemasGangUnitDeploy     1675
LemasPctOfficDrugUn        0
PolicBudgPerPop         1675
RacialMatchCommPol      1675
PctPolicWhite           1675
PctPolicBlack           1675
PctPolicHisp            1675
PctPolicAsian           1675
PctPolicMinor           1675
CrimeCategory              0
dtype: int64

In [None]:
# Drop every row that has at least one missing policing attribute.
# In the recorded output most policing columns are missing for 1675 rows, so only a small subset of rows remains.
policing = policing.dropna(axis=0, how="any")

In [None]:
# Verify that no missing values remain in the policing group after dropping incomplete rows.
policing.isna().sum(axis=0)

LemasSwornFT            0
LemasSwFTPerPop         0
LemasSwFTFieldOps       0
LemasSwFTFieldPerPop    0
LemasTotalReq           0
LemasTotReqPerPop       0
PolicReqPerOffic        0
PolicPerPop             0
OfficAssgnDrugUnits     0
NumKindsDrugsSeiz       0
PolicAveOTWorked        0
PolicCars               0
PolicOperBudg           0
LemasPctPolicOnPatr     0
LemasGangUnitDeploy     0
LemasPctOfficDrugUn     0
PolicBudgPerPop         0
RacialMatchCommPol      0
PctPolicWhite           0
PctPolicBlack           0
PctPolicHisp            0
PctPolicAsian           0
PctPolicMinor           0
CrimeCategory           0
dtype: int64

In [None]:
# Features are all columns but the last; the last column (CrimeCategory) is the target.
policing_features = policing.iloc[:, :-1]
policing_target = policing.iloc[:, -1]
X_train_18, X_test_18, Y_train_18, Y_test_18 = train_test_split(
    policing_features, policing_target, test_size=0.2, random_state=1
)

# Fit each K in 1..39 once and score the same predictions with both metrics.
# List index 0 corresponds to K=1, hence the +1 when reporting the best K.
# NOTE(review): K is chosen on the test split, so the reported maxima are optimistic.
f1_18 = []
MCC_18 = []
for i in range(1, 40):
    neigh_18 = KNeighborsClassifier(n_neighbors=i).fit(X_train_18, Y_train_18)
    yhat_18 = neigh_18.predict(X_test_18)
    f1_18.append(metrics.f1_score(Y_test_18, yhat_18, average='macro'))
    MCC_18.append(matthews_corrcoef(Y_test_18, yhat_18))
print("Maximum f1:-",max(f1_18),"at K =",f1_18.index(max(f1_18))+1)
print("Maximum MCC:-",max(MCC_18),"at K =",MCC_18.index(max(MCC_18))+1)

Maximum f1:- 0.5378439267567827 at K = 16
Maximum MCC:- 0.45054236893837146 at K = 16
