In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy import stats

In [2]:
url = "../data/Train.csv"
df = pd.read_csv(url)
df.head()


Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [3]:
df["Ever_Married"].unique()

array(['No', 'Yes', nan], dtype=object)

In [4]:
df.shape

(8068, 11)

In [5]:
df = df.drop(columns = ['Var_1', 'Segmentation', "ID"])

In [6]:
df.isnull().sum()

Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
dtype: int64

In [7]:
df.dropna(inplace = True)
df.shape

(6718, 8)

In [8]:
df["Graduated"].unique()

array(['No', 'Yes'], dtype=object)

In [9]:
df.isnull().sum()

Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
dtype: int64

In [10]:
df.duplicated().sum()

1135

In [11]:
# One-hot encoding for sex and title
df_dummies = pd.get_dummies(df, columns=["Profession"])
df_dummies

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,Male,No,22,No,1.0,Low,4.0,False,False,False,False,False,True,False,False,False
2,Female,Yes,67,Yes,1.0,Low,1.0,False,False,True,False,False,False,False,False,False
3,Male,Yes,67,Yes,0.0,High,2.0,False,False,False,False,False,False,False,True,False
5,Male,Yes,56,No,0.0,Average,2.0,True,False,False,False,False,False,False,False,False
6,Male,No,32,Yes,1.0,Low,3.0,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8062,Male,Yes,41,Yes,0.0,High,5.0,True,False,False,False,False,False,False,False,False
8064,Male,No,35,No,3.0,Low,4.0,False,False,False,False,True,False,False,False,False
8065,Female,No,33,Yes,1.0,Low,1.0,False,False,False,False,False,True,False,False,False
8066,Female,No,27,Yes,1.0,Low,4.0,False,False,False,False,False,True,False,False,False


In [12]:
gender_map = {"Female": True, "Male": False}
df_dummies["Gender"] = df_dummies["Gender"].map(gender_map)

In [13]:
def mapeo(df, column):

    map_total = {"Yes": True, "No": False}
    df[column] = df[column].map(map_total)
    return df

In [14]:
mapeo(df_dummies, "Ever_Married")
mapeo(df_dummies, "Graduated")

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing
0,False,False,22,False,1.0,Low,4.0,False,False,False,False,False,True,False,False,False
2,True,True,67,True,1.0,Low,1.0,False,False,True,False,False,False,False,False,False
3,False,True,67,True,0.0,High,2.0,False,False,False,False,False,False,False,True,False
5,False,True,56,False,0.0,Average,2.0,True,False,False,False,False,False,False,False,False
6,False,False,32,True,1.0,Low,3.0,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8062,False,True,41,True,0.0,High,5.0,True,False,False,False,False,False,False,False,False
8064,False,False,35,False,3.0,Low,4.0,False,False,False,False,True,False,False,False,False
8065,True,False,33,True,1.0,Low,1.0,False,False,False,False,False,True,False,False,False
8066,True,False,27,True,1.0,Low,4.0,False,False,False,False,False,True,False,False,False


In [15]:
features = df_dummies.drop('Spending_Score', axis=1)
target = df_dummies['Spending_Score']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [34]:
knn = KNeighborsClassifier(n_neighbors=8)

In [35]:
knn.fit(X_train, y_train)

In [36]:
pred = knn.predict(X_test) 
pred

array(['Low', 'High', 'Low', ..., 'Average', 'Low', 'Low'], dtype=object)

In [37]:
knn.score(X_test, y_test) 

0.7447916666666666