In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(".", "transformers")))
from preprocessing_part_2 import * 

In [9]:
new_train_df = pd.read_csv('dataframes/train_geo_clustered.csv')
new_val_df = pd.read_csv('dataframes/val_geo_clustered.csv')

In [10]:
# we have a train df with a new column from first clusterization
new_train_df.head(2)

Unnamed: 0,name,latitude,longitude,review_count,rating,Friday,Saturday,Sunday,Monday,Tuesday,...,Wednesday_morning,Wednesday_afternoon,Wednesday_evening,Thursday_morning,Thursday_afternoon,Thursday_evening,Website_known,Phone_known,main_category,geo_cluster
0,Lead Handyman Services,25.324796,55.417346,-0.135231,1.061147,8-11 AM,8 AM-6 PM,8 AM-6 PM,8 AM-6 AM,8 AM-6 PM,...,0,0,0,0,0,0,1,1,Service,0
1,Sunil Ambalavelil - Lawyer and Legal Consultan...,25.18819,55.271341,-0.136699,1.061147,Hours not available,Hours not available,Hours not available,Hours not available,Hours not available,...,0,0,0,0,0,0,1,1,Attorney,0


# Feature engineering part 2

We still have some columns to delete:
* name - is unique for every company
* latitude and longitude - these features were used for first clustering, now we have geo_cluster so we don't need them anymore
* Friday,...Sunday - We already have binary columns like monday_morning/monday_evening etc

Column that we have to encode: main category

In [11]:
pipes1 = Pipeline([('remove_more_cols',MoreColumnsRemover()), # removing irrelevant columns mentioned above
                   ('encoder', CategoryEncoder()),    # one hot encoding 'main'category' column
                   ('bool_encode', BooleanToNumericEncoder())]) 
train_df = pipes1.fit_transform(new_train_df)




In [18]:
standardizer = Pipeline([('standardizer', SecondStandardizer())])
scaled=standardizer.fit_transform(train_df)
train_df = pd.DataFrame(scaled, columns=train_df.columns)


In [19]:
train_df.describe()

Unnamed: 0,review_count,rating,verified,Friday_morning,Friday_afternoon,Friday_evening,Saturday_morning,Saturday_afternoon,Saturday_evening,Sunday_morning,...,main_category_Office,main_category_Other,main_category_Park,main_category_Pharmacy,main_category_Restaurant,main_category_Service,main_category_Shop,main_category_Store,main_category_Supermarket,main_category_Tourist
count,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,...,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0,15039.0
mean,0.0,-1.8898670000000002e-17,1.8898670000000002e-17,2.26784e-17,2.26784e-17,2.26784e-17,1.5118940000000003e-17,1.5118940000000003e-17,1.5118940000000003e-17,9.827308e-17,...,-1.322907e-17,0.0,-1.13392e-17,1.8898670000000002e-17,3.968721e-17,-1.5118940000000003e-17,-3.779734e-18,6.520041e-17,-6.047574e-17,-7.370481e-17
std,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,...,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033,1.000033
min,-0.11909,-5.042421,-1.53816,-0.3712853,-0.3712853,-0.3712853,-0.3785531,-0.3785531,-0.3785531,-0.3819894,...,-0.08854813,-0.573997,-0.1655127,-0.1240724,-0.233727,-0.4893997,-0.2265187,-0.4262171,-0.1184258,-0.1918721
25%,-0.11735,-0.1984598,-1.53816,-0.3712853,-0.3712853,-0.3712853,-0.3785531,-0.3785531,-0.3785531,-0.3819894,...,-0.08854813,-0.573997,-0.1655127,-0.1240724,-0.233727,-0.4893997,-0.2265187,-0.4262171,-0.1184258,-0.1918721
50%,-0.111385,0.1042878,0.6501275,-0.3712853,-0.3712853,-0.3712853,-0.3785531,-0.3785531,-0.3785531,-0.3819894,...,-0.08854813,-0.573997,-0.1655127,-0.1240724,-0.233727,-0.4893997,-0.2265187,-0.4262171,-0.1184258,-0.1918721
75%,-0.104177,0.709783,0.6501275,-0.3712853,-0.3712853,-0.3712853,-0.3785531,-0.3785531,-0.3785531,-0.3819894,...,-0.08854813,-0.573997,-0.1655127,-0.1240724,-0.233727,-0.4893997,-0.2265187,-0.4262171,-0.1184258,-0.1918721
max,54.578494,1.012531,0.6501275,2.693347,2.693347,2.693347,2.641638,2.641638,2.641638,2.617874,...,11.29329,1.742168,6.041833,8.059809,4.278495,2.04332,4.414646,2.346222,8.444104,5.211806


At this point all of our features are numeric and standardized, so let's start some clustering for this data

# PCA

In [20]:
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

In [22]:
#let's see if we can achieve little information loss with only 2 components
pca = PCA(n_components=2)
principal_cmponents = pca.fit_transform(train_df)
print(pca.explained_variance_)

[20.59779486  1.56943742]


Explained variance measures the proportion of variance in the data that is explained by each principal component. We can see that 20% is very little so we will have to add more components

In [24]:
pca = PCA(.85)
principal_cmponents = pca.fit_transform(train_df)
print(pca.n_components_)

21


In order to have 85% of the variance explained we need 21 components. Our dataframe has 52 so that is less than half of the features that we would need withou pca

# Model 1

In [31]:
from sklearn.cluster import KMeans
train_data_reduced = pca.fit_transform(train_df)

inertia=[]
for k in range (1,11):
    model = KMeans(n_clusters=k)
    model.fit(train_data_reduced)
    inertia.append(model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
