# Feature construction by KMeans

Cluster businesses and cluster users separately with KMeans. For each review, add the two cluster labels (the business's and the user's) as features.

The idea is that similar users will have similar rating habits, and similar businesses will receive similar ratings from users.

# Code

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.get_data import *
from IPython.display import clear_output
from sklearn.cluster import KMeans
from sklearn.linear_model import Ridge
from src.get_data import *
import ast
import time

In [3]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to local project code
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Fetch data, do clustering, and add feature(s)

In [37]:
# Fetch the training inputs (business table, user table, reviews) and the
# validation reviews from the project helpers.
b_data, u_data, reviews = get_training_data()
valid_queries = get_validation_reviews()

# Clustering hyper-parameters, named so they can be tuned in one place.
N_CLUSTERS = 8
# KMeans initialisation is random; pinning the seed makes the cluster labels
# (and everything computed from them downstream) reproducible across runs.
SEED = 0

# NOTE(review): KMeans is distance-based; presumably the columns of b_data and
# u_data are all numeric and on comparable scales -- confirm, or standardise
# them before clustering.

# --- Businesses: fit the clustering, label every row, append the label column
# and pass it through one_hot_encode.
kmeans_b = KMeans(n_clusters=N_CLUSTERS, random_state=SEED)
b_labels = kmeans_b.fit_predict(b_data.values)  # same as fit() then predict() on the same data

b_data.insert(loc=len(b_data.columns), column='KMeans_cluster_b', value=b_labels)
b_data = one_hot_encode(b_data, 'KMeans_cluster_b')

# --- Users: same procedure with an independent model.
kmeans_u = KMeans(n_clusters=N_CLUSTERS, random_state=SEED)
u_labels = kmeans_u.fit_predict(u_data.values)

u_data.insert(loc=len(u_data.columns), column='KMeans_cluster_u', value=u_labels)
u_data = one_hot_encode(u_data, 'KMeans_cluster_u')

# Now construct design matrix

In [38]:
# Both splits are built from the same (cluster-augmented) business and user
# tables with identical options; only the set of reviews differs.
design_options = dict(return_df=True, verbose=True)

X_train, y_train = construct_design_matrix(b_data, u_data, reviews, **design_options)
X_valid, y_valid = construct_design_matrix(b_data, u_data, valid_queries, **design_options)

Constructing design matrix now.
0/150232 done
20000/150232 done
40000/150232 done
60000/150232 done
80000/150232 done
100000/150232 done
120000/150232 done
140000/150232 done
Finished!
Constructing design matrix now.
0/50077 done
20000/50077 done
40000/50077 done
Finished!


# Regression (ridge)

In [42]:
def rmse(y1, y2):
    """Return the root-mean-square error between two sequences of values.

    Both inputs are converted to float NumPy arrays before subtracting, so
    the comparison is always positional (element i against element i).
    Without this, two pandas Series with different indexes would be aligned
    by index label and could silently yield NaN or a wrong error. Plain
    Python lists are accepted as well.

    Parameters
    ----------
    y1, y2 : array-like
        Values to compare (e.g. targets and predictions), in matching order.

    Returns
    -------
    float
        sqrt(mean((y1 - y2) ** 2)).
    """
    residuals = np.asarray(y1, dtype=float) - np.asarray(y2, dtype=float)
    return np.sqrt(np.mean(residuals ** 2))

# Fit a ridge regressor on the training design matrix (fit returns the estimator).
clf = Ridge(alpha=0.1).fit(X_train.values, y_train)

# Predict on both splits with the fitted model.
y_pred_train = clf.predict(X_train.values)
y_pred_valid = clf.predict(X_valid.values)

# Score each split with RMSE.
ypt_err = rmse(y_train, y_pred_train)
ypv_err = rmse(y_valid, y_pred_valid)

# Report training error first, then validation error.
print('%f  <- training RMSE' % ypt_err)
print('%f  <- validation RMSE' % ypv_err)

1.005137  <- training RMSE
1.051989  <- validation RMSE
