# Recommendations KMeans Test

## 💡 1. Introduction

## 📚 2. Preparations

### 2.1 Importing libraries 

In [1]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit

### 2.2 Importing datasets

In [2]:
%store -r games ratings

### 2.3 Defining methods for testing

In [3]:
def avg_genre_matrix(ratings):
    # Split each game into multiple rows
    # for each genre they have, this makes
    # it easier to calculate the average score
    genre_ratings = pd.merge(games, ratings)
    genre_ratings = genre_ratings.explode(column='genres')

    # Calcluate average of each review_critic of each genre
    genre_ratings = genre_ratings.groupby(['review_critic', 'genres']).mean()
    genre_ratings = genre_ratings.reset_index().pivot('review_critic', 'genres', 'meta_score')
    return genre_ratings

In [4]:
def resolve_by_impute(genre_ratings):
    if IMPUTATION_RESOLVER == 'drop':
        genre_ratings = genre_ratings.dropna()
    elif IMPUTATION_RESOLVER == 'zeros':
        genre_ratings = genre_ratings.fillna(value=0)
    elif IMPUTATION_RESOLVER == 'neutral':
        genre_ratings = genre_ratings.fillna(value=50)
    elif IMPUTATION_RESOLVER == 'mean':
        genre_ratings = genre_ratings.fillna(genre_ratings.mean())
    return genre_ratings

In [5]:
def standardize(genre_ratings):
    SS = StandardScaler(with_mean=True, with_std=True)
    return SS.fit_transform(genre_ratings)

## ⚙️ 3. Settings

In [6]:
## Amount of clusters that the
# reviewers should be devided into
K_CLUSTERS = 8

## Score reduction strength for games
## with a low amount of reviews in cluster
PENALIZER_STRENGTH = 0

## How to resolve NaN values for 
## critics that haven't played a genre
## possible values: 'drop', 'mean', 'zeros' and 'neutral'
IMPUTATION_RESOLVER = 'neutral'

## Minimum amount of reviews
## that the game must have
MIN_REVIEWS_COUNT = 0

## Minimum amount of reviews that
## the critic must have written
MIN_REVIEWS_WRITTEN = 325

## Minimum amount of reviews in
## for a genre in order to be clustered
MIN_REVIEWS_IN_GENRE = 75

## Train to test ratio
TRAIN_TEST_SPLIT = 0.7

## ✅ 4. Evaluation

### 4.1 Train-Test split

In [7]:
gss = GroupShuffleSplit(n_splits=2, train_size=TRAIN_TEST_SPLIT, random_state=42)
gss

GroupShuffleSplit(n_splits=2, random_state=42, test_size=None, train_size=0.7)

In [8]:
train_idx, test_idx = next(gss.split(ratings, groups=ratings['review_critic']))
train_idx, test_idx

(array([     1,      3,      6, ..., 422470, 422471, 422473], dtype=int64),
 array([     0,      2,      4, ..., 422463, 422469, 422472], dtype=int64))

In [9]:
train = ratings.iloc[train_idx]
train_matrix = resolve_by_impute(avg_genre_matrix(train))

test = ratings.iloc[test_idx]
test_matrix = resolve_by_impute(avg_genre_matrix(test))

### 4.2 Training the model

In [10]:
clusterer = KMeans(n_clusters=K_CLUSTERS, random_state=42)
clusterer.fit(standardize(train_matrix))

KMeans(random_state=42)

### 4.3 Predictions

In [11]:
predictions = clusterer.predict(standardize(test_matrix))
predictions

array([1, 3, 4, 5, 3, 5, 5, 4, 7, 2, 0, 0, 1, 2, 1, 4, 1, 0, 5, 4, 3, 1,
       7, 1, 5, 2, 0, 1, 5, 1, 1, 1, 1, 4, 1, 6, 1, 1, 1, 0, 1, 1, 2, 7,
       3, 0, 7, 2, 2, 1, 5, 2, 1, 2, 0, 1, 4, 7, 7, 1, 7, 1, 0, 3, 1, 4,
       1, 4, 4, 5, 1, 3, 5, 0, 5, 1, 4, 1, 4, 1, 1, 7, 1, 1, 4, 7, 0, 2,
       5, 1, 2, 1, 1, 1, 1, 3, 1, 4, 3, 5, 4, 0, 2, 1, 0, 2, 1, 2, 2, 5,
       4, 7, 5, 4, 2, 2, 4, 0, 5, 1, 1, 5, 5, 4, 1, 5, 0, 0, 5, 2, 2, 1,
       5, 5, 2, 4, 2, 5, 4, 1, 4, 7, 1, 0, 4, 0, 0, 1, 0, 1])

In [12]:
test_matrix['cluster'] = predictions
test_matrix = test_matrix.reset_index()

test = test.merge(test_matrix[['review_critic', 'cluster']])
test = test[['title', 'platform', 'review_critic', 'cluster', 'meta_score']]
test.sample(n=20, random_state=42)

Unnamed: 0,title,platform,review_critic,cluster,meta_score
64103,Disney/Pixar Toy Story 3,Xbox 360,MS Xbox World,1,85.0
76235,Dragon Quest Swords: The Masked Queen and the ...,Wii,Play Magazine,0,70.0
115566,Xenoblade Chronicles: Definitive Edition,Switch,Press Start Australia,2,80.0
98719,Ori and the Will of the Wisps,Xbox One,EGM,1,80.0
55762,Guitar Hero: On Tour,DS,PGNx Media,0,90.0
811,James Bond 007: Everything or Nothing,PlayStation 2,Cincinnati Enquirer,3,90.0
75836,Shrek 2,Xbox,Play Magazine,0,75.0
29361,Lost in Blue 2,DS,1UP,1,50.0
67707,Tron 2.0: Killer App,Xbox,Armchair Empire,0,70.0
126199,Unreal Championship 2: The Liandri Conflict,Xbox,3DAvenue,3,90.0


### 4.3 Calcualte recommendation scores

In [13]:
recommendations = pd.DataFrame({})
for idx, cluster in test.groupby(['cluster']):
    c_recommendations = cluster.groupby(['platform', 'title']).mean().reset_index()
    c_recommendations = c_recommendations.rename(columns={'meta_score':'recommendation_score'})
    recommendations = pd.concat([recommendations, c_recommendations])

test = pd.merge(test, recommendations)
test.sample(n=10, random_state=42)

Unnamed: 0,title,platform,review_critic,cluster,meta_score,recommendation_score
64103,The Escapists 2,Switch,Digitally Downloaded,1,80.0,80.0
76235,Persona 3: Dancing in Moonlight,PlayStation 4,TheSixthAxis,1,80.0,79.166667
115566,"Pokemon: Let's Go, Eevee!",Switch,Press Start Australia,2,90.0,80.8
98719,World of Tanks: Xbox 360 Edition,Xbox 360,Official Xbox Magazine UK,1,70.0,72.0
55762,Steel Battalion: Heavy Armor,Xbox 360,XboxAddict,0,10.0,19.2
811,AeroWings 2: Air Strike,Dreamcast,Hot Games,3,80.0,84.0
75836,Sonic the Hedgehog 4: Episode I,PlayStation 3,Playstation Official Magazine Australia,1,80.0,75.0
29361,SingStar '80s,PlayStation 2,DarkStation,0,80.0,81.25
67707,Victor Vran: Overkill Edition,Xbox One,XboxAddict,0,85.0,85.0
126199,Destruction AllStars,PlayStation 5,Digital Trends,4,40.0,40.0


### 4.4 Mean absolute error

In [18]:
MSE = mean_squared_error(test['meta_score'], test['recommendation_score'])
MAE = mean_absolute_error(test['meta_score'], test['recommendation_score'])

print(f"Mean Squared Error: {MSE}")
print(f"Mean Absolute Error: {MAE}")

Mean Squared Error: 63.90485165454544
Mean Absolute Error: 5.508622348074342
