In [16]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns

import dill

In [2]:
# Load the cleaned game table; the JSON file uses pandas' 'split' layout.
df = pd.read_json(
    'bgg_data_clean.json',
    orient='split',
)
# Preview the first five rows.
df.head()

Unnamed: 0,year,age,num_players,duration,mechanics,categories,families,num_expansions,num_honors,num_episodes,poll_age,poll_numpl,num_rated,num_comments,rating,bayes_rating
387,2000,10,"[3, 7]","[45.0, 45.0, 45.0]",[Hand Management],"[Bluffing, Card Game]",[Series: Blue Games (Descartes)],0,0,0,"[[2, 0], [3, 0], [4, 0], [5, 0], [6, 0], [8, 0...","[[1, [0, 0, 4]], [2, [0, 0, 4]], [3, [0, 6, 0]...",337,133,5.81844,5.5509
475,2000,12,"[2, 5]","[90.0, 45.0, 90.0]","[Auction, Card Drafting, Hand Management, Netw...","[Bluffing, Political]","[Cities: Agra (India), Country: India, Series:...",0,5,5,"[[2, 0], [3, 0], [4, 0], [5, 0], [6, 0], [8, 1...","[[1, [0, 0, 84]], [2, [0, 14, 82]], [3, [13, 6...",7149,2231,7.27751,6.93109
478,2000,10,"[2, 8]","[60.0, 20.0, 60.0]","[Action Drafting, Card Drafting, Drafting, Los...","[Bluffing, Card Game, City Building, Deduction...","[Mechanism: Tableau Building, Series: Fantasy ...",1,7,17,"[[2, 0], [3, 0], [4, 0], [5, 0], [6, 14], [8, ...","[[1, [1, 4, 612]], [2, [179, 313, 423]], [3, [...",49176,9840,7.07999,6.97364
481,2000,12,"[2, 4]","[60.0, 60.0, 60.0]","[Area Majority / Influence, Dice Rolling, Hand...","[Abstract Strategy, Medieval, Territory Building]",[Digital Implementations: Yucata],0,1,3,"[[2, 0], [3, 0], [4, 0], [5, 1], [6, 0], [8, 0...","[[1, [0, 0, 54]], [2, [35, 54, 7]], [3, [58, 3...",3429,1094,6.81879,6.38725
490,2000,12,"[2, 6]","[60.0, 60.0, 60.0]","[Dice Rolling, Hexagon Grid]","[Fantasy, Mythology, Print & Play, Wargame]",[],0,1,0,"[[2, 0], [3, 0], [4, 0], [5, 0], [6, 1], [8, 4...","[[1, [0, 2, 2]], [2, [8, 3, 0]], [3, [1, 6, 3]...",262,201,7.70292,5.65154


In [3]:
class DictEncoder(BaseEstimator, TransformerMixin):
    """Stateless transformer turning label collections into indicator dicts.

    Each element of the input is expected to be an iterable of hashable
    labels; it is converted to a ``{label: 1}`` mapping, the record format
    consumed by the `DictVectorizer` step that follows it in
    `binary_encoder`.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Lazily yield one ``{label: 1}`` dict per element of `X`.

        The result is a one-shot generator, so it can be iterated only once.
        """
        for labels in X:
            # Duplicate labels collapse into a single key with value 1.
            yield dict.fromkeys(labels, 1)

In [4]:
class RatingNormalizer(BaseEstimator, TransformerMixin):
    """Standardise `rating` within each `year` (a per-year z-score).

    `fit` learns the mean and standard deviation of `rating` for every
    `year` present in the input frame; `transform` returns
    ``(rating - mean_of_year) / std_of_year`` for each row.
    """

    def fit(self, X, y=None):
        """Learn the per-year rating statistics.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``'year'`` and ``'rating'``.
        y : ignored

        Returns
        -------
        self
        """
        stats = X.groupby('year')['rating'].agg(['mean', 'std'])
        # Stored as {year: [mean, std]} -- the layout `self.m` has always had.
        self.m = {
            year: [mean, std]
            for year, mean, std in zip(stats.index, stats['mean'], stats['std'])
        }
        return self

    def transform(self, X):
        """Return the per-year standardised rating for every row of `X`.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``'year'`` and ``'rating'``.

        Returns
        -------
        pandas.Series
            Float series aligned with ``X.index``. A value is NaN/inf when
            the year's standard deviation learned in `fit` is NaN or zero.

        Raises
        ------
        KeyError
            If `X` contains a year that was not seen during `fit`.
        """
        years = X['year']

        # Fail loudly on unknown years (as the dict lookup always did)
        # instead of letting `map` turn them into silent NaNs.
        unseen = set(years.unique()) - self.m.keys()
        if unseen:
            raise KeyError(
                f"year(s) not seen during fit: {sorted(map(str, unseen))}")

        means = {year: stats[0] for year, stats in self.m.items()}
        stds = {year: stats[1] for year, stats in self.m.items()}

        # Vectorised label-based lookup: avoids the row-wise `apply` and the
        # deprecated positional indexing (`row[0]`, `row[1]`) of a
        # label-indexed Series.
        return (X['rating'] - years.map(means)) / years.map(stds)

In [5]:
# Multi-hot encoder for one column of label lists:
# labels -> {label: 1} dicts -> dense indicator matrix.
binary_encoder = Pipeline(steps=[
    ('encode', DictEncoder()),
    ('vectorize', DictVectorizer(sparse=False)),
])

# Encode 'mechanics' and 'categories' separately; ColumnTransformer places
# the two resulting blocks side by side, in the order listed here.
attr_encoder = ColumnTransformer(transformers=[
    ('mechanics', binary_encoder, 'mechanics'),
    ('categories', binary_encoder, 'categories'),
])

# Feature pipeline; its single step is addressed later as named_steps['attrs'].
X_tr = Pipeline(steps=[('attrs', attr_encoder)])

In [6]:
# Fit the feature pipeline on the full frame; this is where each
# DictVectorizer learns the `vocabulary_` that is read back further down.
X_tr.fit(df)

Pipeline(steps=[('attrs',
                 ColumnTransformer(transformers=[('mechanics',
                                                  Pipeline(steps=[('encode',
                                                                   DictEncoder()),
                                                                  ('vectorize',
                                                                   DictVectorizer(sparse=False))]),
                                                  'mechanics'),
                                                 ('categories',
                                                  Pipeline(steps=[('encode',
                                                                   DictEncoder()),
                                                                  ('vectorize',
                                                                   DictVectorizer(sparse=False))]),
                                                  'categories')]))])

In [25]:
# {transformer name: {label: column index within that transformer's block}}.
# The fitted ColumnTransformer may append a synthetic 'remainder' entry to
# `transformers_`; skip it by name rather than by slicing off the last item,
# so a real transformer is never dropped when no remainder is present.
attr_maps = {
    name: encoder.named_steps['vectorize'].vocabulary_
    for name, encoder, _columns in X_tr.named_steps['attrs'].transformers_
    if name != 'remainder'
}

In [27]:
# Flatten the per-column vocabularies into one map
# '<label> (<first letter of column name>)' -> column index in the
# concatenated feature matrix produced by the ColumnTransformer.
attr_maps_app = {}
# Running column offset. It must live OUTSIDE the loop: resetting it on
# every iteration left all blocks starting at 0, so indices of the second
# block collided with those of the first.
offset = 0
for name, vocab in attr_maps.items():
    tag = name[0]  # 'm' for mechanics, 'c' for categories
    for attr, idx in vocab.items():
        attr_maps_app[f'{attr} ({tag})'] = idx + offset
    # The next block starts right after this block's columns.
    offset += len(vocab)

In [8]:
# Encode every row of the frame with the fitted feature pipeline
# (the 'mechanics' and 'categories' encodings, concatenated).
X_all = X_tr.transform(df)

In [12]:
# Target: `rating` standardised per `year`, as a NumPy array.
# NOTE(review): the normaliser is fitted on the full frame, i.e. before the
# train/test split, so test rows contribute to the per-year mean/std.
y_all = RatingNormalizer().fit_transform(df).to_numpy()

In [9]:
# Reduce the encoded attribute matrix to 64 principal components.
# The seed pins the result whenever scikit-learn selects a randomised SVD
# solver, so re-running the notebook reproduces the same components.
pca = PCA(n_components=64, random_state=123)
pca.fit(X_all)

PCA(n_components=64)

In [10]:
# Project the encoded features onto the fitted principal components.
X_pca = pca.transform(X_all)

In [13]:
# Hold out 35% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y_all, test_size=0.35, random_state=123)

In [14]:
# Random forest on the PCA features. `random_state` fixes the bootstrap
# samples and per-split feature draws, so the fitted model (and the score
# reported for it) is reproducible across runs.
forest = RandomForestRegressor(
    n_estimators=1000,
    max_depth=8,
    max_features=10,
    n_jobs=-1,
    random_state=123,
)
forest.fit(X_train, y_train)

RandomForestRegressor(max_depth=8, max_features=10, n_estimators=1000,
                      n_jobs=-1)

In [15]:
# Score the fitted forest on the held-out split
# (for scikit-learn regressors, `score` is the R^2 coefficient).
forest.score(X_test, y_test)

0.4301363458161235

In [22]:
import lzma

In [29]:
# Persist the feature-index map, the fitted PCA and the fitted forest
# together in a single xz-compressed dill file.
model_bundle = {
    'attr_map': attr_maps_app,
    'pca': pca,
    'forest': forest,
}
with lzma.open('model.dill.xz', 'wb') as f:
    dill.dump(model_bundle, f)