In [1]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import dill, lzma

In [2]:
# Load the prepared games table and show its row count plus the first rows.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
df = pd.read_pickle("df.pkl")
print(df.shape[0])
df.head()

23530


Unnamed: 0,year,age,poll_age,num_rated,rating,bayes_rating,owned,attrs,npl_min,npl_max,poll_npl_min,poll_npl_max,dur_min,dur_max
387,2000,10,10.5,337,5.81844,5.5509,697,"[Card Game, Hand Management, Series, Bluffing]",3,7,3,7,3.806662,3.806662
475,2000,12,11.454545,7149,7.27751,6.93109,8569,"[Series, Bluffing, Auction, Drafting, Card Dra...",2,5,3,5,3.806662,4.49981
478,2000,10,9.898396,49176,7.07999,6.97364,65605,"[Tableau Building, Series, Bluffing, Drafting,...",2,8,2,7,2.995732,4.094345
481,2000,12,10.785714,3429,6.81879,6.38725,3909,"[Team-Based Game, Area Majority / Influence, D...",2,4,2,4,4.094345,4.094345
490,2000,12,10.2,262,7.70292,5.65154,340,"[Print & Play, Wargame, Dice, Hexagon Grid, My...",2,6,2,6,4.094345,4.094345


In [3]:
# Sorted vocabulary of every attribute that appears on at least one game.
# The names are gathered straight into a set: summing the column would
# concatenate the per-game lists one by one (quadratic in the number of rows)
# and would yield 0 rather than a list for an empty frame.
all_attrs = sorted({attr for attrs in df['attrs'] for attr in attrs})
len(all_attrs)

407

In [4]:
# Peek at the first few entries of the sorted attribute vocabulary.
all_attrs[:5]

['3-Dimensional (3D)', '4X', 'Abstract Strategy', 'Acting', 'Action Drafting']

In [5]:
class RatingNormalizer(BaseEstimator, TransformerMixin):
    """Standardize each row's ``rating`` against the other rows of its ``year``.

    ``fit`` records the mean and sample standard deviation of ``rating`` for
    every ``year`` in the training frame; ``transform`` returns
    ``(rating - mean_of_year) / std_of_year`` for each row.
    """

    def fit(self, X, y=None):
        """Learn the per-year rating statistics.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``year`` and ``rating`` columns.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        stats = X.groupby('year')['rating'].agg(['mean', 'std'])
        # Mapping year -> [mean, std].
        self.m = {
            year: [mean, std]
            for year, mean, std in stats.itertuples(name=None)
        }
        return self

    def transform(self, X):
        """Return the per-year z-score of ``rating`` for every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``year`` and ``rating`` columns.

        Returns
        -------
        pandas.Series
            Unnamed float series aligned with ``X.index``. Rows whose year
            had an undefined standard deviation at fit time (for example a
            single game in that year) come out as NaN.

        Raises
        ------
        KeyError
            If ``X`` contains a year that was not seen during ``fit``.
        """
        years = X['year']
        # Fail loudly on unseen years instead of letting ``map`` yield NaN.
        unseen = set(years.unique()) - set(self.m)
        if unseen:
            raise KeyError(
                f"no rating statistics fitted for year(s): {sorted(unseen)}")
        # Columns are addressed by label and the arithmetic is vectorized;
        # positional access on a label-indexed row is deprecated in pandas.
        mean = years.map({year: stats[0] for year, stats in self.m.items()})
        std = years.map({year: stats[1] for year, stats in self.m.items()})
        scaled = (X['rating'] - mean) / std
        scaled.name = None
        return scaled

In [6]:
def _attrs_to_indicator_matrix(col):
    """Turn a column of attribute collections into a boolean matrix.

    Entry (i, j) is True exactly when ``all_attrs[j]`` occurs in the i-th
    element of ``col``.
    """
    rows = []
    for attrs in col:
        rows.append([a in attrs for a in all_attrs])
    return np.array(rows, dtype=bool)


# Applies the indicator encoding to the 'attrs' column only.
attrs_encoder = ColumnTransformer([
    ('attrs', FunctionTransformer(_attrs_to_indicator_matrix), 'attrs'),
])

In [7]:
# Game-by-attribute indicator matrix: one row per game, one column per entry
# of all_attrs.
encoded_attrs = attrs_encoder.fit_transform(df)

In [8]:
# Regression target: each game's rating normalized within its release year,
# fitted and applied on the full table, as a NumPy array.
y_all = RatingNormalizer().fit_transform(df).to_numpy()

In [9]:
# Reduce the attribute indicator matrix to 64 principal components.
# NOTE(review): no random_state is set; if the chosen SVD solver is a
# randomized one the components may differ slightly between runs — confirm.
pca = PCA(n_components=64)
pca.fit(encoded_attrs)

PCA(n_components=64)

In [10]:
# Project every game's attribute indicators onto the fitted components.
X_pca = pca.transform(encoded_attrs)

In [11]:
# Scratch check (not used by the rest of the notebook): concatenating an
# integer array with a string array yields an all-string array.
np.concatenate(([[1],[2]],[['a'],['b']]),axis=1)

array([['1', 'a'],
       ['2', 'b']], dtype='<U21')

In [12]:
# Non-attribute features taken directly from the table: poll-based age,
# poll-based min/max player counts, and the two duration columns.
X2 = df[['poll_age','poll_npl_min','poll_npl_max','dur_min','dur_max']].to_numpy()

In [13]:
# Full feature matrix: the direct table columns followed by the PCA
# components of the attribute indicators, side by side.
X = np.concatenate((X2,X_pca),axis=1)
X.shape

(23530, 69)

In [21]:
# Fit the rating model on the whole data set (no hold-out split here).
# random_state pins the bootstrap samples and feature sub-sampling so that
# re-running the notebook reproduces the same forest.
forest = RandomForestRegressor(
    n_estimators=1000, max_depth=12, max_features=24, n_jobs=-1,
    random_state=0)
forest.fit(X, y_all)

RandomForestRegressor(max_depth=12, max_features=24, n_estimators=1000,
                      n_jobs=-1)

In [18]:
# Accumulate raw rating moments (count, sum, sum of squares) per release
# year: once over all games (trend_all) and once per attribute (trends).
ny = 2021 - 2000 + 1
trend_all = np.zeros((ny,3))
# The first axis is indexed by attribute position (a column of
# encoded_attrs), so it needs one slab per attribute rather than per game.
trends = np.zeros((len(all_attrs),ny,3))

for attr_flags,year,rating in zip(encoded_attrs, df['year'], df['rating']):
    # NOTE(review): assumes every year lies in 2000..2021 — TODO confirm.
    # A later year raises IndexError; an earlier one would silently wrap
    # around through negative indexing.
    year_idx = year - 2000
    moments = [1,rating,rating*rating]
    trend_all[year_idx] += moments
    # Add the same moments to every attribute this game carries.
    for attr_idx in np.nonzero(attr_flags)[0]:
        trends[attr_idx,year_idx] += moments

In [22]:
# Bundle the attribute vocabulary, the fitted PCA and forest, and the trend
# accumulators into a single dill file.
# (Compressed alternative: lzma.open('model.dill.xz','wb') instead of open.)
model_bundle = {
    'all_attrs': all_attrs,
    'pca': pca,
    'forest': forest,
    'trend_all': trend_all,
    'trends': trends,
}
with open('model.dill','wb') as out_file:
    dill.dump(model_bundle, out_file)