## 03 - Modelling

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split,  StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Patch pandas with tqdm's progress-bar variants of apply/map (e.g. progress_apply).
tqdm.pandas()

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

In [3]:
from eli5 import transform_feature_names

In [5]:
from lib.transformers import RunningTimeImputer, ColumnRemover
from lib.processing import save_to_pkl, load_from_pkl

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

In [8]:
# Load the dataset stored under the name 'combined' via the project's pickle helper.
# NOTE(review): presumably a DataFrame written by an earlier notebook — confirm.
df = load_from_pkl('combined')

In [9]:
# Index the frame by release_id (rebinding rather than mutating in place).
df = df.set_index('release_id')

## Preparing for Machine Learning

In [10]:
# Split features and target (market_value) into train and test sets using the
# default test size. The seed is fixed so that the split, and therefore every
# score reported below, is reproducible across kernel restarts.
X_tr, X_te, y_tr, y_te = train_test_split(
    df.drop('market_value', axis=1),
    df.market_value,
    random_state=0,
)

In [11]:
# Drop the reference to the full frame now that the train/test splits exist.
del df

Of the above columns, we will only handle ``running_time``, ``average_rating`` and ``units_for_sale``. The rest do not need to be handled for the purposes of the models we plan on building.

In [12]:
# Column-wise preprocessing for the "full information" feature set:
#   * missing units_for_sale is filled with 0,
#   * missing average_rating is filled with the column mean,
#   * year is one-hot encoded,
#   * every other column is passed through unchanged.
full_information_transformer = ColumnTransformer(
    transformers=[
        ('units_for_sale_imputer', SimpleImputer(strategy='constant', fill_value=0), ['units_for_sale']),
        ('average_rating_imputer', SimpleImputer(strategy='mean'), ['average_rating']),
        # handle_unknown='ignore': a year that was absent when the encoder was
        # fitted is encoded as all zeros instead of raising at transform time.
        ('year_encoder', OneHotEncoder(dtype=np.uint8, handle_unknown='ignore'), ['year']),
    ],
    remainder='passthrough',
)

In [13]:
# Column-wise preprocessing for the "record store" feature set: one-hot encode
# year and pass every other column through unchanged.
record_store_transformer = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': the encoder is fitted on the training split
        # only, so a year that appears solely in held-out data must not make
        # transform()/predict() raise; it is encoded as all zeros instead.
        ('year_encoder', OneHotEncoder(dtype=np.uint8, handle_unknown='ignore'), ['year']),
    ],
    remainder='passthrough',
)

In [14]:
# Columns handed to ColumnRemover in the record-store pipelines, so they never
# reach the downstream transformer or model. Order matches the original list.
record_store_ridge_removal_columns = [
    'market_price',
    'units_for_sale',
    'have',
    'want',
    'average_rating',
    'rating_count',
    'last_sold',
    'lowest',
    'median',
    'highest',
    'track_titles',
    'country',
    'genre',
    'style',
    'label',
    'community_have',
    'community_want',
    'formats',
    'master_id',
    'thumb_url',
    'release_url',
    'artist',
    'title',
    'format_description',
    'format_text_clean',
    'format_text',
    'no_of_days_since_last_sale',
]

In [None]:
# Regularisation strengths to search over. The grid is handed to a GridSearchCV
# that wraps SGDRegressor directly (not a Pipeline), so the key must be the
# estimator's own parameter name `alpha`; a `ridge__alpha` key would be
# rejected by set_params as an invalid parameter.
record_store_ridge_param_grid = dict(alpha=np.linspace(900, 1000, 10))

In [None]:
# Linear model pipeline: impute running time, leave-one-out encode artist and
# label, drop the unused columns, one-hot encode year, standardise, then run a
# 5-fold grid search over an SGD-trained regressor with early stopping.
record_store_ridge_pipe = Pipeline(steps=[
    (
        'running_time_imputer',
        RunningTimeImputer('running_time', 'number_of_tracks'),
    ),
    (
        'leave_one_out_encoding',
        LeaveOneOutEncoder(cols=['artist_clean', 'label_clean']),
    ),
    (
        'record_store_column_remover',
        ColumnRemover(record_store_ridge_removal_columns),
    ),
    ('preprocessing', record_store_transformer),
    ('scaler', StandardScaler()),
    (
        'ridge',
        GridSearchCV(
            estimator=SGDRegressor(
                max_iter=1000,
                tol=0.001,
                early_stopping=True,
                n_iter_no_change=5,
                verbose=1,
            ),
            param_grid=record_store_ridge_param_grid,
            cv=5,
        ),
    ),
])

In [None]:
# Train on the log of the target, so the model's predictions are log-values.
record_store_ridge_pipe.fit(X_tr, y=np.log(y_tr))

In [None]:
# The fitted GridSearchCV lives inside the pipeline as the 'ridge' step; no
# standalone `record_store_ridge_grid_search` variable is defined anywhere in
# this notebook, so read the result from the pipeline itself.
record_store_ridge_pipe.named_steps['ridge'].best_params_

In [None]:
# Mean cross-validated score of the best candidate, read from the fitted
# GridSearchCV held in the pipeline's 'ridge' step (there is no standalone
# `record_store_ridge_grid_search` variable in this notebook).
record_store_ridge_pipe.named_steps['ridge'].best_score_

In [None]:
# Predict through the whole pipeline so the raw test frame receives the same
# imputation, encoding, column removal and scaling as the training data.
# The pipeline was fitted on log(y), so these predictions are log-values.
ridge_pred = record_store_ridge_pipe.predict(X_te)

In [None]:
# The pipeline was fitted on log(market_value), so `ridge_pred` is already on
# the log scale. Score it against log(y_te) directly; exponentiating the
# prediction while logging the truth would compare two different scales.
r2_score(np.log(y_te), ridge_pred)

In [None]:
# Predictions are log-values: back-transform them with exp and compare against
# the raw target so both sides share a scale and the error is expressed in
# market_value units.
mean_absolute_error(y_te, np.exp(ridge_pred))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
# Forest sizes to try: 50, 100, ..., 300 trees. This grid is used by both
# tree-ensemble grid searches in this notebook.
random_forest_param_grid = dict(
    n_estimators=tuple(range(50, 350, 50)),
)

In [None]:
# Random forest pipeline: same preprocessing steps as the linear model,
# followed by a 5-fold grid search over the number of trees.
# NOTE(review): newer scikit-learn releases spell criterion='mae' as
# 'absolute_error' — confirm against the installed version.
random_forest_pipe = Pipeline(steps=[
    (
        'running_time_imputer',
        RunningTimeImputer('running_time', 'number_of_tracks'),
    ),
    (
        'leave_one_out_encoding',
        LeaveOneOutEncoder(cols=['artist_clean', 'label_clean']),
    ),
    (
        'record_store_column_remover',
        ColumnRemover(record_store_ridge_removal_columns),
    ),
    ('preprocessing', record_store_transformer),
    ('scaler', StandardScaler()),
    (
        'random_forest',
        GridSearchCV(
            estimator=RandomForestRegressor(
                criterion='mae',
                n_jobs=-1,
                random_state=0,
                verbose=50,
            ),
            param_grid=random_forest_param_grid,
            cv=5,
            verbose=50,
        ),
    ),
])

In [None]:
# Train on the log of the target, as with the other models in this notebook.
random_forest_pipe.fit(X_tr, y=np.log(y_tr))

## Extremely Randomized Trees

In [15]:
from sklearn.ensemble import ExtraTreesRegressor

In [18]:
# Extremely randomized trees pipeline: same preprocessing steps as the random
# forest, with a grid search over the number of trees using GridSearchCV's
# default cross-validation. The final step keeps the name 'random_forest'
# even though it wraps ExtraTreesRegressor.
extra_trees_pipe = Pipeline(steps=[
    (
        'running_time_imputer',
        RunningTimeImputer('running_time', 'number_of_tracks'),
    ),
    (
        'leave_one_out_encoding',
        LeaveOneOutEncoder(cols=['artist_clean', 'label_clean']),
    ),
    (
        'record_store_column_remover',
        ColumnRemover(record_store_ridge_removal_columns),
    ),
    ('preprocessing', record_store_transformer),
    ('scaler', StandardScaler()),
    (
        'random_forest',
        GridSearchCV(
            estimator=ExtraTreesRegressor(
                n_jobs=4,
                random_state=0,
                verbose=50,
            ),
            param_grid=random_forest_param_grid,
            verbose=50,
        ),
    ),
])

In [None]:
# Train on the log of the target, as with the other models in this notebook.
extra_trees_pipe.fit(X_tr, y=np.log(y_tr))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] n_estimators=50 .................................................
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  4.0min
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:  4.0min
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:  4.1min
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:  4.1min
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  7.3min
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:  7.3min
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:  7.3min
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:  7.4min
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed: 10.8min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed: 10.8min
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed: 10.8min
[Parallel(n_jobs=4)]: Done  12 tasks      

## Catboost

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Columns handed to ColumnRemover in the CatBoost pipeline. Unlike the list
# used by the ridge and tree pipelines, this tuple does not contain country,
# genre, style, label or format_text, so those columns stay in the data.
record_store_catboost_removal_columns = (
    'market_price',
    'units_for_sale',
    'have',
    'want',
    'average_rating',
    'rating_count',
    'last_sold',
    'lowest',
    'median',
    'highest',
    'track_titles',
    'community_have',
    'community_want',
    'formats',
    'master_id',
    'thumb_url',
    'release_url',
    'artist',
    'title',
    'format_description',
    'format_text_clean',
    'no_of_days_since_last_sale',
)


In [None]:
# Hyper-parameter grid for the CatBoost search: tree depth, learning rate
# (five log-spaced values from 1e-4 to 1e-1), L2 leaf regularisation and the
# number of boosting iterations.
catboost_param_grid = dict(
    depth=[4, 7, 10],
    learning_rate=np.logspace(-4, -1, 5),
    l2_leaf_reg=[1, 4, 9],
    iterations=[100, 300, 500],
)

In [None]:
# CatBoost pipeline: impute running time, leave-one-out encode artist and
# label, drop the unused columns, then grid-search a CatBoostRegressor.
#
# There is deliberately no StandardScaler step. The regressor selects its
# categorical features by column name ('year', 'format_text'); a scaler in
# front of it would return a bare array without column names and would try to
# scale the categorical columns numerically, so the search could not be fitted.
# NOTE(review): this assumes ColumnRemover returns a DataFrame that keeps its
# column names — confirm. Also note that country, genre, style and label are
# not removed for this pipeline and are not listed in cat_features — confirm
# they are in a form CatBoost accepts.
catboost_pipe = Pipeline([
    ('running_time_imputer', RunningTimeImputer('running_time', 'number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist_clean', 'label_clean'])),
    ('record_store_column_remover', ColumnRemover(record_store_catboost_removal_columns)),
    ('grid_search', GridSearchCV(
        CatBoostRegressor(
            random_state=0,
            cat_features=['year', 'format_text'],
        ),
        param_grid=catboost_param_grid,
        verbose=5,
    )),
])

In [None]:
# Train on the log of the target, as with the other models in this notebook.
catboost_pipe.fit(X_tr, y=np.log(y_tr))