## 03 - Modelling

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
tqdm.pandas()

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

In [3]:
from eli5 import transform_feature_names

In [4]:
from lib.processing import save_to_pkl, load_from_pkl

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

In [7]:
df = load_from_pkl('combined')

In [8]:
df.set_index('release_id',inplace=True)

## Preparing for Machine Learning

In [9]:
X_tr, X_te, y_tr, y_te = train_test_split(df.drop('market_value',axis=1),df.market_value)

In [10]:
class RunningTimeImputer(BaseEstimator, TransformerMixin):
    def __init__(self,running_time, number_of_tracks):
        self.running_time = running_time
        self.number_of_tracks = number_of_tracks
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        
        if not hasattr(self,'average_time_per_track'):
            self.average_time_per_track = X.loc[:,self.running_time].mean() / X.loc[:,self.number_of_tracks].mean()
            
        null_indices = X[X.loc[:,self.running_time].isna()].index
        
        X.loc[null_indices,self.running_time] = X.loc[null_indices,self.number_of_tracks] * self.average_time_per_track
        
        return X
        

In [11]:
class ColumnRemover(BaseEstimator,TransformerMixin):
    def __init__(self,cols_to_remove):
        self.cols_to_remove = cols_to_remove
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        
        if type(self.cols_to_remove) == tuple:
            self.cols_to_remove = list(self.cols_to_remove)
        elif type(self.cols_to_remove) != list:
            raise TypeError
            
        return X.drop(list(self.cols_to_remove),axis=1)

In [12]:
X_tr, X_te, y_tr, y_te = train_test_split(df.drop('market_value',axis=1),df.market_value)

Of the above columns, we will only handl ``running_time``, ``average_rating`` and ``units_for_sale``. The rest will not be necessary to handle for the purposes of the models we plan on building

In [13]:
full_information_transformer = ColumnTransformer(transformers=[
    ('units_for_sale_imputer', SimpleImputer(strategy='constant',fill_value=0),['units_for_sale']),
    ('average_rating_imputer', SimpleImputer(strategy='mean'),['average_rating']),
    ('year_encoder', OneHotEncoder(dtype=np.uint8), ['year'])
], remainder='passthrough')

In [14]:
record_store_transformer = ColumnTransformer(transformers=[
    ('year_encoder', OneHotEncoder(dtype=np.uint8), ['year'])
], remainder='passthrough')

In [83]:
record_store_ridge_removal_columns = [
    'market_price','units_for_sale','have','want','average_rating','rating_count','last_sold','lowest','median',
    'highest','track_titles','country','genre','style','label','community_have','community_want','formats','master_id','thumb_url',
    'release_url','artist','title','format_description','format_text_clean','format_text', 'no_of_days_since_last_sale'
    ]

In [137]:
record_store_ridge_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist_clean','label_clean'])),
    ('record_store_column_remover', ColumnRemover(record_store_ridge_removal_columns)),
    ('preprocessing',record_store_transformer),
    ('scaler', StandardScaler()),
    ('ridge', SGDRegressor(early_stopping=True,verbose=50,max_iter=100,alpha=10))
])

In [17]:
record_store_ridge_param_grid = dict(ridge__alpha=np.logspace(0,1,2))

In [18]:
record_store_ridge_grid_search = GridSearchCV(record_store_ridge_pipe, param_grid=record_store_ridge_param_grid, cv=3, n_jobs=4, verbose=50, scoring= 'accuracy')

In [23]:
from catboost import CatBoostClassifier

In [53]:
df['format_description'].apply(lambda x: 'üü'.join(x) if type(x)==list else x).str.split('üü',expand=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,295878,198248,101979,31192,6388,1237,258,46,2,1
unique,64,70,61,51,37,23,13,6,2,1
top,Album,Album,Reissue,Remastered,Stereo,Stereo,Stereo,Stereo,Remastered,Mono
freq,154155,135398,27577,8106,1730,390,93,25,1,1


## CatBoost Encoder

In [73]:
class ListEncoder(BaseEstimator,TransformerMixin):
    def __init__(self,features):
        self.features = features
    
    def fit(self, X, y=None):
        return self

    def split_list(self, X, feature):
        X_new = X.loc[:,feature].apply(lambda x: 'üü'.join(x) if type(x)==list else x).str.split('üü',expand=True)
        return X_new.rename(columns = {col: '_'.join([feature,str(col)]) for col in X_new.columns})
        
    def transform(self, X, y=None):
        X = X.copy()
        
        if type(self.features) == str:
            X = pd.concat([X, self.split_list(X,self.features)],axis=1)
        elif type(self.features) == list or type(self.features) == tuple:
            for feature in self.features:
                X = pd.concat([X, self.split_list(X,feature)],axis=1)
        else:
            raise TypeError
            
        return X

In [74]:
test = ListEncoder(features='format_description')

In [75]:
test.fit_transform(df)

Unnamed: 0_level_0,market_price,units_for_sale,have,want,average_rating,rating_count,last_sold,number_of_tracks,running_time,lowest,...,format_description_0,format_description_1,format_description_2,format_description_3,format_description_4,format_description_5,format_description_6,format_description_7,format_description_8,format_description_9
release_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10550056,3.99,3.0,4.0,4.0,4.00,1.0,NaT,12.0,,,...,"10""",78 RPM,Album,,,,,,,
6910984,7.21,3.0,4.0,2.0,,0.0,2019-03-04,8.0,,5.99,...,"10""",78 RPM,Album,Reissue,,,,,,
8592628,7.75,5.0,17.0,0.0,5.00,2.0,NaT,8.0,,,...,"10""",78 RPM,Album,Compilation,Repress,,,,,
8684706,125.00,1.0,1.0,7.0,,0.0,NaT,8.0,,,...,"10""",78 RPM,Album,,,,,,,
8049748,5.00,2.0,9.0,8.0,,0.0,2016-12-04,12.0,,20.00,...,"10""",78 RPM,Album,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7340022,11.85,10.0,182.0,89.0,4.42,19.0,2019-12-29,5.0,,10.13,...,LP,Album,Reissue,,,,,,,
12058618,5.61,5.0,2.0,2.0,4.00,1.0,NaT,24.0,,,...,Album,,,,,,,,,
11534331,1.32,3.0,13.0,1.0,,0.0,NaT,10.0,75.383333,,...,Album,,,,,,,,,
1572860,17.97,9.0,42.0,36.0,4.50,6.0,2019-03-28,6.0,72.200000,19.10,...,LP,Album,Reissue,,,,,,,


In [None]:
record_store_catboost_removal_columns = (
    'market_price','units_for_sale','have','want','average_rating','rating_count','last_sold','lowest','median',
    'highest','track_titles','community_have','community_want','formats','master_id','thumb_url',
    'release_url','artist','title','format_description','format_text_clean','format_text', 'no_of_days_since_last_sale'
)


In [22]:
record_store_catboost_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('record_store_column_remover', ColumnRemover(record_store_catboost_removal_columns)),
    ('list_encoder', ListEncoder(('format_description','genre','style','label')))
])

Unnamed: 0_level_0,market_price,units_for_sale,have,want,average_rating,rating_count,last_sold,number_of_tracks,running_time,lowest,...,embedding_10d_0,embedding_10d_1,embedding_10d_2,embedding_10d_3,embedding_10d_4,embedding_10d_5,embedding_10d_6,embedding_10d_7,embedding_10d_8,embedding_10d_9
release_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10550056,3.99,3.0,4.0,4.0,4.00,1.0,NaT,12.0,,,...,3.718969,-1.964473,-2.259114,-2.344917,-2.380853,5.133696,-0.715697,2.342036,-5.762625,2.028794
6910984,7.21,3.0,4.0,2.0,,0.0,2019-03-04,8.0,,5.99,...,-0.992168,-7.247248,0.823306,2.923478,-2.794080,-1.135621,-1.719237,-0.410763,0.850969,1.286332
8592628,7.75,5.0,17.0,0.0,5.00,2.0,NaT,8.0,,,...,3.283950,0.601771,-2.925583,-3.262571,-1.369797,4.440640,0.719848,2.997173,-3.470886,2.611470
8684706,125.00,1.0,1.0,7.0,,0.0,NaT,8.0,,,...,2.821412,-8.466662,-3.841349,-5.937275,1.105268,-0.193035,1.767958,5.100076,6.339747,3.243169
8049748,5.00,2.0,9.0,8.0,,0.0,2016-12-04,12.0,,20.00,...,2.877477,-6.993788,-0.021584,-5.529807,2.048635,3.204210,-3.131353,-0.075423,5.698018,3.179359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7340022,11.85,10.0,182.0,89.0,4.42,19.0,2019-12-29,5.0,,10.13,...,0.799063,-1.963545,-2.175674,3.497314,-0.511712,1.404097,-1.556072,3.513657,0.107563,1.094370
12058618,5.61,5.0,2.0,2.0,4.00,1.0,NaT,24.0,,,...,-3.117285,-2.551610,5.853477,-3.608870,0.272484,-2.313371,4.657339,-2.883637,0.543592,-1.115532
11534331,1.32,3.0,13.0,1.0,,0.0,NaT,10.0,75.383333,,...,-2.009118,6.059659,-0.800675,-4.240083,3.450454,-0.978029,-0.811130,-5.342619,-2.024996,3.187478
1572860,17.97,9.0,42.0,36.0,4.50,6.0,2019-03-28,6.0,72.200000,19.10,...,1.013433,2.396143,-1.874267,1.573853,1.168921,5.135166,2.070899,4.141485,-3.840093,0.311023


In [98]:
test_pipe = record_store_ridge_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist_clean','label_clean'])),
    ('record_store_column_remover', ColumnRemover(record_store_ridge_removal_columns)),
    ('preprocessing',record_store_transformer),
    ('scaler', StandardScaler()),
    #('catboost', CatBoostRegressor(random_state=0))
])

In [99]:
test_data_tr = test_pipe.fit_transform(X_tr,np.log(y_tr))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [106]:
test_boost = CatBoostRegressor(verbose=10
)

In [107]:
test_boost.fit(test_data,np.log(y_tr))

Learning rate set to 0.102104
0:	learn: 0.8203076	total: 628ms	remaining: 10m 27s
10:	learn: 0.4395077	total: 5.87s	remaining: 8m 47s
20:	learn: 0.3644375	total: 10.6s	remaining: 8m 12s
30:	learn: 0.3515283	total: 15.5s	remaining: 8m 3s
40:	learn: 0.3477462	total: 20.6s	remaining: 8m 1s
50:	learn: 0.3452666	total: 24.7s	remaining: 7m 40s
60:	learn: 0.3437611	total: 28.9s	remaining: 7m 24s
70:	learn: 0.3425323	total: 33s	remaining: 7m 11s
80:	learn: 0.3418798	total: 37.2s	remaining: 7m 1s
90:	learn: 0.3413715	total: 41.5s	remaining: 6m 54s
100:	learn: 0.3404604	total: 46.4s	remaining: 6m 52s
110:	learn: 0.3400255	total: 50.7s	remaining: 6m 46s
120:	learn: 0.3390813	total: 55s	remaining: 6m 39s
130:	learn: 0.3386875	total: 59.1s	remaining: 6m 32s
140:	learn: 0.3371862	total: 1m 3s	remaining: 6m 26s
150:	learn: 0.3365998	total: 1m 7s	remaining: 6m 18s
160:	learn: 0.3357354	total: 1m 11s	remaining: 6m 13s
170:	learn: 0.3348524	total: 1m 15s	remaining: 6m 6s
180:	learn: 0.3343941	total: 1m 

<catboost.core.CatBoostRegressor at 0x16fa4b5c0>

In [121]:
test_data_te = test_pipe.transform(X_te)

In [122]:
test_boost.score(test_data_te,np.log(y_te))

0.8555989815763626