## 03 - Modelling

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split,  StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
tqdm.pandas()

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

In [3]:
from eli5 import transform_feature_names

In [4]:
from lib.processing import save_to_pkl, load_from_pkl, RunningTimeImputer, ColumnRemover

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

In [7]:
# Load the object saved under the name 'combined' with the project's pickle helper.
# NOTE(review): presumably a pandas DataFrame holding a 'release_id' column and the
# 'market_value' target, since the cells below index and split it that way -- confirm.
df = load_from_pkl('combined')

In [8]:
# Index the frame by release_id.
# Reassigning (instead of inplace=True) and guarding on the current index name keeps this
# cell safe to re-run: a second set_index call would raise KeyError because the column has
# already been moved into the index.
if df.index.name != 'release_id':
    df = df.set_index('release_id')

## Preparing for Machine Learning

In [9]:
# Hold out 20% of the rows as the test set; market_value is the target and every other
# column is kept as a feature. The fixed seed makes the split repeatable.
X_tr, X_te, y_tr, y_te = train_test_split(
    df.drop(columns='market_value'),
    df['market_value'],
    test_size=0.2,
    random_state=0,
)

In [10]:
# Carve a validation set out of the remaining training rows: 25% of the 80% left after the
# test split, i.e. 20% of the full data (60/20/20 overall).
# Caution: this overwrites X_tr / y_tr, so running the cell twice shrinks the training set again.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_tr,
    y_tr,
    test_size=0.25,
    random_state=0,
)

Of the above columns, we will only handle ``running_time``, ``average_rating`` and ``units_for_sale``. The rest will not be necessary to handle for the purposes of the models we plan on building.

In [11]:
# Column-wise preprocessing for models that use the full set of columns:
#   * units_for_sale  -> missing values filled with the constant 0
#   * average_rating  -> missing values filled with the column mean learned at fit time
#   * year            -> one-hot encoded as uint8 indicator columns
# All remaining columns are passed through unchanged.
full_information_transformer = ColumnTransformer(
    transformers=[
        ('units_for_sale_imputer', SimpleImputer(strategy='constant', fill_value=0), ['units_for_sale']),
        ('average_rating_imputer', SimpleImputer(strategy='mean'), ['average_rating']),
        # handle_unknown='ignore': a year that appears only in validation/test data is encoded
        # as an all-zero row instead of raising at transform time (the default is 'error').
        ('year_encoder', OneHotEncoder(dtype=np.uint8, handle_unknown='ignore'), ['year']),
    ],
    remainder='passthrough',
)

In [12]:
# Column-wise preprocessing for the reduced "record store" feature set: only `year` is
# transformed (one-hot encoded as uint8); every other column is passed through unchanged.
record_store_transformer = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a year that appears only in validation/test data is encoded
        # as an all-zero row instead of raising at transform time (the default is 'error').
        ('year_encoder', OneHotEncoder(dtype=np.uint8, handle_unknown='ignore'), ['year']),
    ],
    remainder='passthrough',
)

In [13]:
# Names of the columns handed to ColumnRemover in the record-store pipeline.
record_store_ridge_removal_columns = [
    'market_price',
    'units_for_sale',
    'have',
    'want',
    'average_rating',
    'rating_count',
    'last_sold',
    'lowest',
    'median',
    'highest',
    'track_titles',
    'country',
    'genre',
    'style',
    'label',
    'community_have',
    'community_want',
    'formats',
    'master_id',
    'thumb_url',
    'release_url',
    'artist',
    'title',
    'format_description',
    'format_text_clean',
    'format_text',
    'no_of_days_since_last_sale',
]

## TabNet

In [14]:
from tabnet import TabNetRegressor
from tensorflow import feature_column

In [15]:
# Preprocessing pipeline that produces the numeric matrix used by the models below.
# Steps run in the order listed:
#   1. project RunningTimeImputer, configured with 'running_time' and 'number_of_tracks'
#   2. leave-one-out target encoding of artist_clean / label_clean
#   3. project ColumnRemover, given the record-store removal list
#   4. one-hot encoding of `year` (other columns passed through)
#   5. standard scaling of every resulting column
tabnet_pipe = Pipeline(
    steps=[
        (
            'running_time_imputer',
            RunningTimeImputer('running_time', 'number_of_tracks'),
        ),
        (
            'leave_one_out_encoding',
            LeaveOneOutEncoder(cols=['artist_clean', 'label_clean']),
        ),
        (
            'record_store_column_remover',
            ColumnRemover(record_store_ridge_removal_columns),
        ),
        ('preprocessing', record_store_transformer),
        ('scaler', StandardScaler()),
    ]
)

In [16]:
# Fit the preprocessing pipeline on the training rows and transform them in one pass.
# The log of the target is supplied as y so that steps which use the target during fitting
# (the leave-one-out encoder) see the same scale the models are trained on.
tabnet_input = tabnet_pipe.fit_transform(
    X_tr,
    y=np.log(y_tr),
)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


# NOTE(review): `tab_net_input` is not defined anywhere in the visible notebook (the pipeline
# output is named `tabnet_input`), so this call raises NameError as written. The result is
# also discarded and 1799 is an unexplained literal -- this looks like a leftover experiment;
# confirm and remove.
TabNetRegressor(tab_net_input,1799)

# One numeric feature column per column of the preprocessed matrix, keyed by the column's
# position rendered as a string ("0", "1", ...).
tabnet_feature_columns = tuple(
    feature_column.numeric_column(str(column_position))
    for column_position in range(tabnet_input.shape[1])
)

# Build the TabNet regression model.
# num_regressors is the number of regression outputs, not the number of input features:
# the target passed to fit() is a single 1-D series (log market value), so the head needs
# exactly one output. Using tabnet_input.shape[1] here would give the model one output per
# input feature, which cannot match the target.
tabnet_regressor = TabNetRegressor(
    feature_columns=tabnet_feature_columns,
    num_regressors=1,
)

# Compile with an explicit regression loss. Without `loss`, the targets passed to fit()
# contribute nothing to the objective, so the model would not be trained to predict them.
# Mean squared error on the log-transformed target is used here.
tabnet_regressor.compile(optimizer='adam', loss='mse')

# Train TabNet on the preprocessed training matrix against the log-transformed target,
# passed as a plain NumPy array.
tabnet_regressor.fit(
    x=tabnet_input,
    y=np.log(y_tr).values,
)

# Prints the interactive help text for the regressor object. This is exploratory output
# only and nothing downstream uses it -- consider removing before sharing the notebook.
help(tabnet_regressor)

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
# Random-forest baseline trained on the same preprocessed matrix as TabNet.
#   n_estimators=50 : number of trees
#   verbose=50      : per-tree progress logging
#   n_jobs=-1       : use all available cores
#   random_state=0  : fixes bootstrap sampling and feature selection so repeated runs build
#                     the same forest (matches the seed used for the data splits)
rf = RandomForestRegressor(
    n_estimators=50,
    verbose=50,
    n_jobs=-1,
    random_state=0,
)

In [21]:
# Fit the forest on the preprocessed training matrix against the log-transformed target.
rf.fit(
    X=tabnet_input,
    y=np.log(y_tr).values,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
building tree 1 of 50building tree 2 of 50building tree 3 of 50building tree 4 of 50building tree 5 of 50building tree 6 of 50building tree 7 of 50building tree 8 of 50









KeyboardInterrupt: 