## 03 - Modelling

## Optional - Colab Setup

In [1]:
def upgrade_runtime_ram():
    meminfo = subprocess.getoutput('cat /proc/meminfo').split('\n')

    memory_info = {entry.split(':')[0]: int(entry.split(':')[1].replace(' kB','').strip()) for entry in meminfo}

    if memory_info['MemTotal'] > 17000000:
        return

    a = []
    while(1):
        a.append('1')

In [2]:
def restart_runtime():
    os.kill(os.getpid(), 9)

In [3]:
def setup_rapids():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    device_name = pynvml.nvmlDeviceGetName(handle)
    if (device_name != b'Tesla T4') and (device_name != b'Tesla P4') and (device_name != b'Tesla P100-PCIE-16GB'):
        print("Wrong GPU - Restarting Runtime")
        restart_runtime()


    # clone RAPIDS AI rapidsai-csp-utils scripts repo
    !git clone https://github.com/rapidsai/rapidsai-csp-utils.git

    # install RAPIDS
    !bash rapidsai-csp-utils/colab/rapids-colab.sh 0.13


    # set necessary environment variables 
    dist_package_index = sys.path.index('/usr/local/lib/python3.6/dist-packages')
    sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.6/site-packages'] + sys.path[dist_package_index:]
    sys.path

    # update pyarrow & modules 
    exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

In [4]:
def setup_conda():
    if not 'Miniconda3-4.5.4-Linux-x86_64.sh' in os.listdir():
        !wget https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh && bash Miniconda3-4.5.4-Linux-x86_64.sh -bfp /usr/local

    if not ('EPFL-Capstone-Project' in os.listdir()) and (os.getcwd().split('/')[-1] != 'EPFL-Capstone-Project'):
        !git clone https://github.com/helmigsimon/EPFL-Capstone-Project  
    if 'EPFL-Capstone-Project' in os.listdir():
        os.chdir('EPFL-Capstone-Project')

    !conda env create -f environment.yml
    !conda activate exts-ml

In [5]:
def setup_drive():
    #Mounting Google Drive
    global drive
    from google.colab import drive
    drive.mount('/content/drive')

In [6]:
try:
    import sys,os,subprocess
    
    upgrade_runtime_ram()
    setup_drive()

    #Setting up PyPi Packages
    !pip install geopandas sparse-dot-topn pdpipe category-encoders catboost
    import geopandas as gpd
    import sparse_dot_topn.sparse_dot_topn as ct
    import pdpipe as pdp
    import category_encoders

    #Setting up Conda Packages
    setup_conda()
    
    #Initializing NLTK
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')
    
    #Setting up RAPIDS AI
    import pynvml
    setup_rapids()
    
    from cuml import UMAP
    
except ModuleNotFoundError as e:
    print(e)
    print('Not in colab environment, continuing to run locally')
    from umap import UMAP

ValueError: invalid literal for int() with base 10: '/proc/meminfo'

## Imports

In [7]:
%matplotlib inline
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split,  StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
tqdm.pandas()

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import KFold
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

In [9]:
from lib.transformers import *
from lib.pipelines import *
from lib.processing import save_to_pkl, load_from_pkl
from lib.visualization import GridSearchPlotter
from data.util.paths import DATA_PATH

In [10]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

In [11]:
try:
    from cuml import LinearRegression, RandomForestRegressor, Ridge
except ModuleNotFoundError:
    from sklearn.linear_model import SGDRegressor

In [12]:
from catboost import CatBoostRegressor

## Loading Data

In [13]:
metadata_df, image_embeddings_df = tuple(load_from_pkl(i,DATA_PATH) for i in ('metadata','image_embeddings'))
df = metadata_df.merge(image_embeddings_df,how='inner',on='release_id')
df = OutlierRemover(features=['number_of_tracks'] + [i for i in df.columns if 'umap' in i]).fit_transform(df[(df['running_time'] < 180) | (df['running_time'].isna())])

In [14]:
del metadata_df, image_embeddings_df

In [15]:
col_set = {
    'format': {
        'description': 'format_description_', 
        'name': 'format_name_', 
        'text': ('format_text_clean'),
        'quantity': ('format_quantity')
    },
    'geography': {
        'superregion': 'superregion_',
        'region': 'region_',
        'country': 'country_'
    },
    'timeperiod': {
        'period': 'period_',
        'era': 'era_'
    },
    'genre': 'genre_',
    'style': 'style_',
    'null': None,
    'indicator': lambda x: x.max() == 1 and x.min() == 0,
    'image_embedding': 'image_umap_'
}
column_store = ColumnStore()
column_store.fit(df,col_set)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




## Preparing for Machine Learning

In [16]:
X_tr, X_te, y_tr, y_te = train_test_split(df.drop('market_value',axis=1),df.market_value)

In [17]:
df_sample = pd.concat([X_tr,y_tr],axis=1).sample(frac=0.1)
X_tr_sample = df_sample.drop('market_value',axis=1)
y_tr_sample = df_sample['market_value']

In [18]:
del df

## Setting up Dummy Regression for baseline

In [None]:
dummy_regressor = DummyRegressor(strategy='mean')

In [None]:
dummy_regressor.fit(X_tr,np.log(y_tr).values)

In [None]:
dummy_pred = dummy_regressor.predict(X_te)

In [None]:
r2_score(y_te,np.exp(dummy_pred))

In [None]:
mean_absolute_error(y_te,np.exp(dummy_pred))

## Linear Regression - Record Store

In [31]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
record_store_lr_transformer = ColumnTransformer(transformers=[
    ('year_encoder', OneHotEncoder(dtype=np.uint8,handle_unknown='ignore'), ['year'])
], remainder='passthrough')

In [33]:
record_store_lr_removal_columns = set([
    'market_price','units_for_sale','have','want','average_rating','rating_count','last_sold','lowest','median',
    'highest','track_titles','country','genre','style','community_have','community_want','formats','thumb_url',
    'release_url','format_description','days_since_last_sale','title', 'release_id'
])

In [None]:
record_store_lr_processing = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist','label','format_text','master_id','format_name'])),
    ('record_store_column_remover', ColumnRemover(record_store_lr_removal_columns)),
    ('preprocessing',record_store_lr_transformer),
    ('scaler', RobustScaler()),                             
])

record_store_lr_pipe = Pipeline([
    ('processing', record_store_lr_processing),
    ('lr', GridSearchCV(
        LinearRegression(),
        param_grid={
            'normalize': [True, False]
        },
        cv=KFold(n_splits=5,shuffle=True),
        scoring='neg_mean_absolute_error',
        verbose=5,
        n_jobs=-1,
    ))
])

In [None]:
record_store_lr_pipe.steps[-1][-1].best_score_

In [None]:
record_store_lr_pred = record_store_lr_pipe.predict(X_te)

In [None]:
r2_score(y_te,np.exp(record_store_lr_pred))

In [None]:
mean_absolute_error(y_te,np.exp(record_store_lr_pred))

## Linear Regression - Full Information

In [None]:
full_information_lr_transformer = ColumnTransformer(transformers=[
    ('units_for_sale_imputer', SimpleImputer(strategy='constant',fill_value=0),['units_for_sale']),
    ('average_rating_imputer', SimpleImputer(strategy='mean'),['average_rating']),
    ('year_encoder', OneHotEncoder(dtype=np.uint8,handle_unknown='ignore'), ['year'])
], remainder='passthrough')

In [34]:
full_information_columns = set([
    'units_for_sale','community_have','community_want','average_rating','rating_count'
])
full_information_lr_removal_columns = record_store_lr_removal_columns - full_information_columns

In [None]:
full_information_lr_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist','label','format_text','master_id','format_name'])),
    ('record_store_column_remover', ColumnRemover(full_information_lr_removal_columns)),
    ('preprocessing',full_information_lr_transformer),
    ('scaler', RobustScaler()),
    ('lr', GridSearchCV(
        LinearRegression(),
        param_grid={
            'normalize':[False,True]
        },
        cv=KFold(n_splits=5,shuffle=True),
        scoring='neg_mean_absolute_error',
        verbose=5,
        n_jobs=-1,
    ))
])

In [None]:
full_information_lr_pipe.fit(X_tr,np.log(y_tr).values)

In [None]:
full_information_lr_pipe['lr'].best_score_

In [None]:
full_information_lr_pred = record_store_lr_pipe.predict(X_te)

In [None]:
r2_score(y_te,np.exp(full_information_lr_pred))

In [None]:
mean_absolute_error(y_te,np.exp(full_information_lr_pred))

## Lasso Regression - Record Store

## Lasso Regression - Full Information

## Ridge Regression - Record Store

In [None]:
record_store_ridge_removal_columns, record_store_ridge_transformer, full_information_ridge_removal_columns, full_information_ridge_transformer = (
    record_store_lr_removal_columns,
    record_store_lr_transformer,
    full_information_lr_removal_columns,
    full_information_lr_transformer
)

In [None]:
record_store_ridge_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist','label','format_text','format_name','master_id'])),
    ('record_store_column_remover', ColumnRemover(record_store_ridge_removal_columns)),
    ('preprocessing',record_store_ridge_transformer),
    ('scaler', RobustScaler()),
    ('ridge', GridSearchCV(
        Ridge(normalize=False,solver='eig'),
        param_grid={
            'alpha': np.linspace(1e-3,1e3,10),
        },
        cv=KFold(n_splits=5,shuffle=True),
        scoring='neg_mean_absolute_error',
        verbose=5,
        n_jobs=-1,
        return_train_score=True
    ))
])

In [None]:
record_store_ridge_pipe.fit(X_tr,np.log(y_tr).values)

In [None]:
plotter = GridSearchPlotter(record_store_ridge_pipe['ridge'])

In [None]:
plotter.plot_validation_curve(parameter='alpha',ylim=(-0.75,-0.25))

In [None]:
record_store_ridge_pipe['ridge'].best_score_

In [None]:
record_store_ridge_pipe['ridge'].best_params_

In [None]:
record_store_ridge_pred = record_store_ridge_pipe.predict(X_te)

In [None]:
r2_score(y_te,np.exp(record_store_ridge_pred))

In [None]:
mean_absolute_error(y_te,np.exp(record_store_ridge_pred))

## Ridge - Full Information

In [None]:
full_information_ridge_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist','label','format_text','format_name','master_id'])),
    ('record_store_column_remover', ColumnRemover(full_information_ridge_removal_columns)),
    ('preprocessing',full_information_ridge_transformer),
    ('scaler', StandardScaler()),
    ('ridge', GridSearchCV(
        Ridge(normalize=False,solver='eig'),
        param_grid={
            'alpha': np.logspace(-3,3,10),
        },
        cv=KFold(n_splits=5,shuffle=True),
        scoring='neg_mean_absolute_error',
        verbose=5,
        n_jobs=-1,
        return_train_score=True
    ))
])

In [None]:
full_information_ridge_pipe.fit(X_tr,np.log(y_tr).values)

In [None]:
plotter = GridSearchPlotter(full_information_ridge_pipe['ridge'])

In [None]:
plotter.plot_validation_curve(parameter='alpha',ylim=(-0.75,-0.25))

In [None]:
full_information_ridge_pipe['ridge'].best_score_

In [None]:
full_information_ridge_pipe['ridge'].best_params_

In [None]:
full_information_ridge_pred = full_information_ridge_pipe.predict(X_te)

In [None]:
r2_score(y_te,np.exp(full_information_ridge_pred))

In [None]:
mean_absolute_error(y_te,np.exp(full_information_ridge_pred))

## Random Forest - Record Store

In [None]:
record_store_random_forest_removal_columns = [
    'market_price','units_for_sale','have','want','average_rating','rating_count','last_sold','lowest','median',
    'highest','track_titles','country','genre','style','community_have','community_want','formats','thumb_url',
    'release_url','format_description','days_since_last_sale','title','release_id'
]
record_store_random_forest_removal_columns.extend(
    column_store._indicator
)
record_store_random_forest_removal_columns = set(record_store_random_forest_removal_columns)

In [None]:
record_store_random_forest_preprocessing_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist','label','format_text','format_name','master_id','year'])),
    ('reduce_indicators', IndicatorReducer(indicators=column_store._indicator,algorithm=PCA,components=40)),
    ('record_store_column_remover', ColumnRemover(record_store_random_forest_removal_columns)),
    ('cast_to_32', FunctionTransformer(func=lambda x: x.values.astype(np.float32)))
])

random_forest = RandomForestRegressor()

record_store_random_forest_search_pipe = Pipeline([
    ('preprocessing',record_store_random_forest_preprocessing_pipe),
    ('grid_search', GridSearchCV(
        random_forest,
        cv=KFold(n_splits=5,shuffle=True),
        param_grid={
            'n_estimators': np.linspace(10,200,5).astype(int)
        },
        verbose=50,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        return_train_score=True
    ))
])

record_store_random_forest_pipe = Pipeline([
    ('preprocessing',record_store_random_forest_preprocessing_pipe),
    ('random_forest',random_forest)
])

In [None]:
record_store_random_forest_search_pipe.fit(X_tr_sample,np.log(y_tr_sample).values.astype(np.float32))

In [None]:
record_store_random_forest_plotter = GridSearchPlotter(record_store_random_forest_search_pipe.steps[-1][1])
record_store_random_forest_plotter.plot_validation_curve(parameter='n_estimators',ylim=(-1,0))

In [None]:
record_store_random_forest_search_pipe.steps[-1][1].best_score_

In [None]:
record_store_random_forest_search_pipe.steps[-1][1].best_params_

In [None]:
record_store_random_forest_pipe.fit(X_tr,np.log(y_tr).values.astype(np.float32))

In [None]:
record_store_random_forest_te_pred = record_store_random_forest_pipe.predict(X_te)

In [None]:
pd.Series(record_store_random_forest_te_pred).describe()

In [None]:
r2_score(y_te,np.exp(record_store_random_forest_te_pred))

In [None]:
mean_absolute_error(y_te,np.exp(record_store_random_forest_te_pred))

## Random Forest - Full Information

In [None]:
full_information_random_forest_removal_columns = record_store_random_forest_removal_columns - full_information_columns

In [None]:
full_information_random_forest_transformer = ColumnTransformer(transformers=[
    ('units_for_sale_imputer', SimpleImputer(strategy='constant',fill_value=0),['units_for_sale']),
    ('average_rating_imputer', SimpleImputer(strategy='mean'),['average_rating']),
], remainder='passthrough')

In [None]:
full_information_random_forest_preprocessing_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist','label','format_text','format_name','master_id','year'])),
    ('reduce_indicators', IndicatorReducer(indicators=column_store._indicator,algorithm=PCA,components=40)),
    ('record_store_column_remover', ColumnRemover(full_information_random_forest_removal_columns)),
    ('cast_to_32', FunctionTransformer(func=lambda x: x.values.astype(np.float32)))
])

random_forest = RandomForestRegressor()

full_information_random_forest_search_pipe = Pipeline([
    ('preprocessing',full_information_random_forest_preprocessing_pipe),
    ('grid_search', GridSearchCV(
        random_forest,
        cv=KFold(n_splits=5,shuffle=True),
        param_grid={
            'n_estimators': np.linspace(10,200,5).astype(int)
        },
        verbose=50,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        return_train_score=True
    ))
])

full_information_random_forest_pipe = Pipeline([
    ('preprocessing',random_forest_preprocessing_pipe),
    ('random_forest',random_forest)
])

In [None]:
full_information_random_forest_search_pipe.fit(X_tr_sample,np.log(y_tr_sample))

In [None]:
full_information_random_forest_plotter = GridSearchPlotter(random_forest_search_pipe.steps[-1][1])
full_information_random_forest_plotter.plot_validation_curve(parameter='n_estimators',ylim=(-1,0))

In [None]:
full_information_random_forest_search_pipe.steps[-1][1].best_score_

In [None]:
full_information_random_forest_search_pipe.steps[-1][1].best_params_

In [None]:
full_information_random_forest_pipe.fit(X_tr,np.log(y_tr).values.astype(np.float32))

In [None]:
full_information_random_forest_te_pred = random_forest_pipe.predict(X_te)

In [None]:
full_information_r2_score(y_te,np.exp(random_forest_te_pred))

In [None]:
full_information_mean_absolute_error(y_te,np.exp(random_forest_te_pred))

## Catboost - Record Store

In [19]:
record_store_catboost_removal_columns = set([
    'market_price','units_for_sale','have','want','average_rating','rating_count','last_sold','lowest','median',
    'highest','track_titles','genre','style','community_have','community_want','formats','thumb_url',
    'release_url','format_description','days_since_last_sale','title','release_id'
])

In [23]:
record_store_catboost_processing_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('record_store_column_remover', ColumnRemover(record_store_catboost_removal_columns)),
])

catboost = CatBoostRegressor(
    random_seed=0,
    cat_features=['year','format_text','master_id','format_name','artist','label','country'],
    boosting_type='Ordered',
    n_estimators=100,
    learning_rate=0.16681005,
    l2_leaf_reg=3,
    border_count=170,
    max_depth=10,
)

record_store_catboost_grid_pipe = Pipeline([
    ('processing', record_store_catboost_processing_pipe),
    ('grid_search', GridSearchCV(
        catboost,
        param_grid={
            'learning_rate': np.linspace(1e-3,1e1,5),
            'l2_leaf_reg': [1,3,5,8],
            'border_count': np.linspace(1,200,5)
        },
        n_jobs=-1,
        cv=KFold(n_splits=5,shuffle=True),
        verbose=1,
        scoring='neg_mean_absolute_error',
        return_train_score=True)
    )
])

record_store_catboost_pipe = Pipeline([
    ('processing', record_store_catboost_processing_pipe),
    ('catboost',catboost)
])

In [25]:
record_store_catboost_grid_pipe.fit(X_tr_sample,np.log(y_tr_sample))

0:	learn: 0.8537733	total: 302ms	remaining: 29.9s
1:	learn: 0.8317071	total: 454ms	remaining: 22.2s
2:	learn: 0.8132580	total: 774ms	remaining: 25s
3:	learn: 0.7986971	total: 1.08s	remaining: 26s
4:	learn: 0.7870391	total: 1.4s	remaining: 26.6s
5:	learn: 0.7758067	total: 1.73s	remaining: 27.1s
6:	learn: 0.7673244	total: 2.11s	remaining: 28s
7:	learn: 0.7603116	total: 2.43s	remaining: 27.9s
8:	learn: 0.7556745	total: 2.79s	remaining: 28.2s
9:	learn: 0.7508394	total: 3.1s	remaining: 27.9s
10:	learn: 0.7459590	total: 3.4s	remaining: 27.5s
11:	learn: 0.7411240	total: 3.71s	remaining: 27.2s
12:	learn: 0.7374160	total: 4.01s	remaining: 26.9s
13:	learn: 0.7336068	total: 4.32s	remaining: 26.5s
14:	learn: 0.7311829	total: 4.63s	remaining: 26.3s
15:	learn: 0.7286557	total: 4.97s	remaining: 26.1s
16:	learn: 0.7268638	total: 5.3s	remaining: 25.9s
17:	learn: 0.7248974	total: 5.35s	remaining: 24.4s
18:	learn: 0.7232668	total: 5.67s	remaining: 24.2s
19:	learn: 0.7222394	total: 6.02s	remaining: 24.1s


KeyboardInterrupt: 

In [None]:
record_store_catboost_plot = GridSearchPlotter(catboost_grid_pipe.steps[-1][1])
record_store_catboost_plot.plot_validation_curve(parameter='max_depth',ylim=(-1,0))

In [None]:
record_store_catboost_grid_pipe.steps[-1][1].best_score_

In [None]:
record_store_catboost_grid_pipe.steps[-1][1].best_params_

In [26]:
record_store_catboost_pipe.fit(X_tr,np.log(y_tr))

0:	learn: 0.8539268	total: 659ms	remaining: 1m 5s
1:	learn: 0.8268796	total: 1.31s	remaining: 1m 4s
2:	learn: 0.8011497	total: 1.97s	remaining: 1m 3s
3:	learn: 0.7816639	total: 2.61s	remaining: 1m 2s
4:	learn: 0.7667739	total: 3.25s	remaining: 1m 1s
5:	learn: 0.7552288	total: 3.87s	remaining: 1m
6:	learn: 0.7414968	total: 4.49s	remaining: 59.7s
7:	learn: 0.7302300	total: 5.2s	remaining: 59.8s
8:	learn: 0.7226581	total: 5.87s	remaining: 59.3s
9:	learn: 0.7164417	total: 6.51s	remaining: 58.6s
10:	learn: 0.7118821	total: 7.13s	remaining: 57.7s
11:	learn: 0.7075541	total: 7.75s	remaining: 56.8s
12:	learn: 0.7048721	total: 8.4s	remaining: 56.2s
13:	learn: 0.7024413	total: 9.03s	remaining: 55.5s
14:	learn: 0.7006789	total: 9.76s	remaining: 55.3s
15:	learn: 0.6988637	total: 10.4s	remaining: 54.8s
16:	learn: 0.6965893	total: 11.1s	remaining: 54.3s
17:	learn: 0.6940425	total: 11.8s	remaining: 53.6s
18:	learn: 0.6928498	total: 12.5s	remaining: 53.2s
19:	learn: 0.6914388	total: 13.1s	remaining: 5

Pipeline(memory=None,
     steps=[('processing', Pipeline(memory=None,
     steps=[('running_time_imputer', RunningTimeImputer(number_of_tracks='number_of_tracks',
          running_time='running_time')), ('record_store_column_remover', ColumnRemover(cols_to_remove={'lowest', 'units_for_sale', 'thumb_url', 'last_sold', 'track..._want', 'rating_count'}))])), ('catboost', <catboost.core.CatBoostRegressor object at 0x164b4deb8>)])

In [27]:
record_store_catboost_te_pred = record_store_catboost_pipe.predict(X_te)

In [28]:
r2_score(y_te,np.exp(record_store_catboost_te_pred))

0.2374316391159783

In [29]:
mean_absolute_error(y_te,np.exp(record_store_catboost_te_pred))

7.990427376956115

## CatBoost - Full Information

In [35]:
full_information_catboost_removal_columns = record_store_catboost_removal_columns - full_information_columns

In [36]:
full_information_catboost_processing_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('record_store_column_remover', ColumnRemover(record_store_catboost_removal_columns))
])

full_information_catboost_grid_pipe = Pipeline([
    ('processing', full_information_catboost_processing_pipe),
    ('grid_search', GridSearchCV(
        catboost,
        param_grid={'max_depth': np.linspace(10,32,5).astype(int)},
        n_jobs=-1,
        cv=KFold(n_splits=5,shuffle=True),
        scoring='neg_mean_absolute_error',
        verbose=50,
        return_train_score=True)
    )
])

In [None]:
full_information_catboost_grid_pipe.fit(X_tr_sample,np.log(y_tr_sample))

In [None]:
full_information_catboost_plot = GridSearchPlotter(catboost_grid_pipe.steps[-1][1])
full_information_catboost_plot.plot_validation_curve(parameter='max_depth',ylim=(-1,0))

In [None]:
full_information_catboost_grid_pipe.steps[-1][1].best_score_

In [None]:
full_information_catboost_grid_pipe.steps[-1][1].best_params_

In [37]:
full_information_catboost_pipe.fit(X_tr,np.log(y_tr))

0:	learn: 0.8539268	total: 700ms	remaining: 1m 9s
1:	learn: 0.8268796	total: 1.33s	remaining: 1m 5s
2:	learn: 0.8011497	total: 1.98s	remaining: 1m 4s
3:	learn: 0.7816639	total: 2.62s	remaining: 1m 2s
4:	learn: 0.7667739	total: 3.27s	remaining: 1m 2s
5:	learn: 0.7552288	total: 3.99s	remaining: 1m 2s
6:	learn: 0.7414968	total: 4.64s	remaining: 1m 1s
7:	learn: 0.7302300	total: 5.29s	remaining: 1m
8:	learn: 0.7226581	total: 5.98s	remaining: 1m
9:	learn: 0.7164417	total: 6.63s	remaining: 59.7s
10:	learn: 0.7118821	total: 7.26s	remaining: 58.7s
11:	learn: 0.7075541	total: 7.9s	remaining: 57.9s
12:	learn: 0.7048721	total: 8.54s	remaining: 57.2s
13:	learn: 0.7024413	total: 9.17s	remaining: 56.3s
14:	learn: 0.7006789	total: 9.79s	remaining: 55.4s
15:	learn: 0.6988637	total: 10.4s	remaining: 54.8s
16:	learn: 0.6965893	total: 11.1s	remaining: 54.3s
17:	learn: 0.6940425	total: 11.8s	remaining: 53.6s
18:	learn: 0.6928498	total: 12.4s	remaining: 52.8s
19:	learn: 0.6914388	total: 13s	remaining: 52.1s

Pipeline(memory=None,
     steps=[('processing', Pipeline(memory=None,
     steps=[('running_time_imputer', RunningTimeImputer(number_of_tracks='number_of_tracks',
          running_time='running_time')), ('record_store_column_remover', ColumnRemover(cols_to_remove={'lowest', 'units_for_sale', 'thumb_url', 'last_sold', 'track..._want', 'rating_count'}))])), ('catboost', <catboost.core.CatBoostRegressor object at 0x164b4deb8>)])

In [39]:
full_information_catboost_te_pred = full_information_catboost_pipe.predict(X_te)

In [40]:
r2_score(y_te,np.exp(full_information_catboost_te_pred))

0.2374316391159783

In [41]:
mean_absolute_error(y_te,np.exp(full_information_catboost_te_pred))

7.990427376956115

## Identification of Feature Importances

In [63]:
feature_importance_transformation_pipe = full_information_catboost_processing_pipe.fit(X_tr,np.log(y_tr).values)
full_information_catboost_feature_importances = dict(zip(
    feature_importance_transformation_pipe.transform(X_te).columns,
    full_information_catboost_pipe.steps[-1][-1].feature_importances_
))

In [71]:
pd.Series(full_information_catboost_feature_importances).sort_values(ascending=False).head(20)

country                               19.117469
format_name                           18.763870
label                                 11.618611
year                                   9.672260
artist                                 9.208925
number_of_tracks                       3.850768
master_id                              2.847195
style_Easy Listening                   2.067571
superregion_asia                       1.962749
format_quantity                        1.854221
format_description_Limited Edition     1.435032
format_description_Reissue             1.361851
era_modern                             1.341501
format_description_Promo               1.171903
format_text                            1.060279
style_Free Jazz                        1.053303
format_description_Test Pressing       0.990353
style_Hard Bop                         0.950603
region_north america                   0.895790
superregion_europe                     0.734825
dtype: float64