## 03 - Modelling

## Optional - Colab Setup

In [None]:
def upgrade_runtime_ram():
    meminfo = subprocess.getoutput('cat /proc/meminfo').split('\n')

    memory_info = {entry.split(':')[0]: int(entry.split(':')[1].replace(' kB','').strip()) for entry in meminfo}

    if memory_info['MemTotal'] > 17000000:
        return

    a = []
    while(1):
        a.append('1')

In [None]:
def restart_runtime():
    os.kill(os.getpid(), 9)

In [None]:
def setup_rapids():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    device_name = pynvml.nvmlDeviceGetName(handle)
    if (device_name != b'Tesla T4') and (device_name != b'Tesla P4') and (device_name != b'Tesla P100-PCIE-16GB'):
        print("Wrong GPU - Restarting Runtime")
        restart_runtime()


    # clone RAPIDS AI rapidsai-csp-utils scripts repo
    !git clone https://github.com/rapidsai/rapidsai-csp-utils.git

    # install RAPIDS
    !bash rapidsai-csp-utils/colab/rapids-colab.sh 0.13


    # set necessary environment variables 
    dist_package_index = sys.path.index('/usr/local/lib/python3.6/dist-packages')
    sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.6/site-packages'] + sys.path[dist_package_index:]
    sys.path

    # update pyarrow & modules 
    exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

In [None]:
def setup_conda():
    if not 'Miniconda3-4.5.4-Linux-x86_64.sh' in os.listdir():
        !wget https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh && bash Miniconda3-4.5.4-Linux-x86_64.sh -bfp /usr/local

    if not ('EPFL-Capstone-Project' in os.listdir()) and (os.getcwd().split('/')[-1] != 'EPFL-Capstone-Project'):
        !git clone https://github.com/helmigsimon/EPFL-Capstone-Project  
    if 'EPFL-Capstone-Project' in os.listdir():
        os.chdir('EPFL-Capstone-Project')

    !conda env create -f environment.yml
    !conda activate exts-ml

In [None]:
def setup_drive():
    #Mounting Google Drive
    global drive
    from google.colab import drive
    drive.mount('/content/drive')

In [6]:
try:
    import sys,os,subprocess
    
    upgrade_runtime_ram()
    setup_drive()

    #Setting up PyPi Packages
    !pip install geopandas sparse-dot-topn pdpipe category-encoders
    import geopandas as gpd
    import sparse_dot_topn.sparse_dot_topn as ct
    import pdpipe as pdp
    import category_encoders

    #Setting up Conda Packages
    setup_conda()
    
    #Initializing NLTK
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')
    
    #Setting up RAPIDS AI
    import pynvml
    setup_rapids()
    
    from cuml import UMAP
    
except ModuleNotFoundError as e:
    print(e)
    print('Not in colab environment, continuing to run locally')
    from umap import UMAP

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
Collecting geopandas
[?25l  Downloading https://files.pythonhosted.org/packages/83/c5/3cf9cdc39a6f2552922f79915f36b45a95b71fd343cfc51170a5b6ddb6e8/geopandas-0.7.0-py2.py3-none-any.whl (928kB)
[K     |████████████████████████████████| 931kB 3.4MB/s 
[?25hCollecting sparse-dot-topn
[?25l  Downloading https://files.pythonhosted.org/packages/70/d5/2a3a52acd89344f0c45cae320bd41ee49573caec656834b98c5ea48669b7/sparse_dot_topn-0.2.9.tar.gz (106kB)
[K     |███████

## Imports

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split,  StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
tqdm.pandas()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

In [None]:
from lib.transformers import *
from lib.pipelines import *
from lib.processing import save_to_pkl, load_from_pkl
from data.util.paths import DATA_PATH

In [None]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

## Setting up DataFrame

In [None]:
api_df = load_from_pkl('api',DATA_PATH)
extracted_df = load_from_pkl('extracted',DATA_PATH)

In [12]:
api_df = api_pipe.fit_transform(api_df)
extracted_df = extracted_pipe.fit_transform(extracted_df)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [None]:
#Remove outliers from extracted_df
extracted_df = OutlierRemover('market_value').fit_transform(extracted_df)

## Reducing Dimensionality of Image Embeddings

In [None]:
with np.load(os.path.join(DATA_PATH,'high_level_features_labelled.npz')) as data:
    image_embedding_df = pd.concat([pd.DataFrame(data[section]) for section in ('release_id','bitmap','features')],axis=1)
    image_embedding_df.columns = ['release_id', 'bitmap'] + ['feature_%s' % i for i in range(1,1281)]

In [None]:
from cuml import UMAP

In [None]:
scaler = StandardScaler()
umap = UMAP(n_components=10)
image_embeddings_scaled = scaler.fit_transform(image_embedding_df.loc[:,['feature_%s' % i for i in range(1,1281)]])


In [17]:
image_embeddings_reduced = umap.fit_transform(image_embeddings_scaled)



In [None]:
image_embeddings_reduced = pd.concat([
      image_embedding_df.loc[:,'release_id'],
      pd.DataFrame(
          image_embeddings_reduced,
          columns = ['images_umap_%s' % i for i in range(image_embeddings_reduced.shape[1])]
      )],
      axis=1
)


Combining datasets

In [None]:
df = api_df.merge(extracted_df,how='inner',on='release_id')
df = df.merge(image_embeddings_reduced,how='inner',on='release_id')

In [None]:
del api_df, extracted_df, image_embedding_df, image_embeddings_scaled, image_embeddings_reduced

In [21]:
col_set = {
    'format': {
        'description': 'format_description_', 
        'name': 'format_name_', 
        'text': ('format_text_clean'),
        'quantity': ('format_quantity')
    },
    'geography': {
        'superregion': 'superregion_',
        'region': 'region_',
        'country': 'country_'
    },
    'timeperiod': {
        'period': 'period_',
        'era': 'era_'
    },
    'genre': 'genre_',
    'style': 'style_',
    'null': None,
    'indicator': lambda x: x.max() == 1 and x.min() == 0,
    'image_embedding': 'image_umap_'
}
column_store = ColumnStore()
column_store.fit(df,col_set)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




## Preparing for Machine Learning

In [None]:
X_tr, X_te, y_tr, y_te = train_test_split(df.drop('market_value',axis=1),df.market_value)

In [None]:
del df

## Setting up Dummy Regression for baseline

In [None]:
from sklearn.dummy import DummyRegressor

In [None]:
dummy_val = GridSearchCV(
    DummyRegressor(),
    cv=5,
    param_grid = {
        'strategy': ['mean']
    },
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=5
)

In [26]:
dummy_val.fit(X_tr,np.log(y_tr).values)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   16.5s remaining:   24.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   23.3s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DummyRegressor(constant=None, quantile=None,
                                      strategy='mean'),
             iid='deprecated', n_jobs=-1, param_grid={'strategy': ['mean']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_absolute_error', verbose=5)

In [27]:
dummy_val.best_score_

-0.6974924045404828

In [None]:
y_mean_pred = np.full(shape=y_te.shape,fill_value=np.mean(y_te))

In [29]:
r2_score(y_te,y_mean_pred)

1.1102230246251565e-16

In [30]:
mean_absolute_error(y_te,y_mean_pred)

11.95805933297779

Ridge Regression

In [None]:
full_information_transformer = ColumnTransformer(transformers=[
    ('units_for_sale_imputer', SimpleImputer(strategy='constant',fill_value=0),['units_for_sale']),
    ('average_rating_imputer', SimpleImputer(strategy='mean'),['average_rating']),
    ('year_encoder', OneHotEncoder(dtype=np.uint8), ['year'])
], remainder='passthrough')

In [None]:
record_store_lr_transformer = ColumnTransformer(transformers=[
    ('year_encoder', OneHotEncoder(dtype=np.uint8,handle_unknown='ignore'), ['year'])
], remainder='passthrough')

In [None]:
record_store_lr_removal_columns = [
    'market_price','units_for_sale','have','want','average_rating','rating_count','last_sold','lowest','median',
    'highest','track_titles','country','genre','style','community_have','community_want','formats','thumb_url',
    'release_url','format_description','days_since_last_sale','title',
    ]

In [None]:
from cuml import LinearRegression
from sklearn.model_selection import KFold

In [None]:
record_store_lr_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist','label','format_text','master_id','format_name'])),
    ('record_store_column_remover', ColumnRemover(record_store_lr_removal_columns)),
    ('preprocessing',record_store_lr_transformer),
    ('scaler', StandardScaler()),
    ('lr', GridSearchCV(
        LinearRegression(),
        param_grid={
            'normalize':[False,True]
        },
        cv=KFold(n_splits=5,shuffle=True),
        scoring='neg_mean_absolute_error',
        verbose=5,
        n_jobs=-1
    ))
])

In [36]:
record_store_lr_pipe.fit(X_tr,np.log(y_tr).values)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   27.2s remaining:   18.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   29.3s finished


Pipeline(memory=None,
         steps=[('running_time_imputer',
                 RunningTimeImputer(number_of_tracks='number_of_tracks',
                                    running_time='running_time')),
                ('leave_one_out_encoding',
                 LeaveOneOutEncoder(cols=['artist', 'label', 'format_text',
                                          'master_id', 'format_name'],
                                    drop_invariant=False,
                                    handle_missing='value',
                                    handle_unknown='value', random_state=None,
                                    return_df=True, sigm...
                              estimator=LinearRegression(algorithm='eig', fit_intercept=True, normalize=False, handle=<cuml.common.handle.Handle object at 0x7f9fbeac6f90>, verbose=False, output_type='input'),
                              iid='deprecated', n_jobs=-1,
                              param_grid={'normalize': [False, True]},
           

In [37]:
record_store_lr_pipe['lr'].best_score_

-0.5663880677265023

In [38]:
lr_pred = record_store_lr_pipe.predict(X_te)



In [39]:
r2_score(y_te,np.exp(lr_pred))

0.16651828811419078

In [40]:
mean_absolute_error(y_te,np.exp(lr_pred))

8.61069074312185

In [None]:
from cuml import Ridge

In [None]:
record_store_ridge_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist','label','format_text','format_name','master_id'])),
    ('record_store_column_remover', ColumnRemover(record_store_lr_removal_columns)),
    ('preprocessing',record_store_lr_transformer),
    ('scaler', StandardScaler()),
    ('ridge', GridSearchCV(
        Ridge(normalize=False,solver='eig'),
        param_grid={
            'alpha': np.linspace(200,300,10),
        },
        cv=KFold(n_splits=5,shuffle=True),
        scoring='neg_mean_absolute_error',
        verbose=5,
        n_jobs=-1
    ))
])

In [43]:
record_store_ridge_pipe.fit(X_tr,np.log(y_tr).values)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   32.6s finished


Pipeline(memory=None,
         steps=[('running_time_imputer',
                 RunningTimeImputer(number_of_tracks='number_of_tracks',
                                    running_time='running_time')),
                ('leave_one_out_encoding',
                 LeaveOneOutEncoder(cols=['artist', 'label', 'format_text',
                                          'format_name', 'master_id'],
                                    drop_invariant=False,
                                    handle_missing='value',
                                    handle_unknown='value', random_state=None,
                                    return_df=True, sigm...
                              estimator=Ridge(alpha=1.0, solver='eig', fit_intercept=True, normalize=False, handle=<cuml.common.handle.Handle object at 0x7f9fc1ca3710>, output_type='input'),
                              iid='deprecated', n_jobs=-1,
                              param_grid={'alpha': array([200.        , 211.11111111, 222.22222222, 

In [44]:
record_store_ridge_pipe['ridge'].best_params_

{'alpha': 200.0}

In [45]:
ridge_pred = record_store_ridge_pipe.predict(X_te)



In [46]:
r2_score(y_te,np.exp(ridge_pred))

0.1668520763835617

In [47]:
mean_absolute_error(y_te,np.exp(ridge_pred))

8.609243004184272

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
random_forest_param_grid = {
    'n_trees': np.linspace(50,350,50),
    'max_depth': np.linspace(10,30,5)
}

In [None]:
record_store_random_forest_removal_columns = [
    'market_price','units_for_sale','have','want','average_rating','rating_count','last_sold','lowest','median',
    'highest','track_titles','country','genre','style','community_have','community_want','formats','thumb_url',
    'release_url','format_description','days_since_last_sale','title'
    ]
record_store_random_forest_removal_columns.extend(
    column_store._indicator
)

In [None]:
full_information_transformer = ColumnTransformer(transformers=[
    ('units_for_sale_imputer', SimpleImputer(strategy='constant',fill_value=0),['units_for_sale']),
    ('average_rating_imputer', SimpleImputer(strategy='mean'),['average_rating']),
    ('year_encoder', OneHotEncoder(dtype=np.uint8), ['year'])
], remainder='passthrough')

In [None]:
class IndicatorReducer(BaseEstimator,TransformerMixin):
    def __init__(self, indicators, algorithm,components,reduced_column_prefix= 'indicator_reduced'):
        self.indicators = indicators
        self.algorithm = algorithm
        self.components = components
        self.reduced_column_prefix = reduced_column_prefix
  
    def fit(self,X,y=None):
        return self

    def transform(self,X,y=None):
        X = X.copy()

        algorithm = self.algorithm(n_components=self.components)

        reduced_indicators = pd.DataFrame(
            algorithm.fit_transform(X.loc[:,self.indicators].values),
            index=X.index,
            columns=['_'.join([self.reduced_column_prefix,str(i)]) for i in range(self.components)]
            )

        return pd.concat([X, reduced_indicators],axis=1)

In [None]:
from cuml import RandomForestRegressor

In [65]:
random_forest_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist','label','format_text','format_name','master_id'])),
    ('reduce_indicators', IndicatorReducer(indicators=column_store._indicator,algorithm=UMAP,components=40)),
    ('record_store_column_remover', ColumnRemover(record_store_random_forest_removal_columns)),
    ('cast_to_32', FunctionTransformer(func=lambda x: x.values.astype(np.float32))),
    ('random_forest', GridSearchCV(
        RandomForestRegressor(seed=0,accuracy_metric='mean_ae',n_bins=20,split_criterion=3,max_depth=12,n_estimators=100),
        cv=KFold(n_splits=5,shuffle=True),
        param_grid={
            'n_bins': np.linspace(75,100,5).astype(int),
        },
        verbose=50,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
    ))
])

  


In [None]:
df_sample = pd.concat([X_tr,y_tr],axis=1).sample(frac=0.1)
X_tr_sample = df_sample.drop('market_value',axis=1)
y_tr_sample = df_sample['market_value']

In [67]:
random_forest_pipe.fit(X_tr_sample,np.log(y_tr_sample).values.astype(np.float32))



Fitting 5 folds for each of 25 candidates, totalling 125 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  7



[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 46.7min
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed: 47.0min
[Parallel(n_jobs=-1)]: Done  79 tasks      | elapsed: 47.6min
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed: 48.0min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 50.0min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 50.4min
[Parallel(n_jobs=-1)]: Done  83 tasks      | elapsed: 51.0min
[Parallel(n_jobs=-1)]: Done  84 tasks      | elapsed: 51.3min
[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed: 53.4min
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed: 53.9min
[Parallel(n_jobs=-1)]: Done  87 tasks      | elapsed: 54.6min
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed: 54.9min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed: 57.0min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 57.5min
[Parallel(n_jobs=-1)]: Done  91 tasks      | elapsed: 58.4min
[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed: 58.7min
[Paralle



Pipeline(memory=None,
         steps=[('running_time_imputer',
                 RunningTimeImputer(number_of_tracks='number_of_tracks',
                                    running_time='running_time')),
                ('leave_one_out_encoding',
                 LeaveOneOutEncoder(cols=['artist', 'label', 'format_text',
                                          'format_name', 'master_id'],
                                    drop_invariant=False,
                                    handle_missing='value',
                                    handle_unknown='value', random_state=None,
                                    return_df=True, sigm...
                              estimator=RandomForestRegressor(n_estimators=100, max_depth=15, handle=<cuml.common.handle.Handle object at 0x7f9fbf89c1f0>, max_features='auto', n_bins=20, n_streams=8, split_algo=1, split_criterion=3, bootstrap=True, bootstrap_features=False, verbose=False, min_rows_per_node=2, rows_sample=1.0, max_leaves=-1, accurac

In [68]:
random_forest_pipe['random_forest'].best_score_

-0.027724644914269448

In [69]:
random_forest_pipe['random_forest'].best_params_

{'max_depth': 12, 'n_bins': 100}

In [70]:
random_forest_pred = random_forest_pipe.predict(X_te)



In [71]:
r2_score(y_te,np.exp(random_forest_pred))

-0.08538489558775608

In [72]:
mean_absolute_error(y_te,np.exp(random_forest_pred))

10.634478758743661

## Catboost

In [None]:
!pip install catboost
from catboost import CatBoostRegressor

In [None]:
record_store_catboost_removal_columns = [
    'market_price','units_for_sale','have','want','average_rating','rating_count','last_sold','lowest','median',
    'highest','track_titles','genre','style','community_have','community_want','formats','thumb_url',
    'release_url','format_description','days_since_last_sale','title'
    ]
record_store_catboost_removal_columns.extend(
    column_store._indicator
)

In [None]:
catboost_param_grid = {
        'depth': np.linspace(1,16,5).astype(int),
        'learning_rate' : np.logspace(-4,-1,5),
        'l2_leaf_reg': np.linspace(1,50,5).astype(int),
        'iterations': np.linspace(50,250,3),
        'n_estimators': np.linspace(20,200,5).astype(int)
}

In [None]:
catboost_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('reduce_indicators', IndicatorReducer(indicators=column_store._indicator-column_store._geography,algorithm=UMAP,components=40)),
    ('record_store_column_remover', ColumnRemover(record_store_catboost_removal_columns)),   
    ('grid_search', GridSearchCV(
        CatBoostRegressor(
            random_seed=0,
            cat_features=['year','format_text','master_id','format_name','artist','label','country'],
            one_hot_max_size=25,
            task_type='GPU',
            devices='0:1',
            boosting_type='Ordered',
            verbose=True            
        ),
        param_grid={'depth': np.linspace(1,16,5).astype(int)},
        n_jobs=1,
        scoring='neg_mean_absolute_error',
        verbose=50)
    )
])

In [None]:
catboost_pipe.fit(X_tr_sample,np.log(y_tr_sample))

In [None]:
catboost_pipe.best_score_

In [None]:
catboost_pipe.best_params_

In [None]:
catboost_pred = catboost_pipe.predict(X_te)

In [None]:
r2_score(y_te,np.exp(catboost_pred))

In [None]:
mean_absolute_error(y_te,np.exp(catboost_pred))