## 03 - Modelling

## Optional - Colab Setup

In [None]:
def restart_runtime():
    """Terminate the current kernel process so the runtime has to be restarted.

    Sends signal 9 (SIGKILL on Linux) to this process's own PID, so the call
    does not return.
    """
    current_pid = os.getpid()
    os.kill(current_pid, 9)

In [None]:
def setup_rapids():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    device_name = pynvml.nvmlDeviceGetName(handle)
    if (device_name != b'Tesla T4') and (device_name != b'Tesla P4') and (device_name != b'Tesla P100-PCIE-16GB'):
        print("Wrong GPU - Restarting Runtime")
        restart_runtime()


    # clone RAPIDS AI rapidsai-csp-utils scripts repo
    !git clone https://github.com/rapidsai/rapidsai-csp-utils.git

    # install RAPIDS
    !bash rapidsai-csp-utils/colab/rapids-colab.sh 0.13


    # set necessary environment variables 
    dist_package_index = sys.path.index('/usr/local/lib/python3.6/dist-packages')
    sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.6/site-packages'] + sys.path[dist_package_index:]
    sys.path

    # update pyarrow & modules 
    exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

In [None]:
def setup_conda():
    if not 'Miniconda3-4.5.4-Linux-x86_64.sh' in os.listdir():
        !wget https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh && bash Miniconda3-4.5.4-Linux-x86_64.sh -bfp /usr/local

    if not ('EPFL-Capstone-Project' in os.listdir()) and (os.getcwd().split('/')[-1] != 'EPFL-Capstone-Project'):
        !git clone https://github.com/helmigsimon/EPFL-Capstone-Project  
    os.chdir('EPFL-Capstone-Project')

    !conda env create -f environment.yml
    !conda activate exts-ml

In [None]:
def setup_drive():
    """Mount Google Drive at ``/content/drive``.

    The ``google.colab.drive`` module is bound to the global name ``drive``.
    When ``google.colab`` is not installed the import raises
    ``ModuleNotFoundError``.
    """
    global drive
    from google.colab import drive

    mount_point = '/content/drive'
    drive.mount(mount_point)

In [5]:
# Colab bootstrap. The steps inside `try` run in order and stop at the first
# ModuleNotFoundError. Where `google.colab` is not installed, `setup_drive()`
# raises it on its import, so everything after it is skipped and the `except`
# branch runs instead.
try:
    import sys,os
    
    setup_drive()

    #Setting up PyPi Packages
    !pip install geopandas sparse-dot-topn pdpipe 
    import geopandas as gpd
    import sparse_dot_topn.sparse_dot_topn as ct
    import pdpipe as pdp

    #Setting up Conda Packages
    setup_conda()
    
    #Initializing NLTK
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')
    
    #Setting up RAPIDS AI
    # pynvml must be imported before setup_rapids(), which uses it as a global.
    import pynvml
    setup_rapids()
    
    # UMAP from cuML; later cells use whichever `UMAP` name ends up bound here.
    from cuml import UMAP
    
# Only ModuleNotFoundError is handled; any other exception raised above
# propagates out of the cell.
except ModuleNotFoundError as e:
    print(e)
    print('Not in colab environment, continuing to run locally')
    # Local fallback: UMAP from the `umap` package instead of cuML.
    from umap import UMAP

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Solving environment: failed

ResolvePackageNotFound: 
  - geos==3.8.1=h4a8c4bd_0
  - geotiff==1.5.1=h4bdff65_9
  - freexl==1.0.5=h1de35cc_1002
  - cairo==1.16.0=hec6a9b0_1003
  - libedit==3.1.20170329=hcfe32e1_1001
  - libprotobuf==3.11.4=hd174df1_0
  - llvm-openmp==9.0.1=h28b9765_2
  - tiledb==1.7.0=hd5e958f_2
  - llvmlite==0.31.0=py36hde82470_1
  - tk==8.6.10=hbbe82c9_0
  - psutil==5.7.0=py36h37b9a7d_1
  - xerces-c==3.2.2=h8f8adb3_1004
  - psycopg2==2.8.4=py36hafa8578_1
  - appnope==0.1.0=py36h9f0ad1d_1001
  - libpng==1.6.37=hbbe82c9_1
  - glib==2.58.3=py36hb0ce7ff_1003
  - libsodium==1.0.17=h01d97ff_0
  - tensorflow==1.12.0=mkl_py36h2b2bbaf_0
  - lz4-c==1.8.3=h6de7cb9_1001
  - libuv==1.34.0=h0b31af3_0
  - libgfortran==4.0.0=2
  - graphviz==2.42.3=h98dfb87_0
  - bzip2==1.0.8=h0b31af3_2
  - tensorflow-base==1.12.0=mkl_py36h70e0e9a_0
  - giflib==5.2.1=h0b31af

## Imports

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split,  StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
tqdm.pandas()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

In [None]:
from lib.transformers import *
from lib.pipelines import *
from lib.processing import save_to_pkl, load_from_pkl
from data.util.paths import DATA_PATH

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [10]:
!pip install category_encoders
from category_encoders.leave_one_out import LeaveOneOutEncoder



## Setting up DataFrame

In [None]:
# Load the two pickled source tables.
api_df = load_from_pkl('api', DATA_PATH)
extracted_df = load_from_pkl('extracted', DATA_PATH)

# Rebuild the image-embedding table from the .npz archive by placing its
# 'release_id', 'bitmap' and 'features' arrays side by side. The labels below
# assume 1 + 1 + 1280 columns in total.
with np.load(os.path.join(DATA_PATH, 'high_level_features_labelled.npz')) as data:
    image_embedding_df = pd.concat(
        [pd.DataFrame(data[section]) for section in ('release_id', 'bitmap', 'features')],
        axis=1,
    )
    image_embedding_df.columns = (
        ['release_id', 'bitmap']
        + ['feature_{}'.format(i) for i in range(1, 1281)]
    )

In [None]:
# Inner-join the three sources on release_id, so only releases present in all
# of them are kept.
df = (
    api_df
    .merge(extracted_df, how='inner', on='release_id')
    .merge(image_embedding_df, how='inner', on='release_id')
)

# The individual source frames are not used again; drop the names.
del api_df, extracted_df, image_embedding_df

In [13]:
# Column-group specification handed to ColumnStore.fit below.
# NOTE(review): how ColumnStore interprets each value is defined in lib and not
# visible here. The string values look like column-name prefixes, and `None`
# and the callable presumably select columns by null-ness / by predicate;
# confirm against ColumnStore.
col_set = {
    'format': {
        'description': 'format_description_', 
        'name': 'format_name_', 
        # NOTE(review): the parentheses below do not create tuples; both values
        # are plain strings. Confirm that is what ColumnStore expects.
        'text': ('format_text_clean'),
        'quantity': ('format_quantity')
    },
    'geography': {
        'superregion': 'superregion_',
        'region': 'region_',
        'country': 'country_'
    },
    'timeperiod': {
        'period': 'period_',
        'era': 'era_'
    },
    'genre': 'genre_',
    'style': 'style_',
    'null': None,
    # True for a column whose maximum is 1 and whose minimum is 0.
    'indicator': lambda x: x.max() == 1 and x.min() == 0,
    # 'feature_' matches the feature_1 … feature_1280 labels assigned when the
    # image-embedding frame was built.
    'image_embedding': 'feature_'
}
column_store = ColumnStore()
column_store.fit(df,col_set)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




## Reducing Dimensionality of Image Embeddings

In [None]:
# Reducer that will project the embeddings down to 10 components.
umap = UMAP(n_components=10)

# Standardise the image-embedding columns (taken in sorted-name order) first.
scaler = StandardScaler()
image_embeddings_scaled = scaler.fit_transform(
    df[sorted(column_store._image_embedding)]
)


In [15]:
# Fit UMAP on the standardised embeddings and keep the 10-component projection.
image_embeddings_reduced = umap.fit_transform(image_embeddings_scaled)



In [None]:
# Append the reduced embedding to df as columns umap_0, umap_1, ...
# The new frame reuses df's index: concat(axis=1) aligns rows on index labels,
# so a frame with a fresh 0..n-1 index would be mis-paired (and padded with
# NaN) whenever df's own index is not that default range.
df = pd.concat([
      df,
      pd.DataFrame(
          image_embeddings_reduced,
          index=df.index,
          columns=['umap_%s' % i for i in range(image_embeddings_reduced.shape[1])]
      )],
      axis=1
)

In [None]:
# Feature-engineering pipeline applied to the merged frame before modelling.
# The transformers (and clean_text_pipe) come from lib; the step names state
# each step's intent.
universal_processing_pipe = Pipeline(steps=[
    ('unpickle',
     Unpickler(['track_titles', 'genre', 'style', 'label', 'formats'])),
    ('make_market_value',
     ColumnCombiner('median', 'market_price', 'market_value')),
    ('remove_duplicates',
     DuplicateRemover('release_id')),
    ('remove_nulls',
     NullRemover('market_value')),
    ('count_standards',
     StandardCountEncoder('track_titles', DATA_PATH)),
    ('count_days_since_last_sale',
     LastSoldEncoder(feature='last_sold', new_feature='days_since_last_sale')),
    ('split_title',
     TitleSplitter()),
    ('clean_text',
     clean_text_pipe),
    ('make_format_columns',
     FormatEncoder()),
])

In [None]:
# Keep the transformed frame: the split in the next section reads
# `market_value`, which this pipeline creates. Assigning the result is correct
# whether the lib transformers return a new frame or modify df in place.
df = universal_processing_pipe.fit_transform(df)

# Show a small preview instead of the full frame.
df.head()

## Preparing for Machine Learning

In [None]:
# Hold out a test set. The seed is fixed so that a re-run of the notebook
# produces the same split, and therefore comparable scores.
X_tr, X_te, y_tr, y_te = train_test_split(
    df.drop('market_value', axis=1),
    df.market_value,
    random_state=0,
)

In [None]:
# The full frame is not used after the split; drop the name.
del df

Of the above columns, we will only handle ``running_time``, ``average_rating`` and ``units_for_sale``. The remaining columns do not need to be handled for the models we plan to build.

In [None]:
# Column-wise preprocessing for the "full information" feature set:
#   - units_for_sale: missing values become 0
#   - average_rating: missing values become the column mean
#   - year: one-hot encoded
# All other columns pass through unchanged.
full_information_transformer = ColumnTransformer(transformers=[
    ('units_for_sale_imputer', SimpleImputer(strategy='constant',fill_value=0),['units_for_sale']),
    ('average_rating_imputer', SimpleImputer(strategy='mean'),['average_rating']),
    # handle_unknown='ignore': a year that appears only in the held-out data is
    # encoded as all zeros instead of raising at transform time.
    ('year_encoder', OneHotEncoder(dtype=np.uint8, handle_unknown='ignore'), ['year'])
], remainder='passthrough')

In [None]:
# Column-wise preprocessing for the "record store" feature set: one-hot encode
# `year`, pass every other column through unchanged.
record_store_transformer = ColumnTransformer(transformers=[
    # handle_unknown='ignore': a year that appears only in the held-out data is
    # encoded as all zeros instead of raising at transform/predict time.
    ('year_encoder', OneHotEncoder(dtype=np.uint8, handle_unknown='ignore'), ['year'])
], remainder='passthrough')

In [None]:
# Columns dropped before fitting the record-store models.
record_store_ridge_removal_columns = [
    'market_price','units_for_sale','have','want','average_rating','rating_count','last_sold','lowest','median',
    'highest','track_titles','country','genre','style','label','community_have','community_want','formats','master_id','thumb_url',
    'release_url','artist','title','format_description','format_text_clean','format_text',
    # The processing pipeline's LastSoldEncoder is configured with
    # new_feature='days_since_last_sale', so that is the name that must be listed.
    'days_since_last_sale',
    # NOTE(review): older name kept for backward compatibility; confirm whether
    # any column with this name still exists and remove the entry if not.
    'no_of_days_since_last_sale'
    ]

In [None]:
# Grid of L2 regularisation strengths to search.
# The key is plain `alpha` because GridSearchCV wraps the SGDRegressor directly;
# a step-prefixed key such as `ridge__alpha` is only valid when the searched
# estimator is a Pipeline, and is rejected as an invalid parameter otherwise.
record_store_ridge_param_grid = dict(alpha=np.linspace(900,1000,10))

In [None]:
# Linear model for the record-store feature set: imputation, leave-one-out
# target encoding of artist/label, column removal, one-hot encoding of year and
# scaling, followed by a 5-fold grid search over an SGD-trained regressor.
# NOTE(review): the preprocessing steps sit outside the grid search, so they are
# fitted once on all of X_tr rather than per CV fold.
record_store_ridge_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist_clean','label_clean'])),
    ('record_store_column_remover', ColumnRemover(record_store_ridge_removal_columns)),
    ('preprocessing',record_store_transformer),
    ('scaler', StandardScaler()),
    ('ridge', GridSearchCV(
        SGDRegressor(
            early_stopping=True,
            max_iter=1000,
            tol=0.001,
            n_iter_no_change=5,
            # Seeded like the other estimators in this notebook: SGD shuffles
            # the data and early stopping holds out a random validation split.
            random_state=0,
            verbose=1
        ),
        param_grid=record_store_ridge_param_grid,
        cv=5
    ))
])

In [None]:
# Fit on the training split. The target is log-transformed, so the model's
# predictions are on the log scale.
record_store_ridge_pipe.fit(X_tr,np.log(y_tr))

In [None]:
# The fitted grid search is the pipeline's final 'ridge' step.
record_store_ridge_pipe.named_steps['ridge'].best_params_

In [None]:
# Mean cross-validated score of the best parameter setting, read from the
# pipeline's final 'ridge' step (the fitted grid search).
record_store_ridge_pipe.named_steps['ridge'].best_score_

In [None]:
# Predict through the whole pipeline so X_te receives the same preprocessing as
# the training data. The model was fitted on log(y), so these are log-scale
# predictions.
ridge_pred = record_store_ridge_pipe.predict(X_te)

In [None]:
# Both arguments must be on the same scale. The predictions are log-scale, so
# exponentiate them and compare with the untransformed test target.
r2_score(y_te, np.exp(ridge_pred))

In [None]:
# Mean absolute error on the original target scale: the log-scale predictions
# are exponentiated and compared with the untransformed test target.
mean_absolute_error(y_te, np.exp(ridge_pred))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Forest sizes to search: 50, 100, ..., 300 trees.
random_forest_param_grid = dict(
    n_estimators=tuple(range(50, 350, 50)),
)

In [None]:
# Random-forest variant of the record-store pipeline: same preprocessing steps,
# then a 5-fold grid search over the number of trees.
# NOTE(review): criterion='mae' is the spelling used by older scikit-learn
# releases; newer releases call it 'absolute_error'.
random_forest_pipe = Pipeline(steps=[
    ('running_time_imputer',
     RunningTimeImputer('running_time', 'number_of_tracks')),
    ('leave_one_out_encoding',
     LeaveOneOutEncoder(cols=['artist_clean', 'label_clean'])),
    ('record_store_column_remover',
     ColumnRemover(record_store_ridge_removal_columns)),
    ('preprocessing',
     record_store_transformer),
    ('scaler',
     StandardScaler()),
    ('random_forest',
     GridSearchCV(
         estimator=RandomForestRegressor(
             criterion='mae',
             n_jobs=-1,
             random_state=0,
             verbose=50,
         ),
         param_grid=random_forest_param_grid,
         cv=5,
         verbose=50,
     )),
])

In [None]:
# Fit on the training split; the target is log-transformed.
random_forest_pipe.fit(X_tr,np.log(y_tr))

## Extremely Randomized Trees

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

In [None]:
# Extra-trees variant of the record-store pipeline: same preprocessing steps,
# then a grid search over the number of trees.
extra_trees_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist_clean','label_clean'])),
    ('record_store_column_remover', ColumnRemover(record_store_ridge_removal_columns)),
    ('preprocessing',record_store_transformer),
    ('scaler', StandardScaler()),
    # NOTE(review): the step name is kept as 'random_forest' so existing
    # named_steps lookups keep working, although the estimator is ExtraTrees.
    ('random_forest', GridSearchCV(
        ExtraTreesRegressor(
            random_state=0,
            n_jobs=4,
            verbose=50
        ),
        param_grid=random_forest_param_grid,
        # Explicit 5 folds, matching the other grid searches in this notebook
        # rather than depending on the installed scikit-learn's default.
        cv=5,
        verbose=50
    ))
])

In [None]:
# Fit on the training split; the target is log-transformed.
extra_trees_pipe.fit(X_tr,np.log(y_tr))

## Catboost

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Columns dropped before fitting CatBoost.
record_store_catboost_removal_columns = (
    'market_price','units_for_sale','have','want','average_rating','rating_count','last_sold','lowest','median',
    'highest','track_titles','community_have','community_want','formats','master_id','thumb_url',
    'release_url','artist','title','format_description','format_text_clean',
    # The processing pipeline's LastSoldEncoder is configured with
    # new_feature='days_since_last_sale', so that is the name that must be listed.
    'days_since_last_sale',
    # NOTE(review): older name kept for backward compatibility; confirm whether
    # any column with this name still exists and remove the entry if not.
    'no_of_days_since_last_sale'
)


In [None]:
# CatBoost hyper-parameter grid: tree depth, learning rate (5 log-spaced values
# from 1e-4 to 1e-1), L2 leaf regularisation and number of boosting iterations.
catboost_param_grid = dict(
    depth=[4, 7, 10],
    learning_rate=np.logspace(-4, -1, 5),
    l2_leaf_reg=[1, 4, 9],
    iterations=[100, 300, 500],
)

In [None]:
# CatBoost variant of the record-store pipeline, with a grid search over
# catboost_param_grid.
catboost_pipe = Pipeline([
    ('running_time_imputer',RunningTimeImputer('running_time','number_of_tracks')),
    ('leave_one_out_encoding', LeaveOneOutEncoder(cols=['artist_clean','label_clean'])),
    ('record_store_column_remover', ColumnRemover(record_store_catboost_removal_columns)),
    # Deliberately no StandardScaler in front of CatBoost: it would try to cast
    # the categorical columns to float and hand on a bare array, so the
    # `cat_features` names below could no longer be resolved. Boosted trees do
    # not need scaled inputs.
    ('grid_search', GridSearchCV(
        CatBoostRegressor(
            random_state=0,
            cat_features=['year','format_text']),
        param_grid=catboost_param_grid,verbose=5)
    )
])

In [None]:
# Fit on the training split; the target is log-transformed.
catboost_pipe.fit(X_tr,np.log(y_tr))