## TabNet

In [None]:
def upgrade_runtime_ram():
    """Deliberately exhaust memory when the Colab runtime has too little RAM.

    The function returns without doing anything when

    * ``google.colab`` cannot be imported (not a Colab runtime),
    * ``MemTotal`` cannot be read from ``/proc/meminfo``, or
    * ``MemTotal`` is already above 17,000,000 kB.

    Otherwise it appends to a list forever until the process runs out of
    memory, so in that case the call never returns normally.
    NOTE(review): presumably the crash makes Colab offer a larger-RAM
    runtime -- confirm this still works.
    """
    # Never run the allocation loop outside Colab: on a local machine it
    # would simply eat all of the user's memory.
    try:
        import google.colab  # noqa: F401
    except ImportError:
        return

    mem_total_kb = None
    for entry in subprocess.getoutput('cat /proc/meminfo').split('\n'):
        # partition() copes with lines that contain no colon (e.g. an error
        # message from `cat`), which used to raise IndexError/ValueError.
        key, _, value = entry.partition(':')
        if key.strip() == 'MemTotal':
            fields = value.split()
            if fields and fields[0].isdigit():
                mem_total_kb = int(fields[0])
            break

    if mem_total_kb is None or mem_total_kb > 17000000:
        return

    a = []
    while True:
        a.append('1')

In [None]:
def restart_runtime():
    """Kill the current Python process by sending it signal 9.

    Nothing after this call runs in the current process.
    NOTE(review): the name suggests the hosting environment starts a fresh
    runtime afterwards -- that is not visible from this code.
    """
    own_pid = os.getpid()
    os.kill(own_pid, 9)

In [None]:
def setup_rapids():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    device_name = pynvml.nvmlDeviceGetName(handle)
    if (device_name != b'Tesla T4') and (device_name != b'Tesla P4') and (device_name != b'Tesla P100-PCIE-16GB'):
        print("Wrong GPU - Restarting Runtime")
        restart_runtime()


    # clone RAPIDS AI rapidsai-csp-utils scripts repo
    !git clone https://github.com/rapidsai/rapidsai-csp-utils.git

    # install RAPIDS
    !bash rapidsai-csp-utils/colab/rapids-colab.sh 0.13


    # set necessary environment variables 
    dist_package_index = sys.path.index('/usr/local/lib/python3.6/dist-packages')
    sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.6/site-packages'] + sys.path[dist_package_index:]
    sys.path

    # update pyarrow & modules 
    exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

In [None]:
def setup_conda():
    if not 'Miniconda3-4.5.4-Linux-x86_64.sh' in os.listdir():
        !wget https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh && bash Miniconda3-4.5.4-Linux-x86_64.sh -bfp /usr/local

    if not ('EPFL-Capstone-Project' in os.listdir()) and (os.getcwd().split('/')[-1] != 'EPFL-Capstone-Project'):
        !git clone https://github.com/helmigsimon/EPFL-Capstone-Project  
    if 'EPFL-Capstone-Project' in os.listdir():
        os.chdir('EPFL-Capstone-Project')

    !conda env create -f environment.yml
    !conda activate exts-ml

In [None]:
def setup_drive():
    """Mount Google Drive at ``/content/drive``.

    The ``google.colab.drive`` module is bound to the module-level name
    ``drive``. The import raises ``ModuleNotFoundError`` when ``google.colab``
    is not installed.
    """
    global drive
    from google.colab import drive

    mount_point = '/content/drive'
    drive.mount(mount_point)

In [None]:
# Environment bootstrap. The try body performs the Colab-specific setup; if any
# step raises ModuleNotFoundError (for example the `google.colab` import inside
# setup_drive), the handler below assumes a local run and falls back to the
# `umap` package for UMAP instead of `cuml`.
try:
    import sys,os,subprocess
    
    # Runs before the Drive mount; see upgrade_runtime_ram for what it does.
    upgrade_runtime_ram()
    setup_drive()

    #Setting up PyPI Packages
    !pip install geopandas sparse-dot-topn pdpipe category-encoders
    import geopandas as gpd
    import sparse_dot_topn.sparse_dot_topn as ct
    import pdpipe as pdp
    import category_encoders

    #Setting up Conda Packages
    setup_conda()
    !pip install tabnet[gpu]
    
    #Initializing NLTK (downloads the 'stopwords' and 'punkt' resources)
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')
    
    #Setting up RAPIDS AI
    import pynvml
    setup_rapids()
    
    # GPU implementation of UMAP, only reached when every step above succeeded
    from cuml import UMAP
    
except ModuleNotFoundError as e:
    # Only ModuleNotFoundError is handled; any other exception propagates.
    print(e)
    print('Not in colab environment, continuing to run locally')
    from umap import UMAP

In [None]:
import tensorflow as tf
import tabnet
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders import LeaveOneOutEncoder
from sklearn.compose import ColumnTransformer

In [None]:
from lib.processing import load_from_pkl
from data.util.paths import DATA_PATH
from lib.pipelines import *

In [None]:
# Load the pickled 'api' and 'extracted' datasets from DATA_PATH.
api_df = load_from_pkl('api', DATA_PATH)
extracted_df = load_from_pkl('extracted', DATA_PATH)

In [None]:
# Fit `api_pipe` on the API data and replace the frame with its transformed output.
api_df = api_pipe.fit_transform(api_df)

In [None]:
# Fit `extracted_pipe` on the extracted data and replace the frame with its transformed output.
extracted_df = extracted_pipe.fit_transform(extracted_df)

In [None]:
# Build one frame from the 'release_id', 'bitmap' and 'features' arrays stored
# in the .npz archive, placed side by side in that order. The column labels
# assume 1 + 1 + 1280 columns in total.
with np.load(os.path.join(DATA_PATH, 'high_level_features_labelled.npz')) as data:
    image_embedding_df = pd.concat(
        [pd.DataFrame(data[section]) for section in ('release_id', 'bitmap', 'features')],
        axis=1,
    )
    image_embedding_df.columns = ['release_id', 'bitmap'] + [
        f'feature_{i}' for i in range(1, 1281)
    ]

In [None]:
# Standardise the 1280 'feature_*' columns of the image embeddings.
scaler = StandardScaler()
image_embeddings_scaled = scaler.fit_transform(
    image_embedding_df[[f'feature_{i}' for i in range(1, 1281)]]
)
# 10-component PCA; only instantiated here, not fitted yet.
pca = PCA(n_components=10)

In [None]:
# Fit the PCA on the standardised embeddings and project them onto its components.
image_embeddings_reduced = pca.fit_transform(image_embeddings_scaled)

In [None]:
# Wrap the reduced embeddings in a DataFrame with one 'images_umap_<i>' column
# per component, then attach the matching release_id column in front.
image_embeddings_reduced = pd.DataFrame(
    image_embeddings_reduced,
    columns=[f'images_umap_{i}' for i in range(image_embeddings_reduced.shape[1])],
)
image_embeddings_reduced = pd.concat(
    [image_embedding_df['release_id'], image_embeddings_reduced],
    axis=1,
)


Combining datasets

In [None]:
# Inner-join the API data, the extracted data and the reduced image embeddings
# on release_id; only releases present in all three survive.
df = (
    api_df
    .merge(extracted_df, how='inner', on='release_id')
    .merge(image_embeddings_reduced, how='inner', on='release_id')
)

In [None]:
# Drop the names of the intermediate frames/arrays now that `df` holds the merged data.
del api_df, extracted_df, image_embedding_df, image_embeddings_scaled, image_embeddings_reduced

In [None]:
# One-hot encode 'year' and 'format_name' (uint8 output, unknown categories
# ignored); every other column is passed through unchanged.
record_store_tabnet_transformer = ColumnTransformer(
    transformers=[
        (step_name, OneHotEncoder(dtype=np.uint8, handle_unknown='ignore'), [column])
        for step_name, column in (
            ('year_encoder', 'year'),
            ('format_name', 'format_name'),
        )
    ],
    remainder='passthrough',
)

In [None]:
# Columns handed to ColumnRemover in the TabNet preprocessing pipeline.
record_store_tabnet_removal_columns = [
    'market_price', 'units_for_sale', 'have', 'want',
    'average_rating', 'rating_count', 'last_sold',
    'lowest', 'median', 'highest',
    'track_titles', 'country', 'genre', 'style',
    'community_have', 'community_want',
    'formats', 'thumb_url', 'release_url', 'format_description',
    'days_since_last_sale', 'title',
]

In [None]:
# Preprocessing chain applied before the data is fed to TabNet, in order:
# RunningTimeImputer -> leave-one-out encoding of four categorical columns ->
# removal of the listed columns -> one-hot ColumnTransformer -> standard scaling.
record_store_tabnet_pipe = Pipeline(
    [
        (
            'running_time_imputer',
            RunningTimeImputer('running_time', 'number_of_tracks'),
        ),
        (
            'leave_one_out_encoding',
            LeaveOneOutEncoder(cols=['artist', 'label', 'format_text', 'master_id']),
        ),
        (
            'record_store_column_remover',
            ColumnRemover(record_store_tabnet_removal_columns),
        ),
        ('preprocessing', record_store_tabnet_transformer),
        ('scaler', StandardScaler()),
    ]
)

In [None]:
# Fit the pipeline on every column except the target and transform them;
# 'market_value' is supplied separately as y.
df_arr = record_store_tabnet_pipe.fit_transform(
    df.drop(columns='market_value'),
    df['market_value'],
)

In [None]:
# Append log(market_value) as the last column of the feature matrix, then
# shuffle the rows in place (unseeded, so the order differs between runs).
df_pretensor = np.hstack(
    (df_arr, np.log(df['market_value'].values)[:, np.newaxis])
)
np.random.shuffle(df_pretensor)

In [None]:
# One dataset element per row of df_pretensor (features followed by the log target).
ds = tf.data.Dataset.from_tensor_slices(df_pretensor)

In [None]:
def transform(ds):
    """Split one dataset element into a ``(features, target)`` pair.

    The element is unstacked into its components. The last component is
    returned as the target; the remaining ones are paired positionally with
    the module-level ``col_names`` and returned as a dict.

    Args:
        ds: a single element of the dataset (despite the name, not the
            dataset itself when used with ``Dataset.map``).

    Returns:
        Tuple ``(x, y)`` of the name -> value dict and the target value.
    """
    components = tf.unstack(ds)
    target = components[-1]
    named_features = dict(zip(col_names, components[:-1]))
    return named_features, target

In [None]:
# One generated name per feature column of the transformed matrix. The count is
# taken from df_arr instead of being hard-coded (previously 975), because
# `transform` zips these names with the features and would silently drop any
# feature beyond the last name if the two lengths ever disagreed.
col_names = ['column_%s' % i for i in range(df_arr.shape[1])]

In [None]:
# Training split: the first 80% of the (already shuffled) rows, mapped to
# (features, target) pairs and grouped into batches of BATCH_SIZE.
BATCH_SIZE = 1000
train_size = int(0.8 * len(df))
ds_train = ds.take(train_size).map(transform).batch(BATCH_SIZE)

In [None]:
# Held-out split: everything after the first `train_size` rows, prepared the
# same way as the training split.
ds_test = ds.skip(train_size).map(transform).batch(BATCH_SIZE)

In [None]:
# One numeric feature column per generated column name.
feature_columns = [tf.feature_column.numeric_column(column) for column in col_names]

In [None]:
# TabNet regressor over the numeric feature columns.
# `num_regressors` is the number of regression outputs, not the number of input
# features: the target is the single log(market_value) column, so it is 1
# (it was previously set to len(feature_columns)).
# NOTE(review): feature_dim equals output_dim here -- confirm the installed
# tabnet version accepts that combination.
model = tabnet.TabNetRegressor(
    feature_columns,
    feature_dim=64,
    output_dim=64,
    num_regressors=1,
    num_decision_steps=2,
    relaxation_factor=1,
    sparsity_coefficient=1e-2,
)

In [None]:
# Adam driven by a smooth (non-staircase) exponential decay schedule:
# starts at 0.01 and is multiplied by 0.9 per 100 steps.
lr = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_steps=100,
    decay_rate=0.9,
    staircase=False,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

In [None]:
# Train on mean squared error and additionally report mean absolute error.
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)

In [None]:
# Train for 50 epochs, evaluating on the held-out split after each epoch.
# NOTE(review): Keras documents verbose values 0, 1 and 2; 100 is outside that
# set -- confirm the intended logging mode.
model.fit(ds_train,epochs=50,validation_data=ds_test,verbose=100)