In [1]:
from IPython.core.display import HTML, display
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
# Specify the path to the fastai directory
lib_PATH = '/home/ubuntu/fastai/fastai'   # Update this
!cd {lib_PATH}
# Create sym-link to the fastai library
!ln -s {lib_PATH} ./

ln: failed to create symbolic link './fastai': File exists


In [4]:
# Load required modules
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from dateutil.relativedelta import relativedelta

# Load Cleaned Data

In [5]:
train = pd.read_feather('/home/ubuntu/wsmd_music_rec/data/tmp/wsdm_train')
test = pd.read_feather('/home/ubuntu/wsmd_music_rec/data/tmp/wsdm_test')

In [None]:
train.columns[train.isnull().any()]

In [6]:
df, y, nas = proc_df(train, 'target')    # proc_df deals with na values

In [None]:
df.columns[df.isnull().any()]     # no nas left

# Split into Training + Validation Sets

In [7]:
def split_vals(a, n): return a[:n].copy(), a[n:].copy()
# https://www.kaggle.com/kamilkk/i-have-to-say-this
# temporally ordered data - set val. set = test set's length
len_valid = len(test)
n_trn = len(df) - len_valid
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape

((4820628, 37), (4820628,), (2556790, 37))

# Train the Random Forest

In [8]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    auc_trn   = roc_auc_score(y_train, m.predict(X_train))
    auc_valid = roc_auc_score(y_valid, m.predict(X_valid))
    res = [auc_trn, auc_valid]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print('auc_trn, auc_valid')
    print(res)

In [9]:
m = RandomForestClassifier(n_estimators=200, min_samples_leaf=20, n_jobs=-1, verbose=1)
%time m.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  5.9min


CPU times: user 1h 45min 27s, sys: 32.3 s, total: 1h 46min
Wall time: 6min 50s


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  6.8min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [10]:
print_score(m)

[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    7.3s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:   42.0s
[Parallel(n_jobs=16)]: Done 200 out of 200 | elapsed:   49.1s finished
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    3.7s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:   21.1s
[Parallel(n_jobs=16)]: Done 200 out of 200 | elapsed:   24.6s finished


auc_trn, auc_valid
[0.76428250379470886, 0.63245587912794277]


# Retrain Model Using All Data

In [11]:
def print_score_final(m):
    auc_trn = roc_auc_score(y, m.predict(df))
    res = [auc_trn]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print('auc_trn')
    print(res)

In [12]:
m_final = RandomForestClassifier(n_estimators=200, min_samples_leaf=20, n_jobs=-1, verbose=1)
%time m_final.fit(df, y)

[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  9.5min


CPU times: user 2h 49min 42s, sys: 48.2 s, total: 2h 50min 30s
Wall time: 11min 4s


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 11.0min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [23]:
print_score_final(m_final)

[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   18.1s
[Parallel(n_jobs=16)]: Done  70 out of  70 | elapsed:   47.2s finished


auc_trn
[0.96174800496768165]


# Make Predictions

In [13]:
train_cats(test)

In [14]:
test.dtypes

id                             int64
msno                        category
song_id                     category
source_system_tab           category
source_screen_name          category
source_type                 category
song_length                   uint32
genre_ids                   category
artist_name                 category
composer                    category
lyricist                    category
language                        int8
city                           int64
bd                             int64
gender                      category
registered_via                 int64
expiration_date                int64
membership_days                int64
registration_year              int64
registration_month             int64
registration_date              int64
expiration_year                int64
expiration_month               int64
name                        category
song_year                      int64
genre_ids_count                 int8
lyricists_count                 int8
c

In [15]:
test_df, _, _ = proc_df(test, 'id')

In [16]:
%time predictions = m_final.predict(test_df)

[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    3.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:   18.2s


CPU times: user 5min 28s, sys: 2.81 s, total: 5min 31s
Wall time: 22.5 s


[Parallel(n_jobs=16)]: Done 200 out of 200 | elapsed:   21.5s finished


In [22]:
output = pd.concat([test['id'], pd.DataFrame(predictions)], axis=1)
output.columns = ['id', 'target']   #Changing column names
output.to_csv('predicted.csv', index=False)