In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import Imputer
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.svm.libsvm import predict_proba
import lightgbm as lgb

# Import

In [2]:
members = pd.read_csv('members.csv')
songs = pd.read_csv('songs.csv')
song_extra = pd.read_csv('song_extra_info.csv')
train = pd.read_csv('train.csv')
test_sub = pd.read_csv('test.csv')

df = pd.merge(train, songs[['song_id', 'artist_name', 'genre_ids', 'song_length', 'language']], 
              on='song_id', how='left')

test_sub = pd.merge(test_sub, songs[['song_id', 'artist_name', 'genre_ids', 'song_length', 'language']], 
              on='song_id', how='left')

# Getting registration/expiration year, month, and date

In [3]:
# Convert registration init_time to understandable values (year, month, date)
members['registration_year'] = members['registration_init_time'].apply(lambda x: int(str(x)[0:4]))
members['registration_month'] = members['registration_init_time'].apply(lambda x: int(str(x)[4:6]))
members['registration_date'] = members['registration_init_time'].apply(lambda x: int(str(x)[6:8]))

# Convert expiration init_time to understandable values (year, month, date)
members['expiration_year'] = members['expiration_date'].apply(lambda x: int(str(x)[0:4]))
members['expiration_month'] = members['expiration_date'].apply(lambda x: int(str(x)[4:6]))
members['expiration_date'] = members['expiration_date'].apply(lambda x: int(str(x)[6:8]))
members = members.drop(['registration_init_time'], axis=1)

# Merging member into the train set

In [4]:
df = pd.merge(df, members, on='msno', how='left')
test_sub = pd.merge(test_sub, members, on='msno', how='left')

# Extracting the song release year from isrc

In [5]:
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan
        
song_extra['song_year'] = song_extra['isrc'].apply(isrc_to_year)
song_extra.drop(['isrc', 'name'], axis = 1, inplace = True)

# Merging song_extra into the train set

In [6]:
df = pd.merge(df, song_extra, on='song_id', how='left')
test_sub = pd.merge(test_sub, song_extra, on='song_id', how='left')

# Impute values into testing set

In [7]:
# create imputer object
imp = Imputer(missing_values = 'NaN', strategy = 'median')
# impute for Ozone
test_sub['song_length'] = imp.fit_transform(test_sub[['song_length']])
test_sub['language'] = imp.fit_transform(test_sub[['language']])
test_sub['song_year'] = imp.fit_transform(test_sub[['song_year']])

# converting data types

In [8]:
to_cat = ['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'artist_name', 'genre_ids', 'language', 'city', 'gender',
       'registered_via']
for col in to_cat:
    df[col] = df[col].astype('category')
    test_sub[col] = test_sub[col].astype('category')

# More Feature Engineering

In [9]:
#feature engineering
#Create a column for is_featured
def is_featured(x):
    if 'feat' in str(x) :
        return 1
    return 0
df['is_featured'] = df['artist_name'].apply(is_featured).astype(np.int8)
test_sub['is_featured'] = test_sub['artist_name'].apply(is_featured).astype(np.int8)


#Artist count
def artist_count(x):
    if x == 'no_artist':
        return 0
    else:
        return x.count('and') + x.count(',') + x.count('feat') + x.count('&')

df['artist_count'] = df['artist_name'].apply(artist_count).astype(np.int8)
test_sub['artist_count'] = test_sub['artist_name'].apply(artist_count).astype(np.int8)

# number of times a song has been played before
_dict_count_song_played_train = {k: v for k, v in df['song_id'].value_counts().iteritems()}
_dict_count_song_played_test = {k: v for k, v in test_sub['song_id'].value_counts().iteritems()}
def count_song_played(x):
    try:
        return _dict_count_song_played_train[x]
    except KeyError:
        try:
            return _dict_count_song_played_test[x]
        except KeyError:
            return 0

df['count_song_played'] = df['song_id'].apply(count_song_played).astype(np.int64)
test_sub['count_song_played'] = test_sub['song_id'].apply(count_song_played).astype(np.int64)

# number of times the artist has been played
_dict_count_artist_played_train = {k: v for k, v in df['artist_name'].value_counts().iteritems()}
_dict_count_artist_played_test = {k: v for k, v in test_sub['artist_name'].value_counts().iteritems()}
def count_artist_played(x):
    try:
        return _dict_count_artist_played_train[x]
    except KeyError:
        try:
            return _dict_count_artist_played_test[x]
        except KeyError:
            return 0

df['count_artist_played'] = df['artist_name'].apply(count_artist_played).astype(np.int64)
test_sub['count_artist_played'] = test_sub['artist_name'].apply(count_artist_played).astype(np.int64)

# Popping Target and dropping NaN's

In [10]:
ids = test_sub.pop('id')
df.dropna(inplace = True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3814669 entries, 1 to 7377413
Data columns (total 25 columns):
msno                   category
song_id                category
source_system_tab      category
source_screen_name     category
source_type            category
target                 int64
artist_name            category
genre_ids              category
song_length            float64
language               category
city                   category
bd                     int64
gender                 category
registered_via         category
expiration_date        int64
registration_year      int64
registration_month     int64
registration_date      int64
expiration_year        int64
expiration_month       int64
song_year              float64
is_featured            int8
artist_count           int8
count_song_played      int64
count_artist_played    int64
dtypes: category(11), float64(2), int64(10), int8(2)
memory usage: 470.6 MB


# RF Feature Importance and OOB Score

In [12]:
model_df = df.sample(math.floor(len(df)/2))

In [13]:
model_df['RANDOM'] = np.random.rand(len(model_df))*100

# model
forest = RandomForestClassifier(oob_score = True, n_estimators = 500, n_jobs = 2)

# train
forest.fit(model_df.drop('target', 1), model_df['target'])

# feature importances
print(pd.DataFrame({'Importance': forest.feature_importances_}, 
                   index = model_df.drop('target', 1).columns).sort_values('Importance', 
                                                                             ascending = False))

# Random column is no longer necessary
model_df.drop('RANDOM', 1, inplace = True)

# Out of bag score
forest.oob_score_

ValueError: could not convert string to float: 'female'

# Train/Test Sets

In [12]:
train, test = train_test_split(model_df, test_size = 0.1)

# train sets
train_x = train.drop('target', 1)
train_y = train['target'].values

# test sets
test_x = test.drop('target', 1)
test_y = test['target'].values

# Boosting AUC

In [None]:
# model
boost = GradientBoostingClassifier(n_estimators = 70)

# train
boost.fit(train_x, train_y)

# predict
boost_predictions = boost.predict(test_x)

# AUC score
print(roc_auc_score(test_y, boost_predictions), '\n')

# feature importances
print(pd.DataFrame({'Importance': boost.feature_importances_}, 
                   index = train_x.columns).sort_values('Importance', ascending = False))

# confusion matrix
pd.crosstab(test_y, boost_predictions, rownames=['Actual'], colnames = ['Predicted:'], margins = True)

# LightGBM

In [None]:
lgb_train = lgb.Dataset(train_x, label = train_y)

params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'dart',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_train,
                early_stopping_rounds=5)

lgbm_pred = gbm.predict(test_x, num_iteration=gbm.best_iteration)

roc_auc_score(test_y, lgbm_pred)

# Models based on gender/language

In [17]:
def lgbm_train(data, col_name, attribute):
    data = data.loc[data[col_name] == attribute]
    train_x_g = data.drop(['target', col_name], 1)
    train_y_g = data['target'].values
    
    lgb_train_g = lgb.Dataset(train_x_g, label = train_y_g)

    params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting': 'dart',
            'learning_rate': 0.3 ,
            'verbose': 0,
            'num_leaves': 108,
            'bagging_fraction': 0.95,
            'bagging_freq': 1,
            'bagging_seed': 1,
            'feature_fraction': 0.9,
            'feature_fraction_seed': 1,
            'max_bin': 256,
            'max_depth': 10,
            'num_rounds': 200,
            'metric' : 'auc'}

    gbm = lgb.train(params,
                    lgb_train_g,
                    num_boost_round=100,
                    valid_sets=lgb_train_g,
                    early_stopping_rounds=5)
    return gbm

### Putting models in hash table

### Gender…

In [None]:
gender_models = {}
for gen in model_df['gender'].unique():
    gender_models[gen] = lgbm_train(df, gender, gen)

### Language…

In [20]:
language_models = {}
for lang in model_df['language'].unique():
    language_models[lang] = lgbm_train(df, 'language', lang)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


[1]	training's auc: 0.738313
Training until validation scores don't improve for 5 rounds.
[2]	training's auc: 0.756061
[3]	training's auc: 0.765407
[4]	training's auc: 0.770733
[5]	training's auc: 0.77689
[6]	training's auc: 0.780774
[7]	training's auc: 0.786566
[8]	training's auc: 0.787278
[9]	training's auc: 0.79022
[10]	training's auc: 0.791927
[11]	training's auc: 0.793897
[12]	training's auc: 0.793838
[13]	training's auc: 0.79574
[14]	training's auc: 0.797616
[15]	training's auc: 0.799134
[16]	training's auc: 0.800643
[17]	training's auc: 0.802233
[18]	training's auc: 0.803548
[19]	training's auc: 0.804616
[20]	training's auc: 0.805808
[21]	training's auc: 0.806202
[22]	training's auc: 0.807209
[23]	training's auc: 0.807849
[24]	training's auc: 0.808868
[25]	training's auc: 0.810006
[26]	training's auc: 0.811087
[27]	training's auc: 0.812062
[28]	training's auc: 0.81177
[29]	training's auc: 0.812615
[30]	training's auc: 0.813381
[31]	training's auc: 0.81335
[32]	training's auc: 0.

### Tabulating final predictions and exporting to csv for submission
beware, very slow

### Gender…

In [None]:
results = []
males = gender_models['male'].predict(test_sub.drop('gender', 1))
females = gender_models['female'].predict(test_sub.drop('gender', 1))

print('looping…')
for i in range(len(test_sub)):

    if test_sub.iloc[i]['gender'] == 'male':
        results.append(males[i])
    else:
        results.append(females[i])

print('finished looping. Submitting…')
submission = pd.read_csv('sample_submission.csv')
submission['target'] = results
submission.to_csv('lgbm_submission.csv', index=False)

### Language…

In [21]:
lang_predictions = {}
for key in language_models.keys():
    lang_predictions[key] = language_models[key].predict(test_sub.drop('gender', 1))

In [24]:
lang_results = []
for i in range(len(test_sub)):    
    lang_results.append(lang_predictions[test_sub.iloc[i]['language']][i])

print('finished looping. Submitting…')
submission = pd.read_csv('sample_submission.csv')
submission['target'] = lang_results
submission.to_csv('lgbm_submission.csv', index=False)

finished looping. Submitting…


# Final training and exporting for LightGBM (standard models)

In [None]:
lgb_df = lgb.Dataset(df.drop('target', 1), label = df['target'].values)

params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'dart',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'}

print('training…')
gbm_full = lgb.train(params,
                lgb_df,
                num_boost_round=100,
                valid_sets = lgb_df,
                early_stopping_rounds=5)

print('predicting…')
predictions = gbm_full.predict(test_sub)

print('exporting…')
submission = pd.read_csv('sample_submission.csv')
submission['target'] = predictions
submission.to_csv('lgbm_submission.csv',index=False)

print('done.')

# Final training and exporting (for sklearn models)

In [None]:
def train_export(model):
    model.fit(df.drop('target', 1), df['target'])
    predictions = pd.Series(model.predict(test))
    submission = pd.DataFrame({'target':predictions}, index = ids)
    submission.to_csv('submission.csv')

# train_export(nn)

# Plot

In [None]:
data = [0.57, .62, .44, .48, .66, .67, .66, .66]
plt.plot(data)
plt.plot([0.5, .5, .5, .5, .5, .5, .5, .5], c = 'red')
plt.plot([0.74, .74, .74, .74, .74, .74, .74, .74], c = 'lime')
plt.title('Our Scores Over Time')
plt.xlabel('Submission Number')
plt.ylabel('AUC')
plt.show()
# plt.savefig('auc_graph.png')