In [3]:
from summary_fn import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from IPython.display import display
from sklearn import metrics
from sklearn.metrics import accuracy_score
import re
import random
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Imputer

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Import data
data = pd.read_csv('features_train_data.csv')

### Feature engineering for random forest


In [5]:
# Normalize data
hnorm_cols = ['msno_genre_count', 'source_system_tab_user_lev_c', 'source_screen_name_user_lev_c',
              'source_type_user_lev_c', 'composer_user_lev_c', 'lyricist_user_lev_c', 'artist_name_user_lev_c']

for i in hnorm_cols:
    data[i] = data[i]/data['msno_appear_count']

In [6]:
# Fill NA's before Normalization
data.gener_count = data.gener_count.fillna(-1)

In [7]:
# Drop less relevant columns
data.drop(columns=['lyricist_count_y', 'composer_count_x',
                   'gener_count'], inplace=True)

### Imputation

In [8]:
# Impute continuous columns
na_continuous_col = ['msno_genre_count', 'source_system_tab_user_lev_c', 'source_screen_name_user_lev_c',
                     'source_type_user_lev_c', 'artist_name_user_lev_c', 'composer_user_lev_c',
                     'lyricist_user_lev_c']
for col in na_continuous_col:
    data[col] = data[col].fillna(-1)

In [9]:
# Impute categorical columns
data = data.fillna('NaN')

In [10]:
# Label encoding for the categorical varaibles
from sklearn import preprocessing


def encoder(x_train):
    le = preprocessing.LabelEncoder()
    for column_name in x_train.columns:
        if x_train[column_name].dtype.name in ['category', 'object']:
            x_train[column_name] = le.fit_transform(
                x_train[column_name].astype(str))


# Encode the data
encoder(data)

## Model fitting with Feature Set 1

In [11]:
# Features and target varaibles
X = data.drop(columns='target', axis=1)
y = data['target']

# Train and test split
X_train_all, X_test, y_train_all, y_test = train_test_split(
    X, y, test_size=0.2)

# Train and validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all, y_train_all, test_size=0.2)

In [12]:
import gc
del data
del X_train_all, y_train_all
gc.collect()

14

In [13]:
# Sampled data
idx = random.sample(range(0, X_train.shape[0]), 100000)
X_train_sampled = X_train.iloc[idx]
y_train_sampled = y_train.iloc[idx]

In [14]:
del X_train
del y_train
gc.collect()

0

In [16]:
# Base line model for Logistic Regression
from sklearn.metrics import accuracy_score
m = LogisticRegression()
m.fit(X_train_sampled, y_train_sampled)
predicted = m.predict(X_val)
accuracy = accuracy_score(y_val, predicted)
print(f'Mean accuracy score validation: {accuracy:.3}')

Mean accuracy score validation: 0.63


In [15]:
# Base line model Random Forest
from sklearn.metrics import accuracy_score
m = RandomForestClassifier(n_jobs=-1)
m.fit(X_train_sampled, y_train_sampled)
predicted = m.predict(X_val)
accuracy = accuracy_score(y_val, predicted)
print(f'Mean accuracy score validation: {accuracy:.3}')

Mean accuracy score validation: 0.626


In [16]:
# Base line model for KNN
from sklearn.metrics import accuracy_score
m = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
m.fit(X_train_sampled, y_train_sampled)
predicted = m.predict(X_val)
accuracy = accuracy_score(y_val, predicted)
print(f'Mean accuracy score validation: {accuracy:.3}')

Mean accuracy score validation: 0.602


In [18]:
# Base Line model for LGBM
sk_reg = lgb.sklearn.LGBMClassifier(
    objective='binary',
    eval_metric='binary_logloss',
    boosting='gbdt',
    learning_rate=0.3,
    verbose=0,
    num_leaves=600,
    bagging_freq=1,
    feature_fraction=0.9,
    max_bin=256,
    max_depth=300,
    num_rounds=200,
)


sk_reg.fit(X_train_sampled, y_train_sampled)
predicted = sk_reg.predict(X_val)
accuracy = accuracy_score(y_val, predicted)
print(f'Mean accuracy score: {accuracy:.3}')

Mean accuracy score: 0.657


In [23]:
del X_train_sampled
del y_train_sampled
del X_val
del y_val
del X_test
del y_test
gc.collect()

427

# Model fitting fit features set 2

In [24]:
# Import data with feature set2
X_train = pd.read_csv('Helper_functions/X_train.csv')
X_val = pd.read_csv('Helper_functions/X_val')

In [25]:
y_train = X_train['y_train']
X_train = X_train.drop(columns='y_train', axis=1)
y_val = X_val['y_val']
X_val = X_val.drop(columns='y_val', axis=1)

In [26]:
# Sampled train data
idx = random.sample(range(0, X_train.shape[0]), 100000)
y_train = pd.DataFrame(y_train)
X_train_sampled = X_train.iloc[idx]
y_train_sampled = y_train.iloc[idx]

In [27]:
del X_train
del y_train
gc.collect()

14

In [30]:
for col in X_train_sampled.columns:
    if X_train_sampled[col].dtype == object:
        X_train_sampled[col] = X_train_sampled[col].astype('category')

for col in X_val.columns:
    if X_val[col].dtype == object:
        X_val[col] = X_val[col].astype('category')

In [31]:
# Base Line model for LGBM
sk_reg = lgb.sklearn.LGBMClassifier(
    objective='binary',
    eval_metric='binary_logloss',
    boosting='gbdt',
    learning_rate=0.3,
    verbose=0,
    num_leaves=600,
    bagging_freq=1,
    feature_fraction=0.9,
    max_bin=256,
    max_depth=300,
    num_rounds=200,
)


sk_reg.fit(X_train_sampled, y_train_sampled)
predicted = sk_reg.predict(X_val)
accuracy = accuracy_score(y_val, predicted)
print(f'Mean accuracy score: {accuracy:.3}')

Mean accuracy score: 0.641


In [33]:
# Encoding categorical variables
encoder(X_train_sampled)
encoder(X_val)

In [36]:
# Impute NA values with -1
X_train_sampled = X_train_sampled.fillna(-1)
X_val = X_val.fillna(-1)

In [37]:
# Base line model fro Linear Regression
from sklearn.metrics import accuracy_score
m = LogisticRegression()
m.fit(X_train_sampled, y_train_sampled)
predicted = m.predict(X_val)
accuracy = accuracy_score(y_val, predicted)
print(f'Mean accuracy score validation: {accuracy:.3}')

Mean accuracy score validation: 0.583


In [38]:
# Base line model Random Forest
from sklearn.metrics import accuracy_score
m = RandomForestClassifier(n_jobs=-1)
m.fit(X_train_sampled, y_train_sampled)
predicted = m.predict(X_val)
accuracy = accuracy_score(y_val, predicted)
print(f'Mean accuracy score validation: {accuracy:.3}')

Mean accuracy score validation: 0.567


In [39]:
# Base line model for KNN
from sklearn.metrics import accuracy_score
m = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
m.fit(X_train_sampled, y_train_sampled)
predicted = m.predict(X_val)
accuracy = accuracy_score(y_val, predicted)
print(f'Mean accuracy score validation: {accuracy:.3}')

Mean accuracy score validation: 0.538
