In [30]:
# Import required libraries
import sklearn
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

In [31]:
# Read data
df = pd.read_csv('../kkbox/input/train.csv')

# Take 10% sample of items
df = df.sample(frac=0.1)

# Read songs data and join the data with df
songs = pd.read_csv('../kkbox/input/songs.csv')
df = pd.merge(df, songs, on='song_id', how='left')
del songs

# Read members data and join the data with df
members = pd.read_csv('../kkbox/input/members.csv')
df = pd.merge(df, members, on='msno', how='left')
del members

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 737742 entries, 0 to 737741
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   msno                    737742 non-null  object 
 1   song_id                 737742 non-null  object 
 2   source_system_tab       735317 non-null  object 
 3   source_screen_name      696755 non-null  object 
 4   source_type             735633 non-null  object 
 5   target                  737742 non-null  int64  
 6   song_length             737731 non-null  float64
 7   genre_ids               725878 non-null  object 
 8   artist_name             737731 non-null  object 
 9   composer                570581 non-null  object 
 10  lyricist                420107 non-null  object 
 11  language                737727 non-null  float64
 12  city                    737742 non-null  int64  
 13  bd                      737742 non-null  int64  
 14  gender              

In [32]:
# Count Na in %
df.isnull().sum()/df.isnull().count()*100

msno                       0.000000
song_id                    0.000000
source_system_tab          0.328706
source_screen_name         5.555736
source_type                0.285872
target                     0.000000
song_length                0.001491
genre_ids                  1.608150
artist_name                0.001491
composer                  22.658463
lyricist                  43.055025
language                   0.002033
city                       0.000000
bd                         0.000000
gender                    40.082170
registered_via             0.000000
registration_init_time     0.000000
expiration_date            0.000000
dtype: float64

In [33]:
# Replace NA
for i in df.select_dtypes(include=['object']).columns:
    df[i][df[i].isnull()] = 'unknown'
df = df.fillna(value=0)

In [34]:
# Create Dates

# registration_init_time
df.registration_init_time = pd.to_datetime(df.registration_init_time, format='%Y%m%d', errors='ignore')
df['registration_init_time_year'] = df['registration_init_time'].dt.year
df['registration_init_time_month'] = df['registration_init_time'].dt.month
df['registration_init_time_day'] = df['registration_init_time'].dt.day

# expiration_date
df.expiration_date = pd.to_datetime(df.expiration_date,  format='%Y%m%d', errors='ignore')
df['expiration_date_year'] = df['expiration_date'].dt.year
df['expiration_date_month'] = df['expiration_date'].dt.month
df['expiration_date_day'] = df['expiration_date'].dt.day

# Dates to categoty
df['registration_init_time'] = df['registration_init_time'].astype('category')
df['expiration_date'] = df['expiration_date'].astype('category')

# Object data to category
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype('category')
    
# Encoding categorical features
for col in df.select_dtypes(include=['category']).columns:
    df[col] = df[col].cat.codes

In [35]:
# Drop columns
df = df.drop(['expiration_date', 'lyricist', 'gender'], 1)

In [36]:
model = xgb.XGBClassifier(learning_rate=0.1, max_depth=10, min_child_weight=10, n_estimators=250)

model.fit(df[df.columns[df.columns != 'target']], df.target)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=10, missing=nan, monotone_constraints='()',
              n_estimators=250, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [37]:
df_plot = pd.DataFrame({'features': df.columns[df.columns != 'target'],
                        'importances': model.feature_importances_})
df_plot = df_plot.sort_values('importances', ascending=False)

In [38]:
df = df.drop(df_plot.features[df_plot.importances < 0.04].tolist(), 1)
df.columns

Index(['source_system_tab', 'source_type', 'target', 'expiration_date_year'], dtype='object')

In [39]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)

model = xgb.XGBClassifier(learning_rate=0.1, max_depth=10, min_child_weight=10, n_estimators=250)
for train_indices,val_indices in kf.split(df) :
    model.fit(df.drop(['target'],axis=1).loc[train_indices,:],df.loc[train_indices,'target'])

model.score(df[df.columns[df.columns != 'target']], df.target)#training accuracy

0.6249867839976577

In [40]:
# Split the data into training set and testing set
target = df.pop('target')
train_data, test_data, train_labels, test_labels = sklearn.model_selection.train_test_split(df, target, test_size = 0.3)


In [41]:
# Create model
model2 = xgb.XGBClassifier(learning_rate=0.1, max_depth=15, min_child_weight=5, n_estimators=250)
model2.fit(train_data, train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=250, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [42]:
# Predicting
predict_labels = model2.predict(test_data)

In [43]:
print(sklearn.metrics.classification_report(test_labels, predict_labels))

              precision    recall  f1-score   support

           0       0.61      0.68      0.64    110053
           1       0.64      0.57      0.60    111270

    accuracy                           0.62    221323
   macro avg       0.63      0.63      0.62    221323
weighted avg       0.63      0.62      0.62    221323

