In [127]:
# Import all import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from summary_fn import *

In [None]:
# Reading all the data files
train = pd.read_csv('train.csv')
member = pd.read_csv('members.csv',parse_dates=['registration_init_time','expiration_date'])
songs = pd.read_csv('songs.csv')
extra_song = pd.read_csv('song_extra_info.csv')
test = pd.read_csv('test.csv')

In [6]:
train.dtypes

msno                  object
song_id               object
source_system_tab     object
source_screen_name    object
source_type           object
target                 int64
dtype: object

In [12]:
member.dtypes

msno                              object
city                               int64
bd                                 int64
gender                            object
registered_via                     int64
registration_init_time    datetime64[ns]
expiration_date           datetime64[ns]
dtype: object

In [None]:
# Converting city and registered_via into category
member.city = member.city.astype('category')
member.registered_via = member.registered_via.astype('category')

In [None]:
member.dtypes

In [13]:
songs.dtypes

song_id         object
song_length      int64
genre_ids       object
artist_name     object
composer        object
lyricist        object
language       float64
dtype: object

In [None]:
# Converting language to category
songs.language = songs.language.astype('category')

In [129]:
# Function to convert all object columns to category type
def convert_obj_cat(df):
    obj_cols = df.select_dtypes(include='object').columns
    for col in obj_cols:
        df[col] = df[col].astype('category')

In [130]:
convert_obj_cat(train)
convert_obj_cat(test)
convert_obj_cat(songs)
convert_obj_cat(member)
convert_obj_cat(extra_song)

In [131]:
# Merging all required files into a single file
train = train.merge(songs, how='left', on='song_id')
test = test.merge(songs, how='left', on='song_id')


In [132]:
train = train.merge(member, how='left', on='msno')
test = test.merge(member, how='left', on='msno')

In [133]:
train = train.merge(extra_song, how='left', on='song_id')
test = test.merge(extra_song, how='left', on='song_id')

In [134]:
train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language,city,bd,gender,registered_via,registration_init_time,expiration_date,name,isrc
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,359,Bastille,Dan Smith| Mark Crew,,52.0,1,0,,7,2012-01-02,2017-10-05,Good Grief,GBUM71602854
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,1259,Various Artists,,,52.0,13,24,female,9,2011-05-25,2017-09-11,Lords of Cardboard,US3C69910183
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,225396.0,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,,52.0,13,24,female,9,2011-05-25,2017-09-11,Hip Hop Is Dead(Album Version (Edited)),USUM70618761
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,255512.0,1019,Soundway,Kwadwo Donkoh,,-1.0,13,24,female,9,2011-05-25,2017-09-11,Disco Africa,GBUQH1000063
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,187802.0,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,,52.0,1,0,,7,2012-01-02,2017-10-05,Sleep Without You,QM3E21606003


In [48]:
count_na(train)

Unnamed: 0,NA_count,Percentage
msno,0,0.0
song_id,0,0.0
source_system_tab,24849,0.336825
source_screen_name,414804,5.622618
source_type,21539,0.291959
target,0,0.0
song_length,114,0.001545
genre_ids,118455,1.605643
artist_name,114,0.001545
composer,1675706,22.71399


In [49]:
count_na(test)

Unnamed: 0,NA_count,Percentage
id,0,0.0
msno,0,0.0
song_id,0,0.0
source_system_tab,8442,0.33018
source_screen_name,162883,6.370605
source_type,7297,0.285397
song_length,25,0.000978
genre_ids,42110,1.646987
artist_name,25,0.000978
composer,619304,24.221935


In [135]:
# Feature extractions
train['days_left'] = (train.expiration_date - train.registration_init_time).dt.days.astype('int')

In [136]:
train['reg_year'] = train['registration_init_time'].dt.year
train['reg_month'] = train['registration_init_time'].dt.month
train['reg_date'] = train['registration_init_time'].dt.day

train['exp_year'] = train['expiration_date'].dt.year
train['exp_month'] = train['expiration_date'].dt.month
train['exp_date'] = train['expiration_date'].dt.day

In [137]:
# Dropping expiration date and registration date columns
train.drop(columns=['registration_init_time','expiration_date'],index=1,inplace=True)

In [138]:
# Using isrc to extract years
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan
        
train['song_year'] = train['isrc'].apply(isrc_to_year)

# Dropping isrc and name
train.drop(['isrc', 'name'], axis = 1, inplace = True)


In [139]:
# Counted features

def gener_id_count(x):
    if pd.isnull(x):
        return x
    else:
        return x.count('|')+1

def lyricist_count(x):
    if pd.isnull(x):
        return x
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

def composer_count(x):
    if pd.isnull(x):
        return x
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

def artist_count(x):
    if pd.isnull(x):
        return x
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

def is_featured(x):
    if 'feat' in str(x) :
        return 1
    return 0
    

In [140]:
train['gener_count'] = train['genre_ids'].apply(gener_id_count).astype(np.int8)
train['lyricist_count'] = train['lyricist'].apply(lyricist_count).astype(np.int8)
train['composer_count'] = train['composer'].apply(composer_count).astype(np.int8)
train['artist_count'] = train['artist_name'].apply(artist_count).astype(np.int8)
train['features'] = train['artist_name'].apply(is_featured).astype(np.int8)


In [106]:
# Number of times a song is played
song_count = song_play_times(train['song_id'])

In [111]:
song_count_df = pd.DataFrame.from_dict(song_count,orient='index',columns=['counts'])
song_count_df.head()

Unnamed: 0,counts
BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,215
JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,4
2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,1
3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,412
3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1108


In [112]:
song_count_df.reset_index(level=0, inplace=True)

In [117]:
song_count_df = song_count_df.rename(columns={'index':'song_id'})
song_count_df.head()

Unnamed: 0,song_id,counts
0,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,215
1,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,4
2,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,1
3,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,412
4,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,1108


In [141]:
train = train.merge(song_count_df,how='left',on='song_id')

In [145]:
check_point = train

In [148]:
#Counting artist counts similarly
artist_count = pd.DataFrame.from_dict(song_play_times(train['artist_name']),
                                      orient='index',columns=['artist_song_count']).reset_index()
artist_count = artist_count.rename(columns={'index':'artist_name'})

In [149]:
train = train.merge(artist_count,how='left',on='artist_name')

In [150]:
train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,...,exp_month,exp_date,song_year,gener_count,lyricist_count,composer_count,artist_count_x,features,counts,artist_count_y
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,359,Bastille,Dan Smith| Mark Crew,...,10,5,2016,1,1,2,1,0,215,1140
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,225396.0,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,...,9,11,2006,1,1,1,1,0,4,289
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,255512.0,1019,Soundway,Kwadwo Donkoh,...,9,11,2010,1,1,1,1,0,1,1
3,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,187802.0,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,...,10,5,2016,1,1,3,1,0,412,427
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=,explore,Explore,online-playlist,1,247803.0,1259,Desiigner,Sidney Selby| Adnan Khan,...,10,5,2016,1,1,2,1,0,1108,1692


In [151]:
count_na(train)

Unnamed: 0,NA_count,Percentage
msno,0,0.0
song_id,0,0.0
source_system_tab,24849,0.336825
source_screen_name,414804,5.622618
source_type,21539,0.291959
target,0,0.0
song_length,114,0.001545
genre_ids,118455,1.605643
artist_name,114,0.001545
composer,1675705,22.71398


In [179]:
# Building models
y_train = train['target']
x_train = train.drop(['target'], axis=1)
convert_obj_cat(x_train)

In [180]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column_name in x_train.columns:
        if x_train[column_name].dtype.name == 'category':
            x_train[column_name] = le.fit_transform(x_train[column_name].astype(str))
        else:
            pass

In [185]:
x_train = x_train.fillna(-1)

In [189]:
#x_train.dropna(inplace=True)
x_train.shape

(7377417, 30)

In [186]:
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size=0.8, random_state=1)


In [196]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, oob_score=False, random_state=1,n_jobs=-1)
rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [197]:
from sklearn.metrics import accuracy_score
predicted = rf.predict(X_test)
accuracy = accuracy_score(Y_test, predicted)
print(f'Mean accuracy score: {accuracy:.3}')

Mean accuracy score: 0.691
