In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
import re
import random
from sklearn import preprocessing

from summary_fn import *

In [2]:
# Load the raw CSV inputs; the two member date columns are parsed as datetimes on read.
train = pd.read_csv('train.csv')
member = pd.read_csv(
    'members.csv',
    parse_dates=['registration_init_time', 'expiration_date'],
)
songs = pd.read_csv('songs.csv')
extra_song = pd.read_csv('song_extra_info.csv')

In [3]:
# Re-type the coded columns as pandas categoricals.
member['city'] = member['city'].astype('category')
member['registered_via'] = member['registered_via'].astype('category')
songs['language'] = songs['language'].astype('category')

### Note: we will split our data set into train/validation and test sets. We will not use the test set provided by Kaggle.

In [4]:
# Left-join song metadata, member info and extra song info onto the training rows.
train = (
    train
    .merge(songs, how='left', on='song_id')
    .merge(member, how='left', on='msno')
    .merge(extra_song, how='left', on='song_id')
)

In [5]:
# Release the source frames now that their columns have been merged into `train`.
del songs
del member
del extra_song
# `genre_count` is not assigned by any cell above (it could only arrive through the
# star import), so a bare `del` raises NameError on a fresh kernel. Remove it only
# if it actually exists.
globals().pop('genre_count', None)

In [6]:
# Feature engineering functions
def add_datepart(x, fldname, drop=True, time=False):
    """Return a copy of ``x`` with calendar features derived from a date column.

    Parameters
    ----------
    x : pandas.DataFrame
        Input frame; it is copied, never modified.
    fldname : str
        Name of the date column. Non-datetime columns are parsed with
        ``pd.to_datetime`` first.
    drop : bool, default True
        Remove ``fldname`` from the returned frame once the features are added.
    time : bool, default False
        Also add ``Hour``, ``Minute`` and ``Second`` columns.

    Returns
    -------
    pandas.DataFrame
        The copy with one new column per date attribute plus ``Elapsed``
        (seconds since the Unix epoch). New columns are named ``<prefix><Attr>``
        where ``<prefix>`` is ``fldname`` with a trailing ``date``/``Date`` removed.
    """
    df = x.copy()
    fld = df[fldname]
    # is_datetime64_any_dtype covers both naive and tz-aware datetime columns and,
    # unlike np.issubdtype, does not choke on pandas extension dtypes.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
            'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is the
            # replacement. Cast to the dtype of the other date parts so the column
            # type matches what dt.week used to produce (float when NaT is present).
            values = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            values = getattr(fld.dt, n.lower())
        df[targ_pre + n] = values
    # The integer view is only nanoseconds when the column has ns resolution, so
    # normalise the unit first where pandas supports non-ns datetimes (>= 2.0).
    as_ns = fld.dt.as_unit('ns') if hasattr(fld.dt, 'as_unit') else fld
    df[targ_pre + 'Elapsed'] = as_ns.astype(np.int64) // 10 ** 9
    if drop:
        df.drop(fldname, axis=1, inplace=True)
    return df

# Using isrc to extract years
def isrc_to_year(isrc, cutoff=17):
    """Derive a four-digit year from the two-digit year field of an ISRC code.

    Characters 5-6 (zero-based slice ``[5:7]``) are read as a two-digit year.
    Values greater than ``cutoff`` are placed in the 1900s, the rest in the 2000s.

    Parameters
    ----------
    isrc : str or any
        ISRC string; any non-string value (e.g. NaN) is treated as missing.
    cutoff : int, default 17
        Largest two-digit year still mapped to the 2000s.

    Returns
    -------
    int or float
        The year, or ``np.nan`` when ``isrc`` is not a string or its year field
        cannot be parsed as an integer.
    """
    if not isinstance(isrc, str):
        return np.nan
    try:
        two_digit = int(isrc[5:7])
    except ValueError:
        # Too short or non-numeric year field (e.g. '' or 'nan'): treat as missing
        # instead of aborting a whole ``Series.apply``.
        return np.nan
    century = 1900 if two_digit > cutoff else 2000
    return century + two_digit

# Counted features

def gener_id_count(x):
    """Count the '|'-separated entries in ``x``; return -1 when ``x`` is null."""
    return -1 if pd.isnull(x) else x.count('|') + 1

# Characters treated as separators between names in a credit string. The
# ideographic comma is included because the composer data uses it
# (e.g. 'N. Jones、W. Adams、J. Lordan、D. Ingle').
_NAME_SEPARATORS = ('|', '/', '\\', ';', '、')


def _count_names(x):
    """Return the number of separator-delimited names in ``x``, or -1 if ``x`` is null."""
    if pd.isnull(x):
        return -1
    return sum(x.count(sep) for sep in _NAME_SEPARATORS) + 1


def lyricist_count(x):
    """Number of lyricists listed in ``x`` (-1 when missing)."""
    return _count_names(x)


def composer_count(x):
    """Number of composers listed in ``x`` (-1 when missing)."""
    return _count_names(x)


def artist_count(x):
    """Number of artists listed in ``x`` (-1 when missing)."""
    return _count_names(x)

def is_featured(x):
    """Return 1 if the text form of ``x`` contains the substring 'feat', else 0."""
    return 1 if 'feat' in str(x) else 0



In [7]:
# Feature addition
def add_days_left(train):
    """Add ``days_left``: whole days from ``registration_init_time`` to ``expiration_date``.

    The column is written onto ``train`` itself, which is also returned.
    """
    membership_span = train['expiration_date'] - train['registration_init_time']
    train['days_left'] = membership_span.dt.days.astype('int')
    return train

def add_gener_count(train):
    """Add an int8 ``gener_count`` column by applying ``gener_id_count`` to ``genre_ids``.

    Writes onto ``train`` in place and returns it.
    """
    per_row = train['genre_ids'].apply(gener_id_count)
    train['gener_count'] = per_row.astype(np.int8)
    return train

def add_lyricist_count(train):
    """Add an int8 ``lyricist_count`` column by applying ``lyricist_count`` to ``lyricist``.

    Writes onto ``train`` in place and returns it.
    """
    per_row = train['lyricist'].apply(lyricist_count)
    train['lyricist_count'] = per_row.astype(np.int8)
    return train


def add_composer_count(train):
    """Add an int8 ``composer_count`` column by applying ``composer_count`` to ``composer``.

    Writes onto ``train`` in place and returns it.
    """
    per_row = train['composer'].apply(composer_count)
    train['composer_count'] = per_row.astype(np.int8)
    return train


def add_artist_count(train):
    """Add an int8 ``artist_count`` column by applying ``artist_count`` to ``artist_name``.

    Writes onto ``train`` in place and returns it.
    """
    per_row = train['artist_name'].apply(artist_count)
    train['artist_count'] = per_row.astype(np.int8)
    return train

def add_featured_song(train):
    """Add an int8 ``features`` flag by applying ``is_featured`` to ``artist_name``.

    Writes onto ``train`` in place and returns it.
    """
    flags = train['artist_name'].apply(is_featured)
    train['features'] = flags.astype(np.int8)
    return train


def add_song_year(train):
    """Add ``song_year`` (from ``isrc`` via ``isrc_to_year``) and drop ``isrc`` and ``name``.

    Both the new column and the drop are applied to ``train`` in place; the same
    frame is returned.
    """
    years = train['isrc'].apply(isrc_to_year)
    train['song_year'] = years
    train.drop(columns=['isrc', 'name'], inplace=True)
    return train


def add_song_play_count(train):
    """Left-merge a ``song_play_counts`` column onto ``train``, keyed by ``song_id``.

    The counts come from ``song_play_times``, which is not defined in this
    notebook (presumably provided by the ``summary_fn`` star import). It is
    assumed to return a mapping from value to count -- TODO confirm.
    Returns a new merged frame.
    """
    plays_by_song = song_play_times(train['song_id'])
    lookup = (
        pd.DataFrame.from_dict(plays_by_song, orient='index', columns=['song_play_counts'])
        .reset_index(level=0)
        .rename(columns={'index': 'song_id'})
    )
    return train.merge(lookup, how='left', on='song_id')

def add_artist_played_count(train):
    """Left-merge an ``artist_song_count`` column onto ``train``, keyed by ``artist_name``.

    The counts come from ``song_play_times`` (defined outside this notebook;
    assumed to return a value -> count mapping -- TODO confirm).
    Returns a new merged frame.
    """
    rows_by_artist = song_play_times(train['artist_name'])
    lookup = pd.DataFrame.from_dict(rows_by_artist, orient='index',
                                    columns=['artist_song_count'])
    lookup = lookup.reset_index().rename(columns={'index': 'artist_name'})
    return train.merge(lookup, how='left', on='artist_name')

def add_msno_appear_count(train):
    """Left-merge an ``msno_appear_count`` column onto ``train``, keyed by ``msno``.

    The counts come from ``song_play_times`` (defined outside this notebook;
    assumed to return a value -> count mapping -- TODO confirm).
    Returns a new merged frame.
    """
    rows_by_member = song_play_times(train['msno'])
    lookup = pd.DataFrame.from_dict(rows_by_member, orient='index',
                                    columns=['msno_appear_count'])
    lookup = lookup.reset_index().rename(columns={'index': 'msno'})
    return train.merge(lookup, how='left', on='msno')
    
    
def add_datepart_reg(train):
    """Pipeline step: expand ``registration_init_time`` with ``add_datepart``."""
    return add_datepart(train, 'registration_init_time')

def add_datepart_exp(train):
    """Pipeline step: expand ``expiration_date`` with ``add_datepart``."""
    return add_datepart(train, 'expiration_date')

def count_and_percent(df, colnames: list):
    """Add overall and per-user occurrence counts for each column in ``colnames``.

    For every column ``c`` two columns are left-merged onto the frame:

    * ``c_count``      -- number of rows in ``df`` that share the row's value of ``c``;
    * ``c_user_lev_c`` -- number of ``song_id`` entries for the row's ``(msno, c)`` pair.

    Despite the name, no percentage is computed.

    If ``df`` already has a column called ``c_count``, pandas' merge suffixes
    apply: the existing column becomes ``c_count_x`` and the new one
    ``c_count_y``. The notebook's later assertions on ``composer_count_x`` and
    ``lyricist_count_x`` rely on this.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``msno``, ``song_id`` and every column in ``colnames``.
    colnames : list
        Column names to count.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # Imported here because the notebook's import cell does not bring Counter into scope.
    from collections import Counter

    for col in colnames:
        # Overall frequency of each value of `col`.
        overall = pd.DataFrame.from_dict(Counter(df[col]), orient='index',
                                         columns=[col + '_count']).reset_index()
        overall.columns = [col, col + '_count']
        df = df.merge(overall, how='left', on=col)

        # Frequency of each value of `col` within a single user (msno).
        per_user = df.groupby(by=['msno', col]).agg({'song_id': ['count']})
        per_user.columns = [col + '_user_lev_c']
        df = df.merge(per_user.reset_index(), how='left', on=['msno', col])
    return df

def split_gener_columns(train):
    """Replace ``genre_ids`` with one column per '|'-separated piece.

    ``genre_ids`` is cast to ``str`` and split on '|'. The pieces are appended as
    columns labelled 0, 1, 2, ... and the original ``genre_ids`` column is
    removed. ``train`` is left untouched; a new frame is returned.
    """
    pieces = train['genre_ids'].astype(str).str.split('|', expand=True)
    remaining = train.drop(columns='genre_ids')
    return pd.concat([remaining, pieces], axis=1)

In [8]:
# Apply features from pipeline
def apply_features(train, feature_list):
    """Pass ``train`` through each callable in ``feature_list`` in order.

    Each step receives the previous step's return value; the final value is returned.
    """
    result = train
    for step in feature_list:
        result = step(result)
    return result



In [9]:
# Fill NA values
def fillna_nan(df, cat_list, contlist):
    """Return a copy of ``df`` with missing values replaced by sentinels.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    cat_list : list of str
        Columns whose missing values become the string ``'nan'``.
    contlist : list of str
        Columns whose missing values become ``-1``.

    Returns
    -------
    pandas.DataFrame
        The filled copy.
    """
    def _fill(col, value):
        # fillna on a categorical raises unless the fill value is already one
        # of its categories, so register the sentinel first when it is needed.
        if isinstance(col.dtype, pd.CategoricalDtype):
            if not col.isna().any():
                return col
            if value not in col.cat.categories:
                col = col.cat.add_categories([value])
        return col.fillna(value)

    train = df.copy()
    for col in cat_list:
        train[col] = _fill(train[col], 'nan')
    for col in contlist:
        train[col] = _fill(train[col], -1)
    return train



In [10]:
# Encoding the data 
def encoder(train):
    """Label-encode every categorical, object or string column of ``train``.

    Each such column is cast to ``str`` and replaced by the integer codes from
    ``sklearn.preprocessing.LabelEncoder.fit_transform``. The encoder is refit
    per column, so codes are not comparable across columns.

    The frame is modified in place and also returned, so both ``encoder(df)``
    and ``df = encoder(df)`` work.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object, with the text-like columns encoded.
    """
    le = preprocessing.LabelEncoder()
    for column_name in train.columns:
        dtype = train[column_name].dtype
        # Pandas string dtypes report a name other than 'object', so test for
        # them explicitly in addition to the original name check.
        if dtype.name in ('category', 'object') or isinstance(dtype, pd.StringDtype):
            train[column_name] = le.fit_transform(train[column_name].astype(str))
    return train
    

In [11]:
# Applying pipeline

features_pipeline = [
    add_days_left,
    add_datepart_reg,
    add_datepart_exp,
    add_gener_count,
    add_lyricist_count,
    add_composer_count,
    add_artist_count,
    add_featured_song,
    add_song_year,
    add_song_play_count,
    add_artist_played_count,
    add_msno_appear_count,
]

x = apply_features(train, features_pipeline)
del train

# Few more features: overall and per-user counts for these columns, then genre split
collist = [
    'source_system_tab',
    'source_screen_name',
    'source_type',
    'artist_name',
    'composer',
    'lyricist',
]
x = count_and_percent(x, collist)
x = split_gener_columns(x)


In [17]:
# testing add_datepart
assert 'expiration_Is_month_end' in x
assert 'expiration_Year' in x
assert len(x['expiration_Year']) != 0

# testing isrc_to_year (add_song_year stores its result in 'song_year')
assert 'song_year' in x
assert len(x['song_year']) != 0

# testing gener_id_count
assert 'gener_count' in x
assert len(x['gener_count']) != 0

# testing composer_count
# The '_x' suffix appears because count_and_percent merges in a second
# 'composer_count' column; the one from add_composer_count is renamed.
assert 'composer_count_x' in x
assert len(x['composer_count_x']) != 0

# testing lyricist_count (same '_x' suffix reason as composer_count)
assert 'lyricist_count_x' in x
assert len(x['lyricist_count_x']) != 0

# testing is_featured
assert 'features' in x
assert len(x['features']) != 0

# testing artist_count
assert 'artist_count' in x
assert len(x['artist_count']) != 0

In [21]:
# Each entry is the output column of one pipeline step, in this order:
# add_days_left, add_song_play_count, add_artist_played_count, add_msno_appear_count
for expected_column in ('days_left', 'song_play_counts',
                        'artist_song_count', 'msno_appear_count'):
    assert expected_column in x
    assert len(x[expected_column]) != 0

In [22]:
# Data imputation block: 'nan' for the text-like columns, -1 for the numeric ones

cat_nan_list = [
    'msno',
    'song_id',
    'source_screen_name',
    'source_system_tab',
    'source_type',
    'artist_name',
    'composer',
    'lyricist',
    'gender',
]
cont_nan_list = ['song_length', 'language', 'song_year']

x = fillna_nan(x, cat_nan_list, cont_nan_list)

In [23]:
# Persist the engineered features (row index not written)
x.to_csv('features_train_data.csv', index=False)

In [24]:
# Reload the saved features from disk and preview the first rows
x = pd.read_csv('features_train_data.csv')
x.head()

In [25]:
def display_all(df):
    """Display ``df`` with pandas' row and column display limits temporarily set to 1000."""
    wide_view = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_view):
        display(df)

In [26]:
# Transposed preview: one line per column, one column per sample row
display_all(x.head().transpose())

Unnamed: 0,0,1,2,3,4
msno,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=
song_id,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=
source_system_tab,explore,my library,my library,my library,explore
source_screen_name,Explore,Local playlist more,Local playlist more,Local playlist more,Explore
source_type,online-playlist,local-playlist,local-playlist,local-playlist,online-playlist
target,1,1,1,1,1
song_length,206471,284584,225396,255512,187802
artist_name,Bastille,Various Artists,Nas,Soundway,Brett Young
composer,Dan Smith| Mark Crew,,N. Jones、W. Adams、J. Lordan、D. Ingle,Kwadwo Donkoh,Brett Young| Kelly Archer| Justin Ebach
lyricist,,,,,


In [27]:
# Train test validation split
y = x['target']
# Everything except the label is a feature; remaining NaNs become -1
X = x.drop(columns='target').fillna(-1)

# First hold out 20% as the test set ...
X_train_all, X_test, y_train_all, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# ... then hold out 20% of the remainder as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all, y_train_all, test_size=0.2, random_state=1
)

In [28]:
# Sampled train data
SAMPLE_SIZE = 500000  # maximum number of training rows kept in the sample
random.seed(130)
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the request at the number of available rows. With enough rows
# the drawn indices are the same as for a fixed request of SAMPLE_SIZE.
idx = random.sample(range(0, X_train.shape[0]), min(SAMPLE_SIZE, X_train.shape[0]))
X_train_sampled = X_train.iloc[idx]
y_train_sampled = y_train.iloc[idx]