In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [2]:
#User-Defined Functions
#Finding NAN values in a dataframe
def find_nan(df):
    """Return the labels of the columns of ``df`` that contain at least one NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    list
        Column labels, in the frame's column order, whose column has at
        least one missing value. Empty when nothing is missing.
    """
    has_nan = df.isna().any()
    # Test truthiness rather than ``flag is True``: the identity test silently
    # drops every column when the flags are numpy.bool_ objects instead of the
    # Python ``True`` singleton.
    return [column for column, flag in has_nan.items() if flag]
#Filling NaN values in a dataframe with each column's most frequent value (the mode, despite the function name)
def fill_mean(df,nanval):
    """Fill the NaNs of the given columns with each column's most frequent value.

    Despite its name, this imputes the *mode* of every column, not the
    arithmetic mean. When several values share the highest count, the one
    that appears first in the column wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. The listed columns are replaced on this object.
    nanval : iterable
        Labels of the columns to fill.

    Returns
    -------
    tuple
        ``(df, means)`` where ``df`` is the same object that was passed in and
        ``means[i]`` is the value used to fill column ``nanval[i]``.

    Raises
    ------
    ValueError
        If a listed column has no non-missing value to take a mode from.
    """
    means = []
    for column in nanval:
        # Count once per column (value_counts ignores NaN by default) instead
        # of recomputing the counts for every distinct value.
        counts = df[column].value_counts()
        candidates = df[column].dropna().unique().tolist()
        if not candidates:
            raise ValueError(
                "column %r has no non-missing value to fill with" % (column,)
            )
        # max() keeps the first maximal element, so ties go to the value that
        # appears first in the column.
        maxval = max(candidates, key=lambda value: counts.loc[value])
        # Assign the result back: a chained ``df[column].fillna(inplace=True)``
        # may act on a temporary copy and leave ``df`` unchanged.
        df[column] = df[column].fillna(maxval)
        means.append(maxval)

    return df , means

In [None]:
#Load the training interactions (train.csv); index_col=False keeps the first column as data
df_train = pd.read_csv(
    'kkbox-music-recommendation-challenge/train.csv',
    index_col=False,
)

In [None]:
#Load the per-user information (members.csv); index_col=False keeps the first column as data
df_members = pd.read_csv(
    'kkbox-music-recommendation-challenge/members.csv',
    index_col=False,
)

In [None]:
#Load the song details (songs.csv); index_col=False keeps the first column as data
df_songs = pd.read_csv(
    'kkbox-music-recommendation-challenge/songs.csv',
    index_col=False,
)

In [None]:
#Load the extra song details (song_extra_info.csv); index_col=False keeps the first column as data
df_extra = pd.read_csv(
    'kkbox-music-recommendation-challenge/song_extra_info.csv',
    index_col=False,
)

In [None]:
#Load the rows to predict (test.csv); index_col=False keeps the first column as data
df_test = pd.read_csv(
    'kkbox-music-recommendation-challenge/test.csv',
    index_col=False,
)

In [None]:
#Finding the NaN columns in train and filling them with the value fill_mean
#returns for each column
nan_train = find_nan(df_train)
df_train , train_na = fill_mean(df_train,nan_train)
#The same train-derived fill values are applied to test.csv. They are looked up
#by column name: indexing train_na by position (0, 1, 2) only worked while the
#NaN columns of train were exactly these three, in exactly this order.
train_fill = dict(zip(nan_train, train_na))
for col in ('source_system_tab', 'source_screen_name', 'source_type'):
    if col in train_fill:
        #Assign back instead of a chained fillna(inplace=True), which may act on
        #a temporary copy and leave df_test unchanged.
        df_test[col] = df_test[col].fillna(train_fill[col])
    else:
        #Train had no NaN in this column, so no fill value was computed for it.
        print('no train fill value for ' + col + '; test NaNs left as-is')

In [None]:
#Attach the song details to every training row (left join on song_id)
train_song_mer = pd.merge(df_train, df_songs, how='left', on='song_id')
#Attach the song details to every test row (left join on song_id)
test_song_mer = pd.merge(df_test, df_songs, how='left', on='song_id')

In [None]:
#Attach the member information to every training row (left join on msno)
tr_so_mm_mer = pd.merge(train_song_mer, df_members, how='left', on='msno')
#Attach the member information to every test row (left join on msno)
te_so_mm_mer = pd.merge(test_song_mer, df_members, how='left', on='msno')

In [None]:
#Attach the extra song info to every training row (left join on song_id)
train_final = pd.merge(tr_so_mm_mer, df_extra, how='left', on='song_id')
#Attach the extra song info to every test row (left join on song_id)
test_final = pd.merge(te_so_mm_mer, df_extra, how='left', on='song_id')

In [None]:
#Filling NaN values in the song metadata columns with fixed placeholders.
#One mapping drives both frames so train and test cannot drift apart.
unknown_fill = {
    'composer': 'unknown_comp',
    'lyricist': 'unknown_lyrc',
    'artist_name': 'unknown_art',
    'name': 'unknown_name',
    'isrc': 'unknown_isrc',
    'genre_ids': '465',  #genre id used as the default for songs without a genre
}
for frame in (train_final, test_final):
    for col, placeholder in unknown_fill.items():
        #Assign back instead of a chained fillna(inplace=True), which may act
        #on a temporary copy and leave the frame unchanged.
        frame[col] = frame[col].fillna(placeholder)

In [None]:
#Filling NaN in song_length with the mean training song length, truncated to int.
#Series.mean() skips NaN, so no intermediate Python list or string-based NaN
#filtering is needed.
avg_son = int(train_final['song_length'].mean())
#Assign back instead of a chained fillna(inplace=True), which may act on a
#temporary copy and leave the frame unchanged. The train mean is used for test too.
train_final['song_length'] = train_final['song_length'].fillna(avg_son)
test_final['song_length'] = test_final['song_length'].fillna(avg_son)

In [None]:
#Filling the remaining NaN columns of train with the value fill_mean returns
#for each column
final_nan = find_nan(train_final)
#Columns of test that still contain NaN (kept for inspection; not used below)
test_nan = find_nan(test_final)
train_final, train_na = fill_mean(train_final,final_nan)
#Apply the train-derived fill values to test by column name: indexing train_na
#by position (0, 1) only worked while the remaining NaN columns of train were
#exactly language and gender, in that order.
train_fill = dict(zip(final_nan, train_na))
for col in ('language', 'gender'):
    if col in train_fill:
        #Assign back instead of a chained fillna(inplace=True), which may act on
        #a temporary copy and leave test_final unchanged.
        test_final[col] = test_final[col].fillna(train_fill[col])
    else:
        #Train had no NaN in this column, so no fill value was computed for it.
        print('no train fill value for ' + col + '; test NaNs left as-is')

In [None]:
#Cast every object-dtype column of train and test to the pandas category dtype
for frame in (train_final, test_final):
    obj_col = frame.columns.tolist()
    object_names = [name for name in obj_col if frame[name].dtype == 'object']
    for name in object_names:
        frame[name] = frame[name].astype('category')
        

In [None]:
#One hot encoding of genre_ids
#dtype='uint8' is requested explicitly: the default dummy dtype is bool on
#pandas >= 2.0, while the following cells recognise the genre columns by
#`dtype == 'uint8'` and would otherwise find none.
#NOTE(review): drop_first=True discards the dummy column of the first
#genre_ids category - confirm that losing that genre value is intended.
genre_dummies = pd.get_dummies(train_final['genre_ids'], drop_first = True, dtype = 'uint8')
train_final = pd.concat([train_final, genre_dummies], axis = 1)
train_final = train_final.drop('genre_ids', axis = 1)

In [None]:
#Collapse the multi-genre dummy columns ("a|b|c") of train into one column per
#individual genre id.
end_columns = train_final.columns.tolist()
#The one-hot genre columns are recognised by their uint8 dtype
gen_end = [col for col in end_columns if train_final[col].dtype == 'uint8']
#Every individual genre id that occurs in any dummy column name; built in one
#pass instead of rebuilding list(set(...)) after every appended token.
single_genres = {part.strip() for col in gen_end for part in col.split("|")}
diff_gens_train = list(single_genres)
#train_end is another name for the same frame, so the edits below also change
#train_final.
train_end = train_final
for col in gen_end:
    if col in single_genres:
        #Already a single-genre column; nothing to split.
        continue
    for gen in [x.strip() for x in col.split("|")]:
        #Check the frame's current columns, not the initial gen_end snapshot:
        #a genre column created by an earlier combined column must be added
        #to, not overwritten.
        if gen in train_end.columns:
            train_end[gen] = train_end[col]+train_end[gen]
        else:
            train_end[gen] = train_end[col]
    del train_end[col]

In [None]:
#One hot encoding of genre_ids for the test frame
#test_back keeps a reference to the frame as it was before the dummy columns
#were added (test_final is rebound to a new frame on the next line).
test_back = test_final
#dtype='uint8' is requested explicitly: the default dummy dtype is bool on
#pandas >= 2.0, while the genre columns are recognised below and in later
#cells by `dtype == 'uint8'`.
#NOTE(review): drop_first=True discards the dummy column of the first
#genre_ids category - confirm that losing that genre value is intended.
genre_dummies = pd.get_dummies(test_final['genre_ids'], drop_first = True, dtype = 'uint8')
test_final = pd.concat([test_final, genre_dummies], axis = 1)
test_final = test_final.drop('genre_ids', axis = 1)
end_columns = test_final.columns.tolist()
print(len(end_columns))
#The one-hot genre columns of test, used by the following cells
gen_test = [col for col in end_columns if test_final[col].dtype == 'uint8']
print(len(gen_test))

In [None]:
#Every individual genre id that occurs in any test dummy column name
#("a|b|c" contributes a, b and c). Built in one pass instead of rebuilding
#list(set(...)) after every appended token.
diff_gens_test = list({part.strip() for col in gen_test for part in col.split("|")})

In [None]:
#Collapse the multi-genre dummy columns ("a|b|c") of test into one column per
#individual genre id.
single_genres_test = set(diff_gens_test)
for col in gen_test:
    if col in single_genres_test:
        #Already a single-genre column; nothing to split.
        continue
    for v in [x.strip() for x in col.split("|")]:
        #Check the frame's current columns, not a snapshot taken before the
        #loop: a genre column created by an earlier combined column must be
        #added to, not overwritten.
        if v in test_final.columns:
            test_final[v] = test_final[col]+test_final[v]
        else:
            test_final[v] = test_final[col]
    del test_final[col]

In [None]:
#Make train and test carry the same set of columns
tr_co = train_end.columns.tolist()
te_co = test_final.columns.tolist()

#Columns present only in train, listed in train column order
left = [name for name in tr_co if name not in te_co]
for name in left:
    print(name)

#Columns present only in test, listed in test column order
right = [name for name in te_co if name not in tr_co]
for name in right:
    print(name)


def _add_zero_columns(frame, names, excluded):
    """Add each name except ``excluded`` to ``frame`` as an all-zero uint8 column."""
    for name in names:
        if name == excluded:
            continue
        frame[name] = 0
        frame[name] = frame[name].astype('uint8')


#target exists only in train and id only in test, so neither is copied over
_add_zero_columns(test_final, left, 'target')
_add_zero_columns(train_end, right, 'id')

In [None]:
#Counting the entries of every genre column across train and test, and printing
#summary statistics; genres with fewer than 10 entries are the ones deleted in
#the next cell.
train_column = train_end.columns.tolist()
test_column = test_final.columns.tolist()
#Genre columns are recognised by their uint8 dtype (train first, then test)
gens = [col for col in train_column if train_end[col].dtype == 'uint8']
gens += [col for col in test_column if test_final[col].dtype == 'uint8']

#Series.sum() is vectorised; the builtin sum() walked every row in Python.
gen_count = {}
for col in gens:
    gen_count[col] = int(train_end[col].sum())+int(test_final[col].sum())

if not gen_count:
    raise ValueError("no uint8 genre columns found in train_end / test_final")

#min()/max() over all counts; no need to seed from a specific genre id, which
#raised KeyError whenever that column was absent.
totals = list(gen_count.values())
count = len(totals)
mini = min(totals)
maxi = max(totals)
avg = int(sum(totals)/count)
print(maxi)
print(mini)
print(avg)

#Number of genres whose count is below the (truncated) average
below_avg = [total for total in totals if total < avg]
avg_big = len(below_avg)
print(avg_big)
print(count)

#Average count among the below-average genres (0 when there are none, which
#previously raised ZeroDivisionError)
navg = int(sum(below_avg)/len(below_avg)) if below_avg else 0
print(navg)

#Number of genres with fewer than 10 entries
s_100 = sum(1 for total in totals if total < 10)
print(s_100)
print(count)

In [None]:
#Remove from both frames every genre column whose combined count is below 10
rare_genres = [name for name, total in gen_count.items() if total < 10]
for name in rare_genres:
    train_end.drop(columns=name, inplace=True)
    test_final.drop(columns=name, inplace=True)
    print(name)

In [None]:
#Setting X-Train and Y-Train
X_train = train_end.drop(['target'], axis=1)
y_train = train_end['target'].values


#Reorder the test features to the training column order. The genre columns are
#created in a different order in the two frames, and LightGBM is understood to
#match DataFrame features by position rather than by name, so predictions would
#otherwise read the wrong columns. Indexing raises KeyError if test lacks a
#training column.
X_test = test_final.drop(['id'], axis=1)[X_train.columns.tolist()]
ids = test_final['id'].values

d_train_final = lgb.Dataset(X_train, y_train)
#NOTE(review): the watchlist is built from the same rows as the training set,
#so the metric printed during training is a training metric, not a validation
#one - consider a held-out split.
watchlist_final = lgb.Dataset(X_train, y_train)

In [None]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'
    }

%time model_f1 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=5)

In [None]:
params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'dart',
        'learning_rate': 0.3 ,
        'verbose': 0,
        'num_leaves': 108,
        'bagging_fraction': 0.95,
        'bagging_freq': 1,
        'bagging_seed': 1,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 1,
        'max_bin': 256,
        'max_depth': 10,
        'num_rounds': 200,
        'metric' : 'auc'
    }

%time model_f2 = lgb.train(params, train_set=d_train_final,  valid_sets=watchlist_final, verbose_eval=5)

In [None]:
#Making predictions with both models and averaging them element-wise

p_test_1 = model_f1.predict(X_test)
p_test_2 = model_f2.predict(X_test)
stacked_predictions = np.stack((p_test_1, p_test_2))
p_test_avg = stacked_predictions.mean(axis = 0)


In [None]:
#Creating the submission file (gzip-compressed CSV with id and target columns)
#data_path was used here without being defined anywhere in the notebook, which
#raises NameError on a fresh kernel; it now points at the directory the input
#CSVs are read from.
data_path = 'kkbox-music-recommendation-challenge/'
subm = pd.DataFrame({'id': ids, 'target': p_test_avg})
subm.to_csv(data_path + 'submission_lgbm_avg.csv.gz', compression = 'gzip', index=False, float_format = '%.5f')