In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from rfpimp import permutation_importances
import optuna
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from category_encoders.cat_boost import CatBoostEncoder
from feature_engine.outliers import Winsorizer



In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('submission.csv')

In [3]:
df = pd.concat([train, test], axis= 0)

In [4]:
df.Popularity = df.Popularity.fillna(value=df.Popularity.median(), axis=0)
df.key = df.key.fillna(value=df.key.median(), axis=0)
df.instrumentalness = df.instrumentalness.fillna(value=df.instrumentalness.median(), axis=0)

In [5]:
df['popularity_by_artist_max']= df.groupby(['Artist Name'])['Popularity'].transform('max')
df['dance_by_artist_mean']= df.groupby(['Artist Name'])['danceability'].transform('mean')
df['energy_by_artist_mean']= df.groupby(['Artist Name'])['energy'].transform('mean')
df['key_by_artist_mean']= df.groupby(['Artist Name'])['key'].transform('mean')
df['loud_by_artist_mean']= df.groupby(['Artist Name'])['loudness'].transform('mean')
df['speech_by_artist_mean']= df.groupby(['Artist Name'])['speechiness'].transform('mean')
df['acostic_by_artist_mean']= df.groupby(['Artist Name'])['acousticness'].transform('mean')
df['instrument_by_artist_mean']= df.groupby(['Artist Name'])['instrumentalness'].transform('mean')
df['live_by_artist_mean']= df.groupby(['Artist Name'])['liveness'].transform('mean')
df['valence_by_artist_mean']= df.groupby(['Artist Name'])['valence'].transform('mean')
df['tempo_by_artist_mean']= df.groupby(['Artist Name'])['tempo'].transform('mean')
df['duration_by_artist_mean']= df.groupby(['Artist Name'])['duration_in min/ms'].transform('mean')
df['time_by_artist_mean']= df.groupby(['Artist Name'])['time_signature'].transform('mean')

In [7]:
df['song_in_min']= df['duration_in min/ms']/60000

In [8]:
df['pops']= df.Popularity + df.danceability + df.energy  + df.loudness + df.speechiness + df.acousticness+ df.instrumentalness + df.liveness + df.valence + df.tempo

In [9]:
df['song_by_artist_mean']= df.groupby(['Artist Name'])['song_in_min'].transform('mean')
df['pops_by_artist_mean']= df.groupby(['Artist Name'])['pops'].transform('mean')

In [10]:
df['popularity_by_singer_and_track_mean']= df.groupby(['Artist Name', 'Track Name'])['Popularity'].transform('mean')

In [11]:
keys= pd.get_dummies(df.key, drop_first=True, prefix= 'key_')
df = df.drop('key', axis= 1)
df= pd.concat([df,keys], axis=1)

In [12]:
times= pd.get_dummies(df.time_signature, drop_first=True, prefix= 'time_')
df = df.drop('time_signature', axis= 1)
df= pd.concat([df,times], axis=1)

In [13]:
df['artist_Trck']= df['Artist Name'].astype(str) + '_' + df['Track Name'].astype(str)

In [None]:
[var for var in df.columns if df[var].dtypes == 'O']

In [14]:
df['tracks_by_artists']= df.groupby('Artist Name')['Track Name'].transform('count')

In [15]:
train = df[:17996]
test = df[17996:]

In [17]:
[var for var in train.columns if train[var].dtypes== 'O']

['Artist Name', 'Track Name', 'artist_Trck']

In [18]:
cat_cols = ['Artist Name', 'Track Name', 'artist_Trck']

In [19]:
[var for var in train.columns if train[var].dtypes!= 'O']

['Popularity',
 'danceability',
 'energy',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'duration_in min/ms',
 'Class',
 'popularity_by_artist_max',
 'dance_by_artist_mean',
 'energy_by_artist_mean',
 'key_by_artist_mean',
 'loud_by_artist_mean',
 'speech_by_artist_mean',
 'acostic_by_artist_mean',
 'instrument_by_artist_mean',
 'live_by_artist_mean',
 'valence_by_artist_mean',
 'tempo_by_artist_mean',
 'duration_by_artist_mean',
 'time_by_artist_mean',
 'song_in_min',
 'pops',
 'song_by_artist_mean',
 'pops_by_artist_mean',
 'popularity_by_singer_and_track_mean',
 'key__2.0',
 'key__3.0',
 'key__4.0',
 'key__5.0',
 'key__6.0',
 'key__7.0',
 'key__8.0',
 'key__9.0',
 'key__10.0',
 'key__11.0',
 'time__1',
 'time__3',
 'time__4',
 'time__5',
 'tracks_by_artists']

In [32]:
cont_cols = ['Popularity',
 'danceability',
 'energy',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'duration_in min/ms',
 'popularity_by_artist_max',
 'dance_by_artist_mean',
 'energy_by_artist_mean',
 'key_by_artist_mean',
 'loud_by_artist_mean',
 'speech_by_artist_mean',
 'acostic_by_artist_mean',
 'instrument_by_artist_mean',
 'live_by_artist_mean',
 'valence_by_artist_mean',
 'tempo_by_artist_mean',
 'duration_by_artist_mean',
 'time_by_artist_mean',
 'song_in_min',
 'pops',
 'song_by_artist_mean',
 'pops_by_artist_mean',
 'popularity_by_singer_and_track_mean',
 'key__2.0',
 'key__3.0',
 'key__4.0',
 'key__5.0',
 'key__6.0',
 'key__7.0',
 'key__8.0',
 'key__9.0',
 'key__10.0',
 'key__11.0',
 'time__1',
 'time__3',
 'time__4',
 'time__5',
 'tracks_by_artists']

In [36]:
X_train = train[cat_cols+cont_cols]
y_train = train.Class
X_test = test[cat_cols+cont_cols]

In [37]:
n_folds = 10
subbed = []
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=10)


for fold, (train_idx, test_idx) in enumerate(kf.split(X_train, y_train)):
    print('=============== Fold No:',fold+1,'===============')
    X_tr, X_tst = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_tr, y_tst = y_train.iloc[train_idx], y_train.iloc[test_idx]
    
    model = CatBoostClassifier(n_estimators=10000, random_state=10, eval_metric= 'MultiClass', cat_features=cat_cols)
    model.fit(X_tr, y_tr,eval_set=[(X_tst, y_tst)], early_stopping_rounds=30, verbose=500)
    print(log_loss(y_tst, model.predict_proba(X_tst)))
    subbed.append(log_loss(y_tst, model.predict_proba(X_tst)))
    pred = model.predict_proba(X_test)
print(np.mean(subbed))
#0.7796464371342455
#0.776109144116776
#

Learning rate set to 0.047371
0:	learn: 2.2939349	test: 2.2937571	best: 2.2937571 (0)	total: 377ms	remaining: 1h 2m 48s
500:	learn: 0.7481398	test: 0.7758466	best: 0.7758466 (500)	total: 3m 29s	remaining: 1h 6m 3s
1000:	learn: 0.6219397	test: 0.7507902	best: 0.7507902 (1000)	total: 7m 12s	remaining: 1h 4m 44s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.7441122221
bestIteration = 1334

Shrink model to first 1335 iterations.
0.7441122220949107
Learning rate set to 0.047371
0:	learn: 2.2939603	test: 2.2947312	best: 2.2947312 (0)	total: 496ms	remaining: 1h 22m 42s
500:	learn: 0.7445404	test: 0.7692913	best: 0.7692231 (499)	total: 3m 31s	remaining: 1h 6m 51s
1000:	learn: 0.6163539	test: 0.7424348	best: 0.7424348 (1000)	total: 7m 6s	remaining: 1h 3m 56s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.7412702601
bestIteration = 1027

Shrink model to first 1028 iterations.
0.7412702601307348
Learning rate set to 0.047371
0:	learn: 2.2944790	test: 2.2

In [38]:
out = pd.DataFrame(pred)
out

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.000482,0.541627,0.012778,0.000032,0.000060,0.000782,0.116931,0.000047,0.002941,0.016747,0.307574
1,0.000031,0.005524,0.000881,0.000003,0.000007,0.001310,0.018118,0.000003,0.806412,0.002381,0.165329
2,0.000020,0.004224,0.000139,0.000005,0.000126,0.015953,0.006832,0.000003,0.001910,0.924317,0.046471
3,0.000211,0.008172,0.000328,0.000034,0.000336,0.022143,0.014413,0.000012,0.002147,0.868666,0.083538
4,0.002031,0.000234,0.000092,0.000123,0.010005,0.000638,0.000330,0.000025,0.000242,0.981666,0.004614
...,...,...,...,...,...,...,...,...,...,...,...
7708,0.000011,0.001146,0.002392,0.000005,0.000012,0.004213,0.003305,0.000002,0.000203,0.978861,0.009850
7709,0.000017,0.079585,0.034459,0.000005,0.000004,0.002520,0.133962,0.000028,0.365121,0.008791,0.375509
7710,0.116189,0.015160,0.003944,0.242357,0.015247,0.039679,0.029819,0.003744,0.000973,0.465906,0.066982
7711,0.000114,0.122930,0.041208,0.000017,0.000046,0.004730,0.341832,0.000016,0.000595,0.015946,0.472567


In [39]:
out = out.rename(columns={0:'Acoustic/Folk_0',
               1:'Alt_Music_1',
               2:'Blues_2',
               3:'Bollywood_3',
               4:'Country_4',
               5:'HipHop_5',
               6:'Indie Alt_6',
               7:'Instrumental_7',
               8:'Metal_8',
               9:'Pop_9',
               10:'Rock_10'})

In [40]:
out.to_csv('out6.csv', index= False)