##   Genre Finder Model
### This model predicts a song's genre from a set of quantitative values describing the track's musical attributes
### Originally built for a past Kaggle competition

In [5]:
import numpy as np
import pandas as pd
import sklearn as skl
import os
from feature_engine.creation import CyclicalFeatures
from sklearn.model_selection import train_test_split as tts
#   loading dataframe
#   Read every CSV found in Data/training-data (relative to the current working
#   directory) and stack them into a single frame.
#   Paths are built explicitly instead of calling os.chdir, so a failed read can
#   no longer leave the notebook sitting inside the data directory.
path = os.getcwd()
train_dir = os.path.join(path, 'Data', 'training-data')
#   sorted() keeps the row order independent of the OS directory listing order
csv_files = sorted(f for f in os.listdir(train_dir) if f.endswith('.csv'))
if not csv_files:
    #   pd.concat would otherwise fail with a much less helpful message
    raise FileNotFoundError(f'no .csv files found in {train_dir}')
dfs = [pd.read_csv(os.path.join(train_dir, csv_name)) for csv_name in csv_files]
df = pd.concat(dfs, ignore_index=True)

In [6]:
##   data cleaning
#   Turns the raw training frame `df` into numeric model inputs and then splits
#   it into train/test features and labels.

#   remove unneeded features
remove_cols = ['artist_name', 'track_name', 'track_id']
df = df.drop(columns=remove_cols)

#   fix tempo: entries that cannot be parsed as numbers become NaN, and every
#   row that still contains a NaN (in any column) is dropped
df['tempo'] = pd.to_numeric(df['tempo'], errors='coerce')
df = df.dropna()

#   fix time_signature: values such as '4-Apr' are rewritten to '4/4'
df['time_signature'] = df['time_signature'].str.replace('-Apr', '/4', regex=False)
time_encoded = pd.get_dummies(df['time_signature'], prefix='time', dtype=float)
#   Always expose a 'time_0/4' column, but only add the all-zero placeholder when
#   the data does not already produce one (avoids a duplicate column name).
if 'time_0/4' not in time_encoded.columns:
    time_encoded.insert(0, 'time_0/4', 0.0)
df = pd.concat([df.drop(columns='time_signature'), time_encoded], axis=1)

#   combine 'key' and 'mode' into a single label such as 'C# Minor'
df['key'] = df['key'].astype(str) + ' ' + df['mode']
df = df.drop(columns='mode')

#   cyclical encoding of key
#   keys[i] is mapped to weights[i]
keys = ['C# Major', 'A# Minor', 'F# Major', 'D# Minor', 'B Major', 'G# Minor', 'E Major', 'C# Minor', 'A Major', 'F# Minor', 'D Major', 'B Minor', 'G Major', 'E Minor',
            'C Major', 'B# Major', 'A Minor', 'F Major', 'E# Major',  'D Minor', 'A# Major',  'G Minor', 'B# Minor', 'D# Major', 'C Minor', 'E# Minor', 'F Minor', 'G# Major']
weights = [7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0, 0, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8]
#   Assign the mapped column back explicitly: an in-place replace on df['key']
#   is a chained assignment and is not guaranteed to reach the parent frame.
key_weights = dict(zip(keys, weights))
encoded_key = df['key'].map(key_weights)
if encoded_key.isna().any():
    unknown_keys = sorted(df.loc[encoded_key.isna(), 'key'].astype(str).unique())
    raise ValueError(f'unrecognised key/mode combinations: {unknown_keys}')
df['key'] = encoded_key
#   NOTE(review): no explicit period is passed to CyclicalFeatures; if it infers
#   the period from the column maximum (11 here), weights 0 and 11 may land on
#   the same point. Confirm, and if so set max_values in BOTH this cell and the
#   kaggle cleaning cell so the two encodings stay identical.
cyc = CyclicalFeatures(variables=None, drop_original=True)
x = cyc.fit_transform(pd.DataFrame(df['key']))
df = pd.concat([df, x], axis=1)
df = df.drop(columns='key')

#   split into features/labels
labels = df['genre']
features = df.drop(columns='genre')


#   split data into training and test sets (20% held out)
train_x, test_x, train_y, test_y = tts(features, labels, test_size=0.2)

In [7]:
from sklearn.impute import SimpleImputer

#   duration_ms: values equal to -1 are treated as missing and replaced by the
#   median learned from the training split.
imp = SimpleImputer(missing_values=-1, strategy='median')

#   work on explicit copies so the column assignments below never write into a
#   view of the original `features` frame
train_x = train_x.copy()
test_x = test_x.copy()

dura_imp = imp.fit_transform(train_x[['duration_ms']])
train_x['duration_ms'] = dura_imp.ravel()

#   transform only: the held-out split must reuse the training median rather
#   than re-fitting the imputer on test data
dura_imp_test = imp.transform(test_x[['duration_ms']])
test_x['duration_ms'] = dura_imp_test.ravel()

In [9]:
from sklearn import ensemble as ens
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

#   build and train model
#   every column except the first one is used as a model input
feats = train_x.iloc[:, 1:]

#   encode labels: learn the class list from the distinct training genres, then
#   turn each training label into its integer code
enc = preprocessing.LabelEncoder()
newlabels = enc.fit(train_y.unique()).transform(train_y)

#   the features are passed to the classifier as-is (no scaling step)
xgb_settings = {
    'eta': 0.1,
    'max_depth': 9,
    'min_child_weight': 10,
    'alpha': 0.1,
}
xgb = XGBClassifier(**xgb_settings)
model = xgb.fit(feats, newlabels)

In [None]:
#   test model
#   Score the fitted model on the held-out split, skipping the first column of
#   test_x just as the training cell does for train_x.
testfeats = test_x.iloc[:, 1:]
#   Reuse the encoder fitted on the training labels. Re-fitting it on test_y
#   would renumber the classes whenever the test split lacks one of them, and
#   would overwrite the mapping the model was trained with.
testlabels = enc.transform(test_y)
score = model.score(testfeats, testlabels)
score

In [16]:
#   kaggle  predictor
#   Read every CSV found in Data/test-data (relative to the current working
#   directory) and stack them into `kag_data`.
#   Paths are built explicitly instead of calling os.chdir, so a failed read can
#   no longer leave the notebook sitting inside the data directory.
path = os.getcwd()
kag_dir = os.path.join(path, 'Data', 'test-data')
#   sorted() keeps the row order independent of the OS directory listing order
csv_files = sorted(f for f in os.listdir(kag_dir) if f.endswith('.csv'))
if not csv_files:
    #   pd.concat would otherwise fail with a much less helpful message
    raise FileNotFoundError(f'no .csv files found in {kag_dir}')
#   built with a comprehension so the training frame `df` is not overwritten
dfs = [pd.read_csv(os.path.join(kag_dir, csv_name)) for csv_name in csv_files]
kag_data = pd.concat(dfs, ignore_index=True)

In [17]:
##   data cleaning
#   Applies the same feature preparation to the kaggle frame as the training
#   cell applies to the training frame; rows are never dropped here.

#   remove unneeded features
remove_cols = ['artist_name', 'track_name', 'track_id']
kag_clean = kag_data.drop(columns=remove_cols)

#   fix time_signature: values such as '4-Apr' are rewritten to '4/4'
kag_clean['time_signature'] = kag_clean['time_signature'].str.replace('-Apr', '/4', regex=False)

#   combine 'key' and 'mode' into a single label such as 'C# Minor'
kag_clean['key'] = kag_clean['key'].astype(str) + ' ' + kag_clean['mode']
kag_clean = kag_clean.drop(columns='mode')

#   convert tempo to float; unparseable entries become NaN
kag_clean['tempo'] = pd.to_numeric(kag_clean['tempo'], errors='coerce')

#   onehot encoding time signature
time_encoded = pd.get_dummies(kag_clean['time_signature'], prefix='time', dtype=float)
#   the training cell always provides a 'time_0/4' column, so guarantee it here
#   too, without creating a duplicate when the data already yields one
if 'time_0/4' not in time_encoded.columns:
    time_encoded.insert(0, 'time_0/4', 0.0)
kag_clean = pd.concat([kag_clean.drop(columns='time_signature'), time_encoded], axis=1)


#   cyclical encoding of key
#   keys[i] is mapped to weights[i]
keys = ['C# Major', 'A# Minor', 'F# Major', 'D# Minor', 'B Major', 'G# Minor', 'E Major', 'C# Minor', 'A Major', 'F# Minor', 'D Major', 'B Minor', 'G Major', 'E Minor',
            'C Major', 'B# Major', 'A Minor', 'F Major', 'E# Major',  'D Minor', 'A# Major',  'G Minor', 'B# Minor', 'D# Major', 'C Minor', 'E# Minor', 'F Minor', 'G# Major']
weights = [7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0, 0, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8]
#   Assign the mapped column back explicitly (once): an in-place replace on
#   kag_clean['key'] is a chained assignment and is not guaranteed to reach the
#   parent frame.
key_weights = dict(zip(keys, weights))
encoded_key = kag_clean['key'].map(key_weights)
if encoded_key.isna().any():
    unknown_keys = sorted(kag_clean.loc[encoded_key.isna(), 'key'].astype(str).unique())
    raise ValueError(f'unrecognised key/mode combinations: {unknown_keys}')
kag_clean['key'] = encoded_key
#   NOTE(review): the encoder is fitted on the kaggle data itself, so its period
#   may differ from the one learned on the training data — confirm both cells
#   end up with the same encoding.
cyc = CyclicalFeatures(variables=None, drop_original=True)
x = cyc.fit_transform(pd.DataFrame(kag_clean['key']))
kag_clean = pd.concat([kag_clean, x], axis=1)
kag_clean = kag_clean.drop(columns='key')

In [18]:
from sklearn.impute import KNNImputer
#   fill the gaps in the kaggle features; both imputers are fitted on the
#   kaggle data itself
#   NOTE(review): the KNN imputer only ever sees the single 'tempo' column here,
#   so it has no other feature to measure neighbours with — confirm this is intended
imputer3 = KNNImputer(n_neighbors=3)
imputer2 = SimpleImputer(missing_values=-1.0, strategy='median')

#   tempo: NaN entries are imputed
temp = kag_clean[['tempo']]
t_impute = imputer3.fit_transform(temp)
kag_clean['tempo'] = t_impute

#   duration_ms: entries equal to -1.0 are replaced by the column median
dura = kag_clean[['duration_ms']]
impute = imputer2.fit_transform(dura)
kag_clean['duration_ms'] = impute

In [19]:
#   make predictions
#   first column is kept as the identifier, the remaining columns are the model inputs
#   NOTE(review): assumes the first column of kag_clean is named 'instance_id' — confirm
inst = pd.DataFrame(kag_clean.iloc[:,0], columns=['instance_id'])
kag_feat = kag_clean.iloc[:,1:]
#   Fit a fresh encoder on the training labels so the integer predictions are
#   decoded with the same class list used when the model was trained.
enc = preprocessing.LabelEncoder()
enc.fit(train_y.unique())
#   The model was fitted on unscaled features, so it is fed unscaled features
#   here as well; the previously computed (and never used) RobustScaler output
#   has been removed.
pred_labels = model.predict(kag_feat)
strlabels = enc.inverse_transform(pred_labels)
kag_labels = pd.DataFrame(strlabels, columns=['genre'])

In [20]:
#   attach the decoded genre to each identifier row and export the result as a
#   CSV file without the index column
inst = inst.assign(genre=strlabels)
inst.to_csv("kaggle-final.csv", index=False)