In [148]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file of the dataset. os.walk() must be given a directory: when it is
# pointed at the CSV file itself the scan error is silently ignored and nothing
# is printed, so walk the dataset folder instead.
for dirname, _, filenames in os.walk('/kaggle/input/prediction-of-music-genre'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [149]:
#IMPORT LIBRARIES

# linear algebra
import numpy as np 

# data processing
import pandas as pd 
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# data visualization
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
from sklearn.metrics import ConfusionMatrixDisplay

In [150]:
# Load the raw music-genre dataset from the Kaggle input directory.
DF = pd.read_csv('/kaggle/input/prediction-of-music-genre/music_genre.csv')

# Work on a copy so the raw frame DF is not modified by the cleaning steps applied to df.
df = DF.copy()

In [151]:
# Preview the first rows of the working frame.
df.head()

In [152]:
# Column names, dtypes and non-null counts.
df.info()

In [153]:
# Summary statistics of the frame's columns.
df.describe()

In [154]:
# Show a single row so the column names can be read off for the feature list that follows.
df.head(1)

Features:

    Categorical:

    1. Instance ID
    2. Artist name
    3. Track name
    4. Key
    5. Mode
    6. Obtained date

Thus a total of 6 columns are categorical variables.

    Numeric:

    1. Popularity
    2. Acousticness
    3. Danceability
    4. Duration (in milliseconds)
    5. Energy
    6. Instrumentalness
    7. Liveness
    8. Loudness
    9. Speechiness
    10. Tempo
    11. Valence

A total of 11 columns are numeric.

    Target:
    
    Music Genre




In [118]:
#EXPLORATORY DATA ANALYSIS

# EDA : Histogram - for continuous variables

df_cont = df[['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence']]

def draw_countplots(df, variables, n_rows, n_cols):
    """Draw one 20-bin histogram per variable on an ``n_rows`` x ``n_cols`` grid.

    Parameters
    ----------
    df : DataFrame
        Frame holding the columns to plot.
    variables : iterable of str
        Column names of ``df``; one subplot is drawn per name, in order.
    n_rows, n_cols : int
        Shape of the subplot grid; it must offer at least ``len(variables)`` cells.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # Apply the style before any axes are created: when it was set inside the
    # loop, the first subplot already existed and did not pick it up.
    sns.set_style('darkgrid')
    fig = plt.figure(figsize=(20, 12))
    for i, var_name in enumerate(variables):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        # Draw on the explicit axes instead of relying on pyplot's "current axes".
        sns.histplot(data=df, x=var_name, bins=20, color='indigo', ax=ax)
        ax.set_title(var_name + " Histogram")
        ax.set_ylabel("Count")
    fig.tight_layout()
    plt.show()

draw_countplots(df_cont, df_cont.columns, 4, 4)

In [119]:
# EDA: Box plot - for continuous variables

# 2 x 5 grid, flattened so the panels can be addressed with a single position.
fig, ax = plt.subplots(nrows = 2, ncols = 5, figsize = (20, 10))
ax = ax.flatten()

for index, col in enumerate(df_cont.columns):
    sns.boxplot(data = df_cont, y = col, palette = 'Greens', ax = ax[index])
# Leave the module-level counter at the number of panels drawn, as before.
index = len(df_cont.columns)
plt.tight_layout(pad = 1, w_pad=0.7, h_pad=5.0)

In [120]:
#EDA : Violin plot - for continuous variables

# One violin per continuous column, laid out on a flattened 2 x 5 grid.
fig, ax = plt.subplots(nrows = 2, ncols = 5, figsize = (20, 10))
ax = ax.flatten()

for panel, col in zip(ax, df_cont.columns):
    sns.violinplot(data = df_cont, y = col, palette = 'Purples', ax = panel)
# Leave the module-level counter at the number of panels drawn, as before.
index = len(df_cont.columns)
plt.tight_layout(pad = 1, w_pad=0.7, h_pad=5.0)

In [155]:
#DATA CLEANING

#code to find out missing values in terms of percentage
null_flags = df.isnull()
null_counts = null_flags.sum()

# Absolute number of missing values per column, largest first.
total = null_counts.sort_values(ascending=False)
# Share of missing values per column in percent, rounded to one decimal.
percent_1 = null_counts / null_flags.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

missing_data = pd.concat([total, percent_2], keys=['Total', '%'], axis=1)
missing_data.head(10)

Since the above table lists only 5 NA values we can locate and delete them from the dataset

In [156]:
# Boolean mask of rows that repeat an earlier row, followed by a preview of those rows.
duplicates = df.duplicated()
df[duplicates].head(10)

Rows 10000 to 10004 are removed, since they contain only NaN values.

In [157]:
# Remove the rows that are entirely NaN instead of hard-coding their index labels.
# This re-binds df rather than mutating it in place and is safe to re-run, whereas
# dropping fixed labels raises KeyError on a second execution of the cell.
df = df.dropna(how = "all")

Now, we can recheck for missing values :

In [158]:
#Recheck for missing values
null_flags = df.isnull()
null_counts = null_flags.sum()

# Same table as before the row removal: absolute counts and rounded percentages.
total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_flags.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

missing_data = pd.concat([total, percent_2], keys=['Total', '%'], axis=1)
missing_data.head(10)

Now, we have taken care of missing values. We can also drop the following columns since they don't give any information -
1. obtained_date
2. instance id
3. track name


In [160]:
# Identifier-like columns carry no signal for genre prediction; remove them by name.
df = df.drop(columns = ["instance_id", "track_name", "obtained_date"])

We can now check for entries in music genre and artist name columns to check if the values are unique or not

In [161]:
# Number of tracks per genre, sorted from most to least frequent.
genre_unique = df["music_genre"].value_counts().sort_values(ascending = False)

In [162]:
# Display the per-genre counts.
genre_unique

In [163]:
# Number of tracks per artist, sorted from most to least frequent.
artists_unique = df["artist_name"].value_counts().sort_values(ascending = False)

In [164]:
# Display the per-artist counts.
artists_unique

As we can see, music genre has proper unique values. However, there are a lot of empty fields in artist unique values. We need to drop these fields so that
our model could function reasonably.

In [165]:
# Keep only the rows whose artist_name is not the "empty_field" placeholder.
# NOTE(review): artist_name itself is dropped further down before modelling, so this
# filter only shrinks the dataset - confirm that removing these rows is intended.
df = df[df["artist_name"] != "empty_field"]

In [166]:
# Recompute the per-artist counts after the "empty_field" rows were removed and display them.
artists_unique = df["artist_name"].value_counts().sort_values(ascending = False)
artists_unique

After looking through the dataset, there were some question marks in certain entries of tempo column.

In [167]:
# Rows whose tempo is recorded as the literal string "?".
df_qm = df[df["tempo"] == "?"]
df_qm

Since we cannot randomly assign tempo values to any song based on available data, we would have to drop the rows that contain these question marks.

In [168]:
# Keep only rows with a known tempo, then convert the column to numbers: the "?"
# placeholders mean the column holds strings, and it would otherwise stay
# non-numeric even after the placeholders are gone.
df = (
    df[df["tempo"] != "?"]
    .assign(tempo = lambda frame: pd.to_numeric(frame["tempo"]))
)

In [169]:
# Sanity check: per-column count() over the rows whose tempo still equals "?";
# every count should be zero once those rows have been removed.
df_qm = df[df["tempo"] == "?"].count()
df_qm

Now, we have a clean dataset that can be used for model building and prediction. Also, we can do EDA after this cleaning so that the plots are updated.

In [170]:
# EDA : Histogram - for continuous variables

# Re-select the continuous columns from the cleaned frame so the plots reflect the cleaning.
df_cont = df[['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'valence']]

# draw_countplots is already defined in the first histogram cell of the EDA section;
# it is reused here instead of being defined a second time, so the notebook keeps a
# single definition that cannot drift out of sync.
draw_countplots(df_cont, df_cont.columns, 4, 4)

As we can see, all the histograms look similar to the ones before. Thus, we can start with preprocessing the data so that it gets ready to be used for machine learning models.

In [171]:
#DROPPING STRING COLUMNS
# artist_name is free text and is not used as a model feature.
df = df.drop(columns = "artist_name")

In order to obtain numeric values, we can use label encoding for both mode as well as key column.

In [176]:
#LABEL ENCODING (each category is replaced by an integer code; this is not one-hot encoding)
from sklearn.preprocessing import LabelEncoder

# A fresh encoder is fitted per column and not stored, so the integer -> category
# mapping of "mode" and "key" is not kept for later inspection.
df["mode"] = LabelEncoder().fit_transform(df['mode'])
df["key"] = LabelEncoder().fit_transform(df["key"])

In [177]:
# Preview the frame after encoding "mode" and "key".
df.head()

In [219]:
#IMPORT LIBRARIES FOR PREPROCESSING AND MODEL BUILDING
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [178]:
#SPLIT FEATURES AND TARGETS
# Target: the genre column. Features: every remaining column.
df_target = df["music_genre"]
df_features = df.drop(columns = "music_genre")


In [179]:
#SCALING
# Standardise every feature column with sklearn's StandardScaler.
# NOTE(review): fit_transform is applied to all rows of df_features, so the scaler's
# statistics also include rows that later end up in validation/test data.
scaler = StandardScaler()
df_features_scaled = scaler.fit_transform(df_features)

In [182]:
#SPLITTING DATA INTO TRAIN, TEST AND VALIDATION SET
from sklearn.model_selection import train_test_split

# Fixed seed so the splits (and every score computed from them) are reproducible on re-run.
SPLIT_SEED = 42

# Split the *unscaled* features first: 10% test, then a validation set of the same
# size as the test set, both stratified by genre.
tr_val_f, test_features, tr_val_l, test_labels = train_test_split(
    df_features, df_target, test_size = 0.1, stratify = df_target, random_state = SPLIT_SEED)

train_features, val_features, train_labels, val_labels = train_test_split(
    tr_val_f, tr_val_l, test_size = len(test_labels), stratify = tr_val_l, random_state = SPLIT_SEED)

# Fit the scaler on the training split only and apply it to all three splits, so no
# statistics from validation/test rows leak into the features the models are trained on.
scaler = StandardScaler().fit(train_features)
train_features = scaler.transform(train_features)
val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)

train_features.shape, train_labels.shape, val_features.shape, val_labels.shape, test_features.shape, test_labels.shape

We will use the **F1 score** instead of accuracy here, since it is a more informative metric when the samples are spread over many classes.

In [218]:
#MODELLING
from sklearn.metrics import make_scorer, f1_score, accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score

# Weighted F1 wrapped as a scorer object so it can be passed to GridSearchCV.
f1 = make_scorer(f1_score, average = "weighted")

#Declaring parameters for hyperparameter tuning
params = {
    "n_estimators": list(range(10, 36, 5)),      # 10, 15, ..., 35
    "max_depth": list(range(5, 26, 5)),          # 5, 10, ..., 25
    "min_samples_leaf": list(range(1, 6)),       # 1 .. 5
}

In [206]:
# Exhaustive search over `params` for a random forest, scored by weighted F1 with 5-fold CV.
grid_search = GridSearchCV(
    estimator = RandomForestClassifier(),
    param_grid = params,
    scoring = f1,
    cv = 5,
)

In [207]:
# Run the search on the training split (6 * 5 * 5 = 150 parameter combinations, 5 folds each).
grid_search.fit(train_features, train_labels)

In [208]:
# Parameter combination with the best cross-validated score.
grid_search.best_params_

In [238]:
# Random forest with explicitly chosen hyperparameters, fitted on the training split.
rf_settings = {"n_estimators": 35, "max_depth": 15, "min_samples_leaf": 5}
model1 = RandomForestClassifier(**rf_settings)

model1.fit(train_features, train_labels)

In [239]:
def classification_task(estimator, features, labels):
    """Print accuracy and weighted F1 of a fitted estimator on one data split.

    estimator: fitted model exposing ``predict``.
    features:  inputs handed to ``estimator.predict``.
    labels:    ground-truth labels the predictions are compared against.
    """
    predicted = estimator.predict(features)
    scores = (
        ("Accuracy", accuracy_score(labels, predicted)),
        ("F1 score", f1_score(labels, predicted, average = 'weighted')),
    )
    for metric_name, metric_value in scores:
        print(f"{metric_name}: {metric_value}")

In [240]:
# Random forest scores on the training split.
classification_task(model1, train_features, train_labels)

In [241]:
# Random forest scores on the validation split.
classification_task(model1, val_features, val_labels)

In [242]:
# Random forest scores on the held-out test split.
classification_task(model1, test_features, test_labels)

In [276]:
# Per-class precision / recall / F1 of the random forest on the test split.
print("Random Forest Classifier:\n")
print(classification_report(test_labels, model1.predict(test_features)))

In [281]:
# Confusion matrix of the random forest (model1) on the test split.
# sklearn's confusion_matrix puts the true labels on the rows and the predictions on
# the columns, so "Actual" belongs on the y-axis and "Predicted" on the x-axis.
# Tick labels come from model1.classes_; a bare `model` is not defined in the cells above.
plt.figure(figsize = (10, 10))
sns.heatmap(confusion_matrix(test_labels, model1.predict(test_features), labels = model1.classes_),
    annot = True,
    fmt = ".0f",
    cmap = "BuPu",
    linewidths = 2,
    linecolor = "white",
    xticklabels = model1.classes_,
    yticklabels = model1.classes_)
plt.title("Random Forest confusion matrix")
plt.xlabel("Predicted values")
plt.ylabel("Actual values")
plt.tight_layout()
plt.show()

XGBoost classifier:

In [245]:
from xgboost import XGBClassifier

# XGBoost classifier created without arguments, i.e. with the library's default hyperparameters.
model2 = XGBClassifier()

In [246]:
# Display the estimator to inspect its settings.
model2

In [247]:
#Converting to label encoders
# Fit the encoder once, on the training labels, and reuse that same mapping for the
# validation and test labels. Re-fitting on every split only yields consistent codes
# when each split happens to contain exactly the same set of classes.
le = LabelEncoder()
e_train_labels = le.fit_transform(train_labels)
e_val_labels = le.transform(val_labels)
e_test_labels = le.transform(test_labels)

In [248]:
# Train the default XGBoost model on the integer-encoded training labels.
model2.fit(train_features, e_train_labels)

In [249]:
# Default XGBoost scores on the training split.
classification_task(model2, train_features, e_train_labels)

In [250]:
# Default XGBoost scores on the validation split.
classification_task(model2, val_features, e_val_labels)

In [251]:
# Default XGBoost scores on the held-out test split.
classification_task(model2, test_features, e_test_labels)

In [267]:
# Second XGBoost model with explicitly chosen hyperparameters.
model3 = XGBClassifier(n_estimators=100, max_depth=15, learning_rate=0.2, subsample=0.5)

In [268]:
# Train the customised XGBoost model on the integer-encoded training labels.
model3.fit(train_features, e_train_labels)

In [269]:
# Customised XGBoost scores on the training split.
classification_task(model3, train_features, e_train_labels)

In [270]:
# Customised XGBoost scores on the validation split.
classification_task(model3, val_features, e_val_labels)

In [271]:
# Customised XGBoost scores on the held-out test split.
classification_task(model3, test_features, e_test_labels)

In [274]:
# Report for the XGBoost model trained last (model3). The original cell printed the
# random forest's (model1) predictions under this heading. XGBoost predicts the
# integer-encoded classes, so compare against e_test_labels and map the codes back
# to genre names through the encoder's classes_.
print('XGBoost report:')
print(classification_report(e_test_labels, model3.predict(test_features), target_names = le.classes_))

In [284]:
# Confusion matrix of the XGBoost model trained last (model3) on the test split.
# The original cell re-plotted the random forest (model1) and took its tick labels
# from an undefined `model`. XGBoost works on integer-encoded classes, so the matrix
# is built from e_test_labels and the ticks are the encoder's genre names.
# Rows are the true labels and columns the predictions.
xgb_test_predictions = model3.predict(test_features)

plt.figure(figsize = (10, 10))
sns.heatmap(confusion_matrix(e_test_labels, xgb_test_predictions),
    annot = True,
    fmt = ".0f",
    cmap = "BuPu",
    linewidths = 2,
    linecolor = "white",
    xticklabels = le.classes_,
    yticklabels = le.classes_)
plt.title("XGBoost confusion matrix")
plt.xlabel("Predicted values")
plt.ylabel("Actual values")
plt.tight_layout()
plt.show()

In [295]:
#Random forest AUC Score
# Class probabilities on the test split, scored as one-vs-rest multi-class AUC.
predicted_labels_01 = model1.predict_proba(test_features)
roc_auc_score(test_labels, predicted_labels_01, multi_class = "ovr")

In [296]:
#Default XGBoost AUC Score
# Class probabilities on the test split, scored as one-vs-rest multi-class AUC.
predicted_labels_02 = model2.predict_proba(test_features)
roc_auc_score(test_labels, predicted_labels_02, multi_class = "ovr")

In [301]:
#Changed parameters XGBoost AUC Score
# Class probabilities on the test split, scored as one-vs-rest multi-class AUC.
predicted_labels_03 = model3.predict_proba(test_features)
roc_auc_score(test_labels, predicted_labels_03, multi_class = "ovr")

In [329]:
import scikitplot as skplt

#model1 = randomforest, model2 = default XGBoost, model3 = Specific XGBoost
models = [model1, model2, model3]
model_names = ["Random Forest", "Default XGBoost", "Specific XGBoost"]

for model_name, m in zip(model_names, models):
    predicted_labels = m.predict_proba(test_features)
    # Report the score instead of discarding it: inside a loop a bare expression is not displayed.
    auc = roc_auc_score(test_labels, predicted_labels, multi_class = "ovr")
    print(f"{model_name} AUC: {auc}")
    # Plot the probabilities of the model being iterated over; the original always
    # plotted predicted_labels_03 and therefore drew the same curves three times.
    skplt.metrics.plot_roc(test_labels, predicted_labels, title = f"ROC Curves - {model_name}")
    plt.tight_layout(pad = 1, w_pad=0.7, h_pad=5.0)
    plt.show()

Thus, the default XGBoost model proved to be a good model, with an AUC score of **93%**. This indicates that XGBoost is a little better than random forest for this dataset.

Further improvements could be done to this model by using hyperparameter tuning for XGBoost along with K Fold Validation.

In [252]:
#parameters for hyperparameter tuning
# Candidate values per XGBoost hyperparameter, keyed by the estimator's argument name.
params_xgboost = dict(
    learning_rate = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth = [3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight = [1, 3, 5, 7],
    gamma = [0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree = [0.3, 0.4, 0.5, 0.7],
)