# **1. Import The Required Libraries**

In [None]:
# Import the required libraries and dependencies
import pandas as pd
import numpy as np
import holoviews as hv
import hvplot.pandas
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.preprocessing import StandardScaler, label_binarize
import xgboost
import warnings
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, classification_report, roc_auc_score, roc_curve, auc, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from math import sqrt
from xgboost import XGBClassifier
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import RandomOverSampler
from torch.utils.data import DataLoader, TensorDataset
from joblib import dump, load

# Notebook-wide display settings: three-decimal floats, no truncation of columns or rows
pd.options.display.float_format = lambda x: '%.3f' % x
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# Silence every warning category for the rest of the session
warnings.filterwarnings(action = 'ignore')

# **2. Load The Data**

In [None]:
# Load the raw dataset; the relative path assumes 'Spotify.csv' sits in the working directory
df = pd.read_csv('Spotify.csv')

# **3. Data Exploration**

## **3.1 Overview**

In [None]:
# Preview the first rows of the raw data
df.head()

In [None]:
# Preview the last rows of the raw data
df.tail()

In [None]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Number of missing values per column
df.isna().sum()

In [None]:
# Audio-feature columns to inspect for missing values
columns_with_nas = [
    'danceability', 'energy', 'key', 'mode', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration',
]

# Rows that are missing at least one of those features
rows_with_nas = df[df[columns_with_nas].isna().any(axis = 'columns')]

rows_with_nas

In [None]:
# Working copy: remove the columns excluded from the analysis, then every row that still has a gap
spotify = df.drop(
    columns = [
        'Unnamed: 0', 'uri', 'artist_names', 'artist_img', 'artist_individual',
        'album_cover', 'artist_id', 'track_name', 'source', 'pivot', 'release_date', 'collab',
    ]
).dropna()

## **3.2 Understand Numerical Features**

In [None]:
# Parse the chart week (unparseable values become NaT) and derive its ISO week number
spotify = spotify.assign(week = pd.to_datetime(spotify['week'], errors = 'coerce'))
spotify = spotify.assign(week_of_year = spotify['week'].dt.isocalendar()['week'])

In [None]:
# Columns that are treated as numeric features
numeric_columns = [
    'rank', 'artists_num', 'album_num_tracks', 'peak_rank',
    'previous_rank', 'weeks_on_chart', 'streams', 'danceability',
    'energy', 'key', 'mode', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence',
    'tempo', 'duration', 'week_of_year'
]

# Force each of them to a numeric dtype; values that cannot be parsed become NaN
spotify = spotify.assign(
    **{name: pd.to_numeric(spotify[name], errors = 'coerce') for name in numeric_columns}
)

In [None]:
# Summary statistics, reporting the 10th, 25th, 50th, 75th and 95th percentiles
spotify.describe(percentiles = [0.1, 0.25, 0.5, 0.75, 0.95])

In [None]:
# Pairwise correlation between the numeric features, drawn as a heatmap
corr = spotify.loc[:, numeric_columns].corr()

sns.heatmap(data = corr, cmap = "YlGnBu")

In [None]:
# Show the correlation matrix as a table
corr

In [None]:
# Distribution of stream counts.
# `bins` is the number of histogram bars; the previous value of 10,000,000 asked the plotting
# backend to compute and draw ten million bars. 100 bins keeps the shape readable and renderable.
spotify['streams'].hvplot.hist(bins = 100, title = 'Streams Distribution').opts(xformatter = '%.0f', yformatter = '%.0f')

## **3.3 Understand Categorical Features**

### **3.3.1 Country**

In [None]:
# Spread of stream counts per country, followed by the row count of each country
plt.figure(figsize = (20, 10))
sns.stripplot(data = spotify, x = 'streams', y = 'country', hue = 'country', jitter = True, legend = False)
plt.show()
spotify['country'].value_counts()

### **3.3.2 Language**

In [None]:
# Spread of stream counts per language, followed by the row count of each language
plt.figure(figsize = (20, 10))
sns.stripplot(data = spotify, x = 'streams', y = 'language', hue = 'language', jitter = True, legend = False)
plt.show()
spotify['language'].value_counts()

### **3.3.3 Region**

In [None]:
# Spread of stream counts per region, followed by the row count of each region
plt.figure(figsize = (10, 5))
sns.stripplot(data = spotify, x = 'streams', y = 'region', hue = 'region', jitter = True, legend = False)
plt.show()
spotify['region'].value_counts()

### **3.3.4 Artist_genre**

In [None]:
# Spread of stream counts per artist genre, followed by the row count of each genre
plt.figure(figsize = (50, 10))
sns.stripplot(data = spotify, x = 'streams', y = 'artist_genre', hue = 'artist_genre', jitter = True, legend = False)
plt.show()
spotify['artist_genre'].value_counts()

In [None]:
# Keep rows unless the genre is recorded as '0' or the country/language is the 'Global' entry
spotify_filter = spotify.loc[
    ~(
        (spotify['artist_genre'] == '0')
        | (spotify['country'] == 'Global')
        | (spotify['language'] == 'Global')
    )
]

In [None]:
# Countries ordered by total streams (highest first); keep the names of the ten largest
country_top10 = (
    spotify_filter
    .groupby('country')['streams']
    .sum()
    .sort_values(ascending = False)
)
top10_country = country_top10.iloc[:10].index.tolist()
top10_country

In [None]:
# Genres ordered by total streams (highest first); keep the names of the ten largest
group_of_streams = spotify_filter.groupby('artist_genre')['streams']
sum_of_streams = group_of_streams.sum()
genre_top10 = sum_of_streams.sort_values(ascending = False)
top10_genre = genre_top10.iloc[:10].index.tolist()
top10_genre

In [None]:
# Restrict the data to rows whose country AND genre are both in their respective top ten
spotify_filter = spotify_filter[
    spotify_filter['country'].isin(top10_country)
    & spotify_filter['artist_genre'].isin(top10_genre)
]

After this basic exploratory data analysis, we need to clean and preprocess the data. This takes three steps:

1.   Encode the categorical features.
2.   Impute the missing values for both numeric and categorical features.
3.   Scale the features, which can improve our models' performance.

# **4. Feature Preprocessing**

## **4.1 Categorical Features**

In [None]:
# One-hot encode the three categorical columns as 0/1 integer indicator frames
d_country, d_language, d_region = (
    pd.get_dummies(spotify_filter[categorical]).astype(np.int64)
    for categorical in ('country', 'language', 'region')
)
#d_artist_genre = pd.get_dummies(spotify_filter['artist_genre']).astype(np.int64)

In [None]:
# Integer-encode the genre labels; codes are assigned in order of first appearance
# (0-9 for the ten genres kept above)

codes, unique = pd.factorize(spotify_filter['artist_genre'])
spotify_filter = spotify_filter.assign(genre = codes)

In [None]:
# Lookup table from integer code back to the genre name it encodes
genre_mapping = dict(enumerate(unique))

# Show the table
genre_mapping

In [None]:
# Assemble the modelling frame: remove the raw categorical columns and the week timestamp,
# append the indicator columns, and renumber the rows from zero
temp_spotify = spotify_filter.drop(columns = ['country', 'language', 'region', 'artist_genre', 'week'])
spotify_df = pd.concat([temp_spotify, d_country, d_language, d_region], axis = 1).reset_index(drop = True)
spotify_df.head()

In [None]:
# Remove any row that still contains a missing value
spotify_df = spotify_df.dropna()

In [None]:
# Confirm the dtypes and non-null counts of the modelling frame
spotify_df.info()

## **4.2 Split The Features And Target Sets Into Training And Testing Datasets.**

In [None]:
# Target: the integer genre code. Features: every remaining column.
y = spotify_df['genre']
X = spotify_df.drop(columns = 'genre')

In [None]:
# Hold out 10% of the rows as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 1)

## **4.3 Features Scaling**

In [None]:
# Standardise the features using statistics learned from the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits (the test split is transformed, never fitted)
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

## **4.3.1 Undersampling the Majority Classes**

In [None]:
# Number of rows per genre code, to show how imbalanced the classes are
spotify_df['genre'].value_counts()

In [None]:
# Cluster-centroid undersampler with a fixed seed
cc = ClusterCentroids(random_state = 1)

# Resample the scaled training data
X_under_resampled, y_under_resampled = cc.fit_resample(X = X_train_scaled, y = y_train)

In [None]:
# Class sizes after undersampling
y_under_resampled.value_counts()

## **4.3.2 Oversampling the Minority Class**

In [None]:
# Random oversampler with a fixed seed (the variable name `rus` is historical)
rus = RandomOverSampler(random_state = 42)

# Resample the scaled training data
X_over_resampled, y_over_resampled = rus.fit_resample(X = X_train_scaled, y = y_train)

In [None]:
# Class sizes after oversampling
y_over_resampled.value_counts()

# **5. Model Training**

## **5.1 Random Forest**

In [None]:
# Hyper-parameter candidates searched for the random forests
param_grid = dict(n_estimators = [55, 60, 65], max_depth = [15, 18, 20])

### **5.1.1 Undersample**

In [None]:
# 5-fold grid search over the random-forest grid on the undersampled training data
rf_under = RandomForestClassifier(max_features = 'sqrt', random_state = 2)
clf_under = GridSearchCV(rf_under, param_grid, cv = 5)
clf_under.fit(X_under_resampled, y_under_resampled)

In [None]:
# Report the winning hyper-parameters of the undersampled search
print('The best parameter for max_depth is: ', clf_under.best_params_['max_depth'], sep = '')
print('The best parameter for n_estimators is: ', clf_under.best_params_['n_estimators'], sep = '')

In [None]:
# Persist the fitted grid search; assumes the 'Model_Saved' directory already exists — TODO confirm
dump(clf_under, 'Model_Saved/rf_model_Under.joblib')

### **5.1.2 Oversample**

In [None]:
# 5-fold grid search over the random-forest grid on the oversampled training data
rf_over = RandomForestClassifier(max_features = 'sqrt', random_state = 2)
clf_over = GridSearchCV(rf_over, param_grid, cv = 5)
clf_over.fit(X_over_resampled, y_over_resampled)

In [None]:
# Report the winning hyper-parameters of the oversampled search
print('The best parameter for max_depth is: ', clf_over.best_params_['max_depth'], sep = '')
print('The best parameter for n_estimators is: ', clf_over.best_params_['n_estimators'], sep = '')

In [None]:
# Persist the fitted grid search; assumes the 'Model_Saved' directory already exists — TODO confirm
dump(clf_over, 'Model_Saved/rf_model_Over.joblib')

## **5.2 XGBooster**

In [None]:
# Hyper-parameter candidates searched for XGBoost (this rebinds the random-forest grid above)
param_grid = dict(
    n_estimators = [100, 200, 300],
    max_depth = [3, 4, 5],
    learning_rate = [0.01, 0.1, 0.2],
)

### **5.2.1 Undersample**

In [None]:
# Set aside 10% of the undersampled training data as a validation split for XGBoost
x_train_xgb, x_valid, y_train_xgb, y_valid = train_test_split(X_under_resampled, y_under_resampled, test_size = 0.1, random_state = 42)

In [None]:
# 5-fold, accuracy-scored grid search for XGBoost on the undersampled training split
xgb_clf_under = XGBClassifier()
grid_search_under = GridSearchCV(xgb_clf_under, param_grid, scoring = 'accuracy', cv = 5)
grid_search_under.fit(x_train_xgb, y_train_xgb)
# Winning parameter combination
best_params_under = grid_search_under.best_params_

In [None]:
# Show the selected hyper-parameters
best_params_under

In [None]:
# Fit the final undersampled XGBoost model with the hyper-parameters chosen by the grid search.
# GridSearchCV tunes clones of the estimator, so `xgb_clf_under` itself still holds default
# settings; without this set_params call the search result would never be used.
xgb_clf_under.set_params(**grid_search_under.best_params_)
# eval_set reports the validation metric after each boosting round
xgb_clf_under.fit(x_train_xgb, y_train_xgb, eval_set = [(x_valid, y_valid)], verbose = True)

In [None]:
# Persist the fitted booster as JSON; assumes the 'Model_Saved' directory already exists — TODO confirm
xgb_clf_under.save_model('Model_Saved/xgb_model_under.json')

### **5.2.2 Oversample**

In [None]:
# Set aside 10% of the oversampled training data as a validation split for XGBoost
# (this rebinds the split variables used in the undersampled section)
x_train_xgb, x_valid, y_train_xgb, y_valid = train_test_split(X_over_resampled, y_over_resampled, test_size = 0.1, random_state = 42)

In [None]:
# 5-fold, accuracy-scored grid search for XGBoost on the oversampled training split
xgb_clf_over = XGBClassifier()
grid_search_over = GridSearchCV(estimator = xgb_clf_over, param_grid = param_grid, scoring = 'accuracy', cv = 5)
grid_search_over.fit(x_train_xgb, y_train_xgb)
# Read the winner from the oversampled search (this previously read grid_search_under by copy-paste)
best_params_over = grid_search_over.best_params_

In [None]:
# Show the selected hyper-parameters
best_params_over

In [None]:
# Fit the final oversampled XGBoost model with the hyper-parameters chosen by the grid search.
# GridSearchCV tunes clones of the estimator, so `xgb_clf_over` itself still holds default
# settings; without this set_params call the search result would never be used.
xgb_clf_over.set_params(**grid_search_over.best_params_)
# eval_set reports the validation metric after each boosting round
xgb_clf_over.fit(x_train_xgb, y_train_xgb, eval_set = [(x_valid, y_valid)], verbose = True)

In [None]:
# Persist the fitted booster as JSON; assumes the 'Model_Saved' directory already exists — TODO confirm
xgb_clf_over.save_model('Model_Saved/xgb_model_over.json')

## **5.3 PyTorch**

In [None]:
class ClassifierNN(nn.Module):
    """Feed-forward classifier with two hidden layers that returns raw class logits.

    Architecture: Linear -> BatchNorm1d -> ReLU -> Linear -> BatchNorm1d -> ReLU -> Linear.
    All three linear layers use Xavier-uniform weight initialisation.

    ``forward`` returns unnormalised scores (logits), which is what
    ``nn.CrossEntropyLoss`` expects: that loss applies log-softmax internally, so
    feeding it already-softmaxed values flattens the gradients. Callers that need
    probabilities should apply softmax over ``dim=1`` to the output.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of both hidden layers.
        num_classes: Number of output classes (size of the logit vector).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super(ClassifierNN, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        init.xavier_uniform_(self.layer1.weight)  # Xavier initialization
        # Alternative: init.kaiming_uniform_(self.layer1.weight, mode='fan_in', nonlinearity='relu')

        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        init.xavier_uniform_(self.layer2.weight)  # Xavier initialization

        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.output_layer = nn.Linear(hidden_size, num_classes)
        init.xavier_uniform_(self.output_layer.weight)  # Xavier initialization

        # Retained so existing code that accesses `model.softmax` keeps working;
        # it is intentionally NOT applied inside forward().
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, x):
        """Return the logits for a batch ``x`` of shape (batch, input_size)."""
        out = self.relu(self.bn1(self.layer1(x)))
        out = self.relu(self.bn2(self.layer2(out)))
        # No softmax here: the loss function and downstream softmax calls expect raw scores
        return self.output_layer(out)

In [None]:
# Network dimensions passed to every ClassifierNN instance created below
input_size = X_over_resampled.shape[1]  # number of feature columns in the resampled training matrix
hidden_size = 20  # width of both hidden layers
num_classes = 10  # size of the output layer; the genre codes above run 0-9

### **5.3.1 Undersample**

In [None]:
# Network, loss and optimiser for the undersampled training data
model_under = ClassifierNN(input_size = input_size, hidden_size = hidden_size, num_classes = num_classes)

# Multi-class cross-entropy, optimised with AdamW at a learning rate of 0.001
criterion_under = nn.CrossEntropyLoss()
optimizer_under = optim.AdamW(params = model_under.parameters(), lr = 0.001)

In [None]:
# Convert the undersampled training data to tensors.
# np.asarray strips any pandas index first, so values are read positionally; torch.tensor on a
# pandas Series probes element 0 through label-based indexing and can fail when the index has
# no label 0. This matches how the test labels are converted (`y_test.values`).
X_tensor_under = torch.tensor(np.asarray(X_under_resampled), dtype = torch.float32)  # float features
y_tensor_under = torch.tensor(np.asarray(y_under_resampled), dtype = torch.long)  # integer class ids for CrossEntropyLoss

dataset_under = TensorDataset(X_tensor_under, y_tensor_under)  # pairs each feature row with its label
train_loader_under = DataLoader(dataset_under, batch_size = 64, shuffle = True)  # reshuffled every epoch

In [None]:
# Train for 100 epochs; every tenth epoch print the loss of that epoch's final mini-batch
for epoch in range(100):
    for inputs, labels in train_loader_under:
        optimizer_under.zero_grad()  # discard gradients left over from the previous batch
        outputs = model_under(inputs)
        loss = criterion_under(outputs, labels)
        loss.backward()
        optimizer_under.step()
    if not (epoch + 1) % 10:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')

In [None]:
# Pickle the whole trained module; assumes the 'Model_Saved' directory already exists — TODO confirm
torch.save(model_under, 'Model_Saved/PyTorch_Under.pth')

### **5.3.2 Oversample**

In [None]:
# Network, loss and optimiser for the oversampled training data
model_over = ClassifierNN(input_size = input_size, hidden_size = hidden_size, num_classes = num_classes)

# Multi-class cross-entropy, optimised with AdamW at a learning rate of 0.001
criterion_over = nn.CrossEntropyLoss()
optimizer_over = optim.AdamW(params = model_over.parameters(), lr = 0.001)

In [None]:
# Convert the oversampled training data to tensors.
# np.asarray strips any pandas index first, so values are read positionally; torch.tensor on a
# pandas Series probes element 0 through label-based indexing and can fail when the index has
# no label 0. This matches how the test labels are converted (`y_test.values`).
X_tensor_over = torch.tensor(np.asarray(X_over_resampled), dtype = torch.float32)  # float features
y_tensor_over = torch.tensor(np.asarray(y_over_resampled), dtype = torch.long)  # integer class ids for CrossEntropyLoss

dataset_over = TensorDataset(X_tensor_over, y_tensor_over)  # pairs each feature row with its label
train_loader_over = DataLoader(dataset_over, batch_size = 64, shuffle = True)  # reshuffled every epoch

In [None]:
# Train for 100 epochs; every tenth epoch print the loss of that epoch's final mini-batch
for epoch in range(100):
    for inputs, labels in train_loader_over:
        optimizer_over.zero_grad()  # discard gradients left over from the previous batch
        outputs = model_over(inputs)
        loss = criterion_over(outputs, labels)
        loss.backward()
        optimizer_over.step()
    if not (epoch + 1) % 10:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')

In [None]:
# Pickle the whole trained module; assumes the 'Model_Saved' directory already exists — TODO confirm
torch.save(model_over, 'Model_Saved/PyTorch_Over.pth')

### **5.3.3 Original**

In [None]:
# Network, loss and optimiser for the original (non-resampled) training data
model = ClassifierNN(input_size = input_size, hidden_size = hidden_size, num_classes = num_classes)

# Multi-class cross-entropy, optimised with AdamW at a learning rate of 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params = model.parameters(), lr = 0.001)

In [None]:
# Convert the original (non-resampled) training data to tensors.
# y_train keeps the shuffled row labels from train_test_split, and torch.tensor on a pandas
# Series probes element 0 through label-based indexing, which can fail when label 0 landed in
# the test split. np.asarray strips the index so values are read positionally, matching how the
# test labels are converted (`y_test.values`).
X_tensor = torch.tensor(np.asarray(X_train_scaled), dtype = torch.float32)  # float features
y_tensor = torch.tensor(np.asarray(y_train), dtype = torch.long)  # integer class ids for CrossEntropyLoss

dataset = TensorDataset(X_tensor, y_tensor)  # pairs each feature row with its label
train_loader = DataLoader(dataset, batch_size = 64, shuffle = True)  # reshuffled every epoch

In [None]:
# Train for 100 epochs; every tenth epoch print the loss of that epoch's final mini-batch
for epoch in range(100):
    for inputs, labels in train_loader:
        optimizer.zero_grad()  # discard gradients left over from the previous batch
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    if not (epoch + 1) % 10:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')

In [None]:
# Persist the network trained on the original data. This previously saved `model_over`, so
# 'PyTorch.pth' held a duplicate of the oversampled model.
# Assumes the 'Model_Saved' directory already exists — TODO confirm
torch.save(model, 'Model_Saved/PyTorch.pth')

# **6. Model Evaluation**

In [None]:
# One-hot encode the test labels for the one-vs-rest ROC curves.
# The class list comes from the training labels so that column i always refers to the same class
# as column i of the classifiers' predict_proba output, even if a class is absent from the test split.
y_test_binarized = label_binarize(y_test, classes = np.unique(y_train))

# Number of classes, i.e. number of ROC curves drawn per model
n_classes = y_test_binarized.shape[1]

## **6.1 Random Forest**

### **6.1.1 Undersample**

In [None]:
# Refit a random forest on the undersampled data using the grid-search winners
rf_under = RandomForestClassifier(
    n_estimators = clf_under.best_params_['n_estimators'],
    max_depth = clf_under.best_params_['max_depth'],
    max_features = 'sqrt',
    random_state = 2,
    n_jobs = -1,
)
rf_under.fit(X_under_resampled, y_under_resampled)

# Score it on the held-out test set
pred_y_rf_under = rf_under.predict(X_test_scaled)
rf_under_score = rf_under.score(X_test_scaled, y_test)
# NOTE(review): the genre codes are arbitrary integers, so MSE/RMSE depend on the code assignment
MSE_rf_under = mean_squared_error(y_test, pred_y_rf_under)
RMSE_rf_under = np.sqrt(MSE_rf_under)
print('rf score: ', rf_under_score)
print('Mean square error of rf: ', MSE_rf_under)
print('Root mean squared error of rf:', RMSE_rf_under)

In [None]:
# Per-class precision, recall and F1 for the undersampled random forest
print(classification_report(y_test, pred_y_rf_under))

In [None]:
# Class-membership probabilities from the undersampled random forest
y_under_scores = rf_under.predict_proba(X_test_scaled)

# One-vs-rest ROC curve and AUC for every class
fpr, tpr, roc_auc = {}, {}, {}

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_under_scores[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# One coloured curve per class
plt.figure(figsize = (7, 5))
colors = ['aqua', 'darkorange', 'cornflowerblue', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive']
for i, color in zip(range(n_classes), colors):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color = color, lw = 2, label = curve_label)

# Chance diagonal, axis limits and labelling
plt.plot([0, 1], [0, 1], 'k--', lw = 2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('RandomForest Undersampled Multi-class ROC')
plt.legend(loc = "lower right")
plt.show()

### **6.1.2 Oversample**

In [None]:
# Refit a random forest on the oversampled data using the grid-search winners
rf_over = RandomForestClassifier(
    n_estimators = clf_over.best_params_['n_estimators'],
    max_depth = clf_over.best_params_['max_depth'],
    max_features = 'sqrt',
    random_state = 2,
    n_jobs = -1,
)
rf_over.fit(X_over_resampled, y_over_resampled)

# Score it on the held-out test set
pred_y_rf_over = rf_over.predict(X_test_scaled)
rf_over_score = rf_over.score(X_test_scaled, y_test)
# NOTE(review): the genre codes are arbitrary integers, so MSE/RMSE depend on the code assignment
MSE_rf_over = mean_squared_error(y_test, pred_y_rf_over)
RMSE_rf_over = np.sqrt(MSE_rf_over)
print('rf score: ', rf_over_score)
print('Mean square error of rf: ', MSE_rf_over)
print('Root mean squared error of rf:', RMSE_rf_over)

In [None]:
# Per-class precision, recall and F1 for the oversampled random forest
print(classification_report(y_test, pred_y_rf_over))

In [None]:
# Class-membership probabilities from the oversampled random forest
y_over_scores = rf_over.predict_proba(X_test_scaled)

# One-vs-rest ROC curve and AUC for every class
fpr, tpr, roc_auc = {}, {}, {}

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_over_scores[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# One coloured curve per class
plt.figure(figsize = (7, 5))
colors = ['aqua', 'darkorange', 'cornflowerblue', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive']
for i, color in zip(range(n_classes), colors):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color = color, lw = 2, label = curve_label)

# Chance diagonal, axis limits and labelling
plt.plot([0, 1], [0, 1], 'k--', lw = 2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('RandomForest Oversampled Multi-class ROC')
plt.legend(loc = "lower right")
plt.show()

## **6.2 XGBooster**

### **6.2.1 Undersample**

In [None]:
# Test-set predictions of the undersampled XGBoost model and its per-class report
pred_y_xgb_under = xgb_clf_under.predict(X_test_scaled)

print(classification_report(y_true = y_test, y_pred = pred_y_xgb_under))

In [None]:
# Accuracy plus squared-error summaries for the undersampled XGBoost model
# NOTE(review): the genre codes are arbitrary integers, so MSE/RMSE depend on the code assignment
xgb_score_under = xgb_clf_under.score(X_test_scaled, y_test)
MSE_xgb_under = mean_squared_error(y_test, pred_y_xgb_under)
RMSE_xgb_under = np.sqrt(MSE_xgb_under)
print('xgb score: ', xgb_score_under)
print('Mean square error of xgb: ', MSE_xgb_under)
print('Root mean squared error of xgb:', RMSE_xgb_under)

In [None]:
# Class-membership probabilities from the undersampled XGBoost model
y_prob_under = xgb_clf_under.predict_proba(X_test_scaled)

# One-vs-rest ROC curve and AUC for every class
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_prob_under[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# One curve per class, coloured along the rainbow colormap
plt.figure(figsize = (7, 5))
colors = iter(plt.cm.rainbow(np.linspace(0, 1, n_classes)))
for i in range(n_classes):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color = next(colors), lw = 2, label = curve_label)

# Chance diagonal, axis limits and labelling
plt.plot([0, 1], [0, 1], 'k--', lw = 2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('XGBooster Undersampled Multi-class ROC')
plt.legend(loc = "lower right")
plt.show()

### **6.2.2 Oversample**

In [None]:
# Test-set predictions of the oversampled XGBoost model and its per-class report
pred_y_xgb_over = xgb_clf_over.predict(X_test_scaled)

print(classification_report(y_true = y_test, y_pred = pred_y_xgb_over))

In [None]:
# Accuracy plus squared-error summaries for the oversampled XGBoost model
# NOTE(review): the genre codes are arbitrary integers, so MSE/RMSE depend on the code assignment
xgb_score_over = xgb_clf_over.score(X_test_scaled, y_test)
MSE_xgb_over = mean_squared_error(y_test, pred_y_xgb_over)
RMSE_xgb_over = np.sqrt(MSE_xgb_over)
print('xgb score: ', xgb_score_over)
print('Mean square error of xgb: ', MSE_xgb_over)
print('Root mean squared error of xgb:', RMSE_xgb_over)

In [None]:
# Class-membership probabilities from the oversampled XGBoost model
y_prob_over = xgb_clf_over.predict_proba(X_test_scaled)

# One-vs-rest ROC curve and AUC for every class
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_prob_over[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# One curve per class, coloured along the rainbow colormap
plt.figure(figsize = (7, 5))
colors = iter(plt.cm.rainbow(np.linspace(0, 1, n_classes)))
for i in range(n_classes):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color = next(colors), lw = 2, label = curve_label)

# Chance diagonal, axis limits and labelling
plt.plot([0, 1], [0, 1], 'k--', lw = 2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('XGBooster Oversampled Multi-class ROC')
plt.legend(loc = "lower right")
plt.show()

## **6.3 PyTorch**

In [None]:
# Tensors and loader for the held-out test set
X_test_tensor = torch.tensor(X_test_scaled, dtype = torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype = torch.long)  # .values reads the labels positionally
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
# shuffle must stay False: the evaluation cells collect predictions batch by batch and compare them
# with y_test in its original order, so a shuffled loader would pair predictions with the wrong labels.
test_loader = DataLoader(test_dataset, batch_size = 64, shuffle = False)

### **6.3.1 Undersample**

In [None]:
# Switch to inference behaviour, then predict the highest-scoring class for every test batch
model_under.eval()
predictions_under = []
with torch.no_grad():
    for inputs, _ in test_loader:
        outputs = model_under(inputs)
        _, predicted = torch.max(outputs.data, dim = 1)  # index of the largest score per row
        predictions_under.extend(predicted.numpy())

In [None]:
# Accuracy plus squared-error summaries for the undersampled network
# NOTE(review): the genre codes are arbitrary integers, so MSE/RMSE depend on the code assignment
pytorch_score_under = accuracy_score(y_true = y_test, y_pred = predictions_under)
MSE_pytorch_under = mean_squared_error(y_true = y_test, y_pred = predictions_under)
RMSE_pytorch_under = np.sqrt(MSE_pytorch_under)
print('PyTorch score: ', pytorch_score_under)
print('Mean square error of PyTorch: ', MSE_pytorch_under)
print('Root mean squared error of PyTorch:', RMSE_pytorch_under)

In [None]:
# Collect softmax-normalised scores for the test set, in test-set order (this loader is not shuffled)
y_prob_pytorch_under = []

with torch.no_grad():
    for inputs in DataLoader(X_test_tensor, batch_size = 64):
        outputs = model_under(inputs)
        probabilities = torch.softmax(outputs, dim = 1)
        y_prob_pytorch_under.extend(probabilities.numpy())

# Stack the per-row vectors into one (samples, classes) array
y_prob_pytorch_under = np.array(y_prob_pytorch_under)

In [None]:
# Per-class precision, recall and F1 for the undersampled network
print(classification_report(y_test, predictions_under))

In [None]:
# One-vs-rest ROC curve and AUC for every class (undersampled network)
fpr, tpr, roc_auc = {}, {}, {}

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_prob_pytorch_under[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# One curve per class, coloured along the rainbow colormap
plt.figure(figsize = (7, 5))
colors = iter(plt.cm.rainbow(np.linspace(0, 1, n_classes)))
for i in range(n_classes):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color = next(colors), lw = 2, label = curve_label)

# Chance diagonal, axis limits and labelling
plt.plot([0, 1], [0, 1], 'k--', lw = 2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('PyTorch Undersampled Multi-class ROC')
plt.legend(loc = "lower right")
plt.show()

### **6.3.2 Oversample**

In [None]:
# Switch to inference behaviour, then predict the highest-scoring class for every test batch
model_over.eval()
predictions_over = []
with torch.no_grad():
    for inputs, _ in test_loader:
        outputs = model_over(inputs)
        _, predicted = torch.max(outputs.data, dim = 1)  # index of the largest score per row
        predictions_over.extend(predicted.numpy())

In [None]:
# Accuracy plus squared-error summaries for the oversampled network
# NOTE(review): the genre codes are arbitrary integers, so MSE/RMSE depend on the code assignment
pytorch_score_over = accuracy_score(y_true = y_test, y_pred = predictions_over)
MSE_pytorch_over = mean_squared_error(y_true = y_test, y_pred = predictions_over)
RMSE_pytorch_over = np.sqrt(MSE_pytorch_over)
print('PyTorch score: ', pytorch_score_over)
print('Mean square error of PyTorch: ', MSE_pytorch_over)
print('Root mean squared error of PyTorch:', RMSE_pytorch_over)

In [None]:
# Per-class precision, recall and F1 for the oversampled network
print(classification_report(y_test, predictions_over))

In [None]:
# Collect softmax-normalised scores for the test set, in test-set order (this loader is not shuffled)
model_over.eval()
y_prob_pytorch_over = []

with torch.no_grad():
    for inputs in DataLoader(X_test_tensor, batch_size = 64):
        outputs = model_over(inputs)
        probabilities = torch.softmax(outputs, dim = 1)
        y_prob_pytorch_over.extend(probabilities.numpy())

# Stack the per-row vectors into one (samples, classes) array
y_prob_pytorch_over = np.array(y_prob_pytorch_over)

In [None]:
# One-vs-rest ROC curve and AUC for every class (oversampled network)
fpr, tpr, roc_auc = {}, {}, {}

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_prob_pytorch_over[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# One curve per class, coloured along the rainbow colormap
plt.figure(figsize = (7, 5))
colors = iter(plt.cm.rainbow(np.linspace(0, 1, n_classes)))
for i in range(n_classes):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color = next(colors), lw = 2, label = curve_label)

# Chance diagonal, axis limits and labelling
plt.plot([0, 1], [0, 1], 'k--', lw = 2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('PyTorch Oversampled Multi-class ROC')
plt.legend(loc = "lower right")
plt.show()

### **6.3.3 Original**

In [None]:
# Switch to inference behaviour, then predict the highest-scoring class for every test batch
model.eval()
predictions = []
with torch.no_grad():
    for inputs, _ in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, dim = 1)  # index of the largest score per row
        predictions.extend(predicted.numpy())

In [None]:
# Accuracy plus squared-error summaries for the network trained on the original data
# NOTE(review): the genre codes are arbitrary integers, so MSE/RMSE depend on the code assignment
pytorch_score = accuracy_score(y_true = y_test, y_pred = predictions)
MSE_pytorch = mean_squared_error(y_true = y_test, y_pred = predictions)
RMSE_pytorch = np.sqrt(MSE_pytorch)
print('PyTorch score: ', pytorch_score)
print('Mean square error of PyTorch: ', MSE_pytorch)
print('Root mean squared error of PyTorch:', RMSE_pytorch)

In [None]:
# Collect softmax-normalised scores for the test set, in test-set order (this loader is not shuffled)
y_prob_pytorch = []

with torch.no_grad():
    for inputs in DataLoader(X_test_tensor, batch_size = 64):
        outputs = model(inputs)
        probabilities = torch.softmax(outputs, dim = 1)
        y_prob_pytorch.extend(probabilities.numpy())

# Stack the per-row vectors into one (samples, classes) array
y_prob_pytorch = np.array(y_prob_pytorch)

In [None]:
# One-vs-rest ROC curve and AUC for every class (network trained on the original data)
fpr, tpr, roc_auc = {}, {}, {}

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_prob_pytorch[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# One curve per class, coloured along the rainbow colormap
plt.figure(figsize = (5, 4))
colors = iter(plt.cm.rainbow(np.linspace(0, 1, n_classes)))
for i in range(n_classes):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color = next(colors), lw = 2, label = curve_label)

# Chance diagonal, axis limits and labelling
plt.plot([0, 1], [0, 1], 'k--', lw = 2)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi-class ROC for PyTorch Model')
plt.legend(loc = "lower right")
plt.show()

# **7. Feature Importance**

## **7.1 Undersample**

In [None]:
# Bar chart of the twenty highest feature importances of the undersampled random forest
importances = rf_under.feature_importances_
feature_name = X_train.columns.to_numpy()
indices = np.flip(np.argsort(importances))  # feature positions, most important first
plt.figure(1)
plt.bar(x = feature_name[indices[:20]], height = importances[indices[:20]])
plt.title('First 20 Important Features In Undersampled')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Bar chart of the twenty highest feature importances of the oversampled random forest
importances = rf_over.feature_importances_
feature_name = X_train.columns.to_numpy()
indices = np.flip(np.argsort(importances))  # feature positions, most important first
plt.figure(1)
plt.bar(x = feature_name[indices[:20]], height = importances[indices[:20]])
plt.title('First 20 Important Features In Oversampled')
plt.xticks(rotation = 90)
plt.show()

# **8. Deeper Analysis About The Classes**

## **8.1 The First Seven Classes**

In [None]:
# Rows whose genre code is one of the first seven (0-6)
first_7 = list(range(7))
first_7_df = spotify_df.loc[spotify_df['genre'].isin(first_7)]

In [None]:
# Pairwise correlations within the first seven classes
corr_7 = first_7_df.corr()
corr_7

In [None]:
# Heatmap of those correlations
sns.heatmap(data = corr_7, cmap = "YlGnBu")
plt.title(label = 'Correlation For The First Seven Classes')

## **8.2 The Last Three Classes**

In [None]:
# Rows whose genre code is one of the last three (7-9)
last_3 = list(range(7, 10))
last_3_df = spotify_df.loc[spotify_df['genre'].isin(last_3)]

In [None]:
# Pairwise correlations within the last three classes
corr_3 = last_3_df.corr()
corr_3

In [None]:
# Heatmap of those correlations
sns.heatmap(data = corr_3, cmap = "YlGnBu")
plt.title(label = 'Correlation For The Last Three Classes')

# **9. Revenue Forecast**

In [None]:
# Attach the oversampled XGBoost predictions to the unscaled test features.
# NOTE(review): this adds a column to X_test in place, so cells that pass X_test to the scaler
# (which was fitted on the original columns) should not be re-run after this one.
X_test['predict'] = pred_y_xgb_over

In [None]:
# Put the test features (with predictions) and the true genre codes side by side
test_df = pd.concat(objs = [X_test, y_test], axis = 'columns')
test_df.head()

In [None]:
# Mean streams per true genre code, as a one-column frame named 'Historical'
avg_streams_history = (
    test_df
    .groupby(by = 'genre')['streams']
    .mean()
    .to_frame(name = 'Historical')
)

In [None]:
# Mean streams per predicted genre code, as a one-column frame named 'Predicted'
avg_streams_predict = (
    test_df
    .groupby(by = 'predict')['streams']
    .mean()
    .to_frame(name = 'Predicted')
)

## **9.1 Streams Comparison**

In [None]:
# Genre names ordered by their integer code
genre_list = [genre_mapping[code] for code in sorted(genre_mapping)]

In [None]:
# Side-by-side mean streams per genre: true labels vs. predicted labels.
# The two frames are aligned on the genre code. Each code is then translated to its name through
# genre_mapping rather than by assigning a list positionally, so labels stay correct (and no
# length mismatch is raised) even if a code is absent from the test labels or the predictions.
streams_compare = pd.concat([avg_streams_history, avg_streams_predict], axis = 1).sort_index()
streams_compare.index = streams_compare.index.map(genre_mapping)
streams_compare

In [None]:
# Grouped bar chart of historical vs. predicted mean streams per genre
streams_compare.hvplot.bar(y=['Historical', 'Predicted'], stacked = False, xlabel = 'Genre', ylabel = 'Streams',
                           title = 'Historical vs Predicted Streams Comparison', rot = 45, 
                           width = 800, height = 500).opts(yformatter = '%.0f')

## **9.2 Revenue Comparison**

In [None]:
# Assumed payout per stream, used to convert stream counts into revenue
revenue_per_stream = 0.004  # As an example, using the mid-point of the range
# Empty frame indexed by the ten top genre names; columns are filled in by the next cell
revenue_compare = pd.DataFrame(index = top10_genre)

In [None]:
# Convert mean streams into revenue; values are aligned to revenue_compare by genre name
revenue_compare['Historical'] = revenue_per_stream * streams_compare['Historical']
revenue_compare['Predicted'] = revenue_per_stream * streams_compare['Predicted']

In [None]:
# Grouped bar chart of historical vs. predicted revenue per genre
revenue_compare.hvplot.bar(y=['Historical', 'Predicted'], stacked = False, xlabel = 'Genre', ylabel = 'Revenue',
                           title = 'Historical vs Predicted Revenue Comparison', rot = 45, 
                           width = 800, height = 500).opts(yformatter = '%.0f')