In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import pickle
import xgboost as xgb
from pathlib import Path

### Data preprocessing

In [2]:
data = pd.read_csv('../data/prepared_data_v3.csv')
data.drop(columns=['name_track', 'name_artist', 'release_date', 'genres'], inplace=True)

In [3]:
le_track = LabelEncoder()
le_artist = LabelEncoder()

data['id_track'] = le_track.fit_transform(data['id_track'])
data['id_artist'] = le_artist.fit_transform(data['id_artist'])

In [4]:
with open('../artifacts/preprocessing/le_track.pkl', 'wb') as f:
    pickle.dump(le_track, f)
with open('../artifacts/preprocessing/le_artist.pkl', 'wb') as f:
    pickle.dump(le_artist, f)

In [5]:
no_weeks = data['week_number'].max()
for week in range(no_weeks):
    data.loc[data['week_number'] == week, 'next_week_plays'] = data.loc[data['week_number'] == week+1, 'track_plays'].values

data['next_week_plays'] = data['next_week_plays'].fillna(0).astype(int)
data = data.drop(data[data['week_number'] == no_weeks].index)
data.drop(columns=['week_number'], inplace=True)


In [6]:
data.head()

Unnamed: 0,id_track,popularity,duration_ms,explicit,id_artist,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_popularity,track_plays,artist_plays,next_week_plays
0,1262,55,201467,0,200,0.673,0.377,0,-14.141,0.0697,0.586,0.0,0.332,0.713,88.973,61.090909,0,0,0
1,3316,58,179867,0,200,0.448,0.12,0,-14.089,0.0355,0.877,0.0135,0.1,0.261,86.407,61.090909,0,0,0
2,17586,53,147000,0,200,0.0,0.405,0,-9.935,0.0,0.842,0.00114,0.198,0.0,0.0,61.090909,0,0,0
3,5396,74,137520,0,200,0.399,0.258,5,-16.028,0.033,0.792,2e-06,0.128,0.192,108.174,61.090909,0,0,0
4,4393,66,204400,0,200,0.507,0.0779,0,-12.099,0.0544,0.866,0.00275,0.108,0.326,70.808,61.090909,0,0,0


In [7]:
features_to_normalize = ['popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 
                         'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'artist_popularity', 
                         'track_plays', 'artist_plays']

scaler = StandardScaler()

data[features_to_normalize] = scaler.fit_transform(data[features_to_normalize])

In [8]:
with open('../artifacts/preprocessing/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [9]:
y = data['next_week_plays'].values
X = data.drop('next_week_plays', axis=1).values

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [11]:
train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
val_dataset = TensorDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())
test_dataset = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())

In [12]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

### Modeling

#### Linear Regression

In [17]:
class LinearRegressionPredictor(nn.Module):
    def __init__(self, num_features: int) -> None:
        super(LinearRegressionPredictor, self).__init__()
        self.layer = nn.Linear(num_features, 1)

        nn.init.uniform_(self.layer.weight, -0.1, 0.1)
        nn.init.constant_(self.layer.bias, 0.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layer(x)

In [18]:
model = LinearRegressionPredictor(X_train.shape[1])

In [19]:
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)

In [20]:
for epoch in range(10):
    epoch_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        
        optimizer.zero_grad()

        outputs = model(inputs)
        
        loss = criterion(outputs, labels.unsqueeze(1))
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch} loss: {epoch_loss / len(train_loader)}')

Epoch 0 loss: nan
Epoch 1 loss: nan
Epoch 2 loss: nan
Epoch 3 loss: nan
Epoch 4 loss: nan
Epoch 5 loss: nan
Epoch 6 loss: nan
Epoch 7 loss: nan
Epoch 8 loss: nan
Epoch 9 loss: nan


#### Neural Network

In [22]:
class NNPredictor(nn.Module):
    def __init__(self, num_features, num_hidden) -> None:
        super(NNPredictor, self).__init__()
        self.input_layer = nn.Linear(num_features, num_hidden)
        self.hidden_layer_1 = nn.Linear(num_hidden, num_hidden)
        self.hidden_layer_2 = nn.Linear(num_hidden, num_hidden)
        self.output_layer = nn.Linear(num_hidden, 1)

        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.activation(self.input_layer(x))
        x = self.activation(self.hidden_layer_1(x))
        x = self.activation(self.hidden_layer_2(x))
        return self.output_layer(x)

In [23]:
nn_model = NNPredictor(X_train.shape[1], 128).float()

In [24]:
criterion = nn.MSELoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.0001)

In [25]:
for epoch in range(15):
    epoch_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        
        optimizer.zero_grad()

        outputs = nn_model(inputs)
        
        loss = criterion(outputs, labels.unsqueeze(1))
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch} loss: {epoch_loss / len(train_loader)}')

Epoch 0 loss: 3.9870061107341686
Epoch 1 loss: 1.5895029223942008
Epoch 2 loss: 1.0789377526425072
Epoch 3 loss: 1.2232236589304983
Epoch 4 loss: 0.9372224525354215
Epoch 5 loss: 0.834023672474661
Epoch 6 loss: 0.7608101785094771
Epoch 7 loss: 0.7696647046671796
Epoch 8 loss: 0.6910719545250038
Epoch 9 loss: 0.6803542912090125
Epoch 10 loss: 0.6343899124637077
Epoch 11 loss: 0.658352235959992
Epoch 12 loss: 0.6300027976520965
Epoch 13 loss: 0.6160819699892356
Epoch 14 loss: 0.6379005396627971


In [32]:
import numpy as np
nn_model.eval()
y_pred = np.array([])
with torch.no_grad():
    for i, (inputs, labels) in enumerate(val_loader):
        outputs = nn_model(inputs)
        y_pred = np.append(y_pred, outputs.squeeze(1).numpy())


print('Validation RMSE: ', mean_squared_error(y_val, y_pred, squared=False))

Validation RMSE:  2.378784665023856


In [27]:
torch.save(nn_model.state_dict(), '../artifacts/models/nn_regressor.pth')

#### XGBoost

In [29]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, max_depth=8, learning_rate=0.01)

In [30]:
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=1)



[0]	validation_0-rmse:1.80652
[1]	validation_0-rmse:1.79051
[2]	validation_0-rmse:1.77469
[3]	validation_0-rmse:1.75903
[4]	validation_0-rmse:1.74355
[5]	validation_0-rmse:1.72825
[6]	validation_0-rmse:1.71311
[7]	validation_0-rmse:1.69815
[8]	validation_0-rmse:1.68335
[9]	validation_0-rmse:1.66872
[10]	validation_0-rmse:1.65425
[11]	validation_0-rmse:1.63994
[12]	validation_0-rmse:1.62580
[13]	validation_0-rmse:1.61181
[14]	validation_0-rmse:1.59799
[15]	validation_0-rmse:1.58431
[16]	validation_0-rmse:1.57080
[17]	validation_0-rmse:1.55744
[18]	validation_0-rmse:1.54423
[19]	validation_0-rmse:1.53117
[20]	validation_0-rmse:1.51826
[21]	validation_0-rmse:1.50550
[22]	validation_0-rmse:1.49289
[23]	validation_0-rmse:1.48042
[24]	validation_0-rmse:1.46810
[25]	validation_0-rmse:1.45591
[26]	validation_0-rmse:1.44388
[27]	validation_0-rmse:1.43198
[28]	validation_0-rmse:1.42022
[29]	validation_0-rmse:1.40859
[30]	validation_0-rmse:1.39710
[31]	validation_0-rmse:1.38575
[32]	validation_0-

In [36]:
y_pred = xgb_model.predict(X_val)

print('Validation RMSE: ', mean_squared_error(y_val, y_pred, squared=False))

Validation RMSE:  0.5968891546239947


In [34]:
filename = Path('../artifacts/models/xgb_regressor.model')
xgb_model.save_model(filename)

### Model evaluation

In [35]:
nn_model.eval()
y_pred = np.array([])
with torch.no_grad():
    for i, (inputs, labels) in enumerate(test_loader):
        outputs = nn_model(inputs)
        y_pred = np.append(y_pred, outputs.squeeze(1).numpy())


print('Neural Network Test RMSE: ', mean_squared_error(y_test, y_pred, squared=False))

Neural Network Test RMSE:  2.4448367441098537


In [37]:
y_pred = xgb_model.predict(X_test)

print('XGBoostRegressor Validation RMSE: ', mean_squared_error(y_test, y_pred, squared=False))

XGBoostRegressor Validation RMSE:  0.6175303550758209
