In [21]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import pickle

### Data preprocessing

In [51]:
data = pd.read_csv('../data/prepared_data_v3.csv')
data.drop(columns=['name_track', 'name_artist', 'release_date', 'genres'], inplace=True)

In [47]:
le_track = LabelEncoder()
le_artist = LabelEncoder()

data['id_track'] = le_track.fit_transform(data['id_track'])
data['id_artist'] = le_artist.fit_transform(data['id_artist'])

In [48]:
with open('../artifacts/preprocessing/le_track.pkl', 'wb') as f:
    pickle.dump(le_track, f)
with open('../artifacts/preprocessing/le_artist.pkl', 'wb') as f:
    pickle.dump(le_artist, f)

In [52]:
no_weeks = data['week_number'].max()
for week in range(no_weeks):
    data.loc[data['week_number'] == week, 'next_week_plays'] = data.loc[data['week_number'] == week+1, 'track_plays'].values

data['next_week_plays'] = data['next_week_plays'].fillna(0).astype(int)
data = data.drop(data[data['week_number'] == no_weeks].index)
data.drop(columns=['week_number'], inplace=True)


In [56]:
data.head()

Unnamed: 0,id_track,popularity,duration_ms,explicit,id_artist,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_popularity,track_plays,artist_plays,next_week_plays
0,0RNxWy0PC3AyH4ThH3aGK6,55,201467,0,19eLuQmk9aCobbVDHc6eek,0.673,0.377,0,-14.141,0.0697,0.586,0.0,0.332,0.713,88.973,61.090909,0,0,0
1,17gxfuiFUrLhbUKdunxUPJ,58,179867,0,19eLuQmk9aCobbVDHc6eek,0.448,0.12,0,-14.089,0.0355,0.877,0.0135,0.1,0.261,86.407,61.090909,0,0,0
2,63kd4m3VFxcJjPVVtbVNAu,53,147000,0,19eLuQmk9aCobbVDHc6eek,0.0,0.405,0,-9.935,0.0,0.842,0.00114,0.198,0.0,0.0,61.090909,0,0,0
3,1qCQTy0fTXerET4x8VHyr9,74,137520,0,19eLuQmk9aCobbVDHc6eek,0.399,0.258,5,-16.028,0.033,0.792,2e-06,0.128,0.192,108.174,61.090909,0,0,0
4,1UH4viviUjZnS9aWgPGrk0,66,204400,0,19eLuQmk9aCobbVDHc6eek,0.507,0.0779,0,-12.099,0.0544,0.866,0.00275,0.108,0.326,70.808,61.090909,0,0,0


In [50]:
features_to_normalize = ['popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 
                         'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'artist_popularity', 
                         'track_plays', 'artist_plays']

scaler = StandardScaler()

data[features_to_normalize] = scaler.fit_transform(data[features_to_normalize])

In [17]:
with open('../artifacts/preprocessing/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [18]:
y = data['next_week_plays'].values
X = data.drop('next_week_plays', axis=1).values

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [22]:
train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
val_dataset = TensorDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())
test_dataset = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())

In [28]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

### Modeling

#### Linear Regression

In [None]:
class LinearRegressionPredictor(nn.Module):
    def __init__(self, num_features: int) -> None:
        super(LinearRegressionPredictor, self).__init__()
        self.layer = nn.Linear(num_features, 1)

        nn.init.uniform_(self.layer.weight, -0.1, 0.1)
        nn.init.constant_(self.layer.bias, 0.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layer(x)

In [None]:
model = LinearRegressionPredictor(X_train.shape[1]).double()

In [None]:
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)

In [None]:
for epoch in range(10):  # loop over the dataset multiple times
    epoch_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        
        loss = criterion(outputs, labels.unsqueeze(1))
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch} loss: {epoch_loss / len(train_loader)}')

#### Neural Network

In [23]:
from torch import nn, optim

In [24]:
class NNPredictor(nn.Module):
    def __init__(self, num_features, num_hidden) -> None:
        super(NNPredictor, self).__init__()
        self.input_layer = nn.Linear(num_features, num_hidden)
        self.hidden_layer_1 = nn.Linear(num_hidden, num_hidden)
        self.hidden_layer_2 = nn.Linear(num_hidden, num_hidden)
        self.output_layer = nn.Linear(num_hidden, 1)

        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.activation(self.input_layer(x))
        x = self.activation(self.hidden_layer_1(x))
        x = self.activation(self.hidden_layer_2(x))
        return self.output_layer(x)

In [25]:
nn_model = NNPredictor(X_train.shape[1], 128).float()

In [26]:
criterion = nn.MSELoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.0001)

In [29]:
for epoch in range(15):
    epoch_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = nn_model(inputs)
        
        loss = criterion(outputs, labels.unsqueeze(1))
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch} loss: {epoch_loss / len(train_loader)}')

Epoch 0 loss: 5.511981011836573
Epoch 1 loss: 1.657210984875377
Epoch 2 loss: 1.3672918247392536
Epoch 3 loss: 1.4325095283841986
Epoch 4 loss: 0.9332508317830291
Epoch 5 loss: 0.8512723160013382
Epoch 6 loss: 0.756649395007124
Epoch 7 loss: 0.7370581926073624
Epoch 8 loss: 0.7184348494376083
Epoch 9 loss: 0.6676603660583266
Epoch 10 loss: 0.6585863918490192
Epoch 11 loss: 0.6405893315929225
Epoch 12 loss: 0.6109419554122998
Epoch 13 loss: 0.6360559521433244
Epoch 14 loss: 0.6268114371620318


In [30]:
import numpy as np
nn_model.eval()
y_pred = np.array([])
with torch.no_grad():
    for i, (inputs, labels) in enumerate(val_loader):
        outputs = nn_model(inputs)
        y_pred = np.append(y_pred, outputs.squeeze(1).numpy())


print('Test RMSE: ', mean_squared_error(y_val, y_pred, squared=False))

Test RMSE:  2.5692738876541914


In [31]:
torch.save(nn_model.state_dict(), '../artifacts/models/nn_regressor.pth')

#### XGBoost

In [32]:
import xgboost as xgb
from pathlib import Path

In [33]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, max_depth=6, learning_rate=0.01)

In [34]:
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=1)



[0]	validation_0-rmse:1.80653
[1]	validation_0-rmse:1.79054
[2]	validation_0-rmse:1.77473
[3]	validation_0-rmse:1.75909
[4]	validation_0-rmse:1.74363
[5]	validation_0-rmse:1.72834
[6]	validation_0-rmse:1.71322
[7]	validation_0-rmse:1.69827
[8]	validation_0-rmse:1.68349
[9]	validation_0-rmse:1.66887
[10]	validation_0-rmse:1.65442
[11]	validation_0-rmse:1.64013
[12]	validation_0-rmse:1.62600
[13]	validation_0-rmse:1.61203
[14]	validation_0-rmse:1.59822
[15]	validation_0-rmse:1.58457
[16]	validation_0-rmse:1.57107
[17]	validation_0-rmse:1.55772
[18]	validation_0-rmse:1.54453
[19]	validation_0-rmse:1.53149
[20]	validation_0-rmse:1.51860
[21]	validation_0-rmse:1.50585
[22]	validation_0-rmse:1.49325
[23]	validation_0-rmse:1.48080
[24]	validation_0-rmse:1.46849
[25]	validation_0-rmse:1.45632
[26]	validation_0-rmse:1.44430
[27]	validation_0-rmse:1.43241
[28]	validation_0-rmse:1.42066
[29]	validation_0-rmse:1.40905
[30]	validation_0-rmse:1.39758
[31]	validation_0-rmse:1.38624
[32]	validation_0-

In [35]:
y_pred = xgb_model.predict(X_test)

print('Test RMSE: ', mean_squared_error(y_test, y_pred, squared=False))

Test RMSE:  0.6138859501552587


In [36]:
filename = Path('../artifacts/models/xgb_regressor.model')
xgb_model.save_model(filename)

In [None]:
print(y_pred[:10])

In [38]:
# drivers license - most popular song
original_id = '7lPN2DXiMsVn7XUKtOW1CS'

encoded_id = le_track.transform([original_id])[0]

matching_rows = data[data['id_track'] == encoded_id]

In [42]:
matching_rows.head(1)

Unnamed: 0,id_track,popularity,duration_ms,explicit,id_artist,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_popularity,track_plays,artist_plays,next_week_plays
19152,21733,4.674961,0.164725,1,233,-0.089072,-1.011666,1.326645,-0.418611,-0.255432,1.641558,-0.263093,-0.52484,-1.578918,0.741596,4.811525,9.073875,1.43149,19


In [39]:
datapoint = matching_rows.iloc[0].values[:-1]

In [40]:
nn_model.eval()
with torch.no_grad():
    output = nn_model(torch.from_numpy(datapoint).float())
    print(output)

tensor([26.9507])


In [44]:
org_data = pd.read_csv('../data/prepared_data_v3.csv')

In [45]:
org_data.iloc[19152]

id_track               7lPN2DXiMsVn7XUKtOW1CS
name_track                    drivers license
popularity                                 99
duration_ms                            242014
explicit                                    1
id_artist              1McMsnEElThX1knmY4oliG
release_date                       2021-01-08
danceability                            0.585
energy                                  0.436
key                                        10
loudness                               -8.761
speechiness                            0.0601
acousticness                            0.721
instrumentalness                     0.000013
liveness                                0.105
valence                                 0.132
tempo                                 143.874
name_artist                    Olivia Rodrigo
genres               ['pop', 'post-teen pop']
artist_popularity                        82.2
track_plays                                17
artist_plays                      