In [97]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import pickle

### Data preprocessing

In [34]:
data = pd.read_csv('../data/prepared_data_v3.csv')
data.drop(columns=['name_track', 'name_artist', 'release_date', 'genres'], inplace=True)

In [35]:
le_track = LabelEncoder()
le_artist = LabelEncoder()

data['id_track'] = le_track.fit_transform(data['id_track'])
data['id_artist'] = le_artist.fit_transform(data['id_artist'])

In [40]:
with open('../artifacts/preprocessing/le_track.pkl', 'wb') as f:
    pickle.dump(le_track, f)
with open('../artifacts/preprocessing/le_artist.pkl', 'wb') as f:
    pickle.dump(le_artist, f)

# load with
# with open('../artifacts/preprocessing/le_track.pkl', 'rb') as f:
#     le_track = pickle.load(f)
# with open('../artifacts/preprocessing/le_artist.pkl', 'rb') as f:
#     le_artist = pickle.load(f)

In [36]:
no_weeks = data['week_number'].max()
for week in range(no_weeks):
    #data[data['week_number'] == week]['next_week_plays'] = data[data['week_number'] == week+1]['track_plays']
    data.loc[data['week_number'] == week, 'next_week_plays'] = data.loc[data['week_number'] == week+1, 'track_plays'].values

data['next_week_plays'] = data['next_week_plays'].fillna(0).astype(int)
data.drop(columns=['week_number'], inplace=True)


In [37]:
data = data.drop(data[data['week_number'] == no_weeks].index)

In [38]:
features_to_normalize = ['popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 
                         'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'artist_popularity', 
                         'track_plays', 'artist_plays']

scaler = StandardScaler()

data[features_to_normalize] = scaler.fit_transform(data[features_to_normalize])

In [41]:
with open('../artifacts/preprocessing/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [42]:
y = data['next_week_plays'].values
X = data.drop('next_week_plays', axis=1).values

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [44]:
train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
val_dataset = TensorDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())
test_dataset = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())

In [114]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

### Modeling

#### Linear Regression

In [None]:
class LinearRegressionPredictor(nn.Module):
    def __init__(self, num_features: int) -> None:
        super(LinearRegressionPredictor, self).__init__()
        self.layer = nn.Linear(num_features, 1)

        nn.init.uniform_(self.layer.weight, -0.1, 0.1)
        nn.init.constant_(self.layer.bias, 0.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layer(x)

In [None]:
model = LinearRegressionPredictor(X_train.shape[1]).double()

In [None]:
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)

In [None]:
for epoch in range(10):  # loop over the dataset multiple times
    epoch_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        
        loss = criterion(outputs, labels.unsqueeze(1))
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch} loss: {epoch_loss / len(train_loader)}')

#### Neural Network

In [85]:
class NNPredictor(nn.Module):
    def __init__(self, num_features, num_hidden) -> None:
        super(NNPredictor, self).__init__()
        self.input_layer = nn.Linear(num_features, num_hidden)
        self.hidden_layer_1 = nn.Linear(num_hidden, num_hidden)
        self.hidden_layer_2 = nn.Linear(num_hidden, num_hidden)
        self.output_layer = nn.Linear(num_hidden, 1)

        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.activation(self.input_layer(x))
        x = self.activation(self.hidden_layer_1(x))
        x = self.activation(self.hidden_layer_2(x))
        return self.output_layer(x)

In [86]:
nn_model = NNPredictor(X_train.shape[1], 128).float()

In [87]:
criterion = nn.MSELoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.0001)

In [88]:
for epoch in range(10):
    epoch_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = nn_model(inputs)
        
        loss = criterion(outputs, labels.unsqueeze(1))
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch} loss: {epoch_loss / len(train_loader)}')

Epoch 0 loss: 3.897562338407429
Epoch 1 loss: 1.7935495885485637
Epoch 2 loss: 1.290972146829469
Epoch 3 loss: 1.1802575443820975
Epoch 4 loss: 0.9421254186614527
Epoch 5 loss: 0.92620422007754
Epoch 6 loss: 0.8395765460225116
Epoch 7 loss: 0.7647431578008171
Epoch 8 loss: 0.7086922119989825
Epoch 9 loss: 0.7086464180387523


In [115]:
import numpy as np
nn_model.eval()
y_pred = np.array([])
with torch.no_grad():
    for i, (inputs, labels) in enumerate(val_loader):
        outputs = nn_model(inputs)
        y_pred = np.append(y_pred, outputs.squeeze(1).numpy())


print('Test RMSE: ', mean_squared_error(y_val, y_pred, squared=False))

Test RMSE:  2.540753063680413


In [117]:
torch.save(nn_model.state_dict(), '../artifacts/models/nn_regressor.pth')

#### XGBoost

In [90]:
import xgboost as xgb
from pathlib import Path

In [52]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, max_depth=6, learning_rate=0.01)

In [53]:
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=1)



[0]	validation_0-rmse:1.80631
[1]	validation_0-rmse:1.79009
[2]	validation_0-rmse:1.77406
[3]	validation_0-rmse:1.75820
[4]	validation_0-rmse:1.74252
[5]	validation_0-rmse:1.72702
[6]	validation_0-rmse:1.71169
[7]	validation_0-rmse:1.69653
[8]	validation_0-rmse:1.68154
[9]	validation_0-rmse:1.66671
[10]	validation_0-rmse:1.65205
[11]	validation_0-rmse:1.63756
[12]	validation_0-rmse:1.62323
[13]	validation_0-rmse:1.60906
[14]	validation_0-rmse:1.59505
[15]	validation_0-rmse:1.58121
[16]	validation_0-rmse:1.56752
[17]	validation_0-rmse:1.55399
[18]	validation_0-rmse:1.54061
[19]	validation_0-rmse:1.52739
[20]	validation_0-rmse:1.51431
[21]	validation_0-rmse:1.50138
[22]	validation_0-rmse:1.48860
[23]	validation_0-rmse:1.47597
[24]	validation_0-rmse:1.46348
[25]	validation_0-rmse:1.45114
[26]	validation_0-rmse:1.43893
[27]	validation_0-rmse:1.42690
[28]	validation_0-rmse:1.41500
[29]	validation_0-rmse:1.40324
[30]	validation_0-rmse:1.39162
[31]	validation_0-rmse:1.38014
[32]	validation_0-

In [57]:
y_pred = xgb_model.predict(X_test)

print('Test RMSE: ', mean_squared_error(y_test, y_pred, squared=False))

Test RMSE:  0.5794197154325186


In [92]:
filename = Path('../artifacts/models/xgb_regressor.model')
xgb_model.save_model(filename)

In [58]:
print(y_pred[:10])

[0.00739053 0.6772501  0.5968689  0.10253602 0.00555139 0.00811028
 0.00684761 0.7391458  0.00846094 0.00596823]


In [74]:
# drivers license - most popular song
original_id = '7lPN2DXiMsVn7XUKtOW1CS'

encoded_id = le_track.transform([original_id])[0]

matching_rows = data[data['id_track'] == encoded_id]

In [95]:
datapoint = matching_rows.iloc[0].values[:-1]

In [96]:
nn_model.eval()
with torch.no_grad():
    output = nn_model(torch.from_numpy(datapoint).float())
    print(output)

tensor([24.0818])
