In [3]:
import torch
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle
from torch import nn
from pathlib import Path

#### Load Preprocessing

In [2]:
with open('../artifacts/preprocessing/le_track.pkl', 'rb') as f:
    le_track = pickle.load(f)
with open('../artifacts/preprocessing/le_artist.pkl', 'rb') as f:
    le_artist = pickle.load(f)
with open('../artifacts/preprocessing/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)


#### Load NN Model

In [19]:
class NNPredictor(nn.Module):
    def __init__(self, num_features, num_hidden) -> None:
        super(NNPredictor, self).__init__()
        self.input_layer = nn.Linear(num_features, num_hidden)
        self.hidden_layer_1 = nn.Linear(num_hidden, num_hidden)
        self.hidden_layer_2 = nn.Linear(num_hidden, num_hidden)
        self.output_layer = nn.Linear(num_hidden, 1)

        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.activation(self.input_layer(x))
        x = self.activation(self.hidden_layer_1(x))
        x = self.activation(self.hidden_layer_2(x))
        return self.output_layer(x)

In [20]:
nn_model = NNPredictor(18, 128).float()

In [21]:
nn_model.load_state_dict(torch.load('../artifacts/models/nn_regressor.pth'))
nn_model.eval()

NNPredictor(
  (input_layer): Linear(in_features=18, out_features=128, bias=True)
  (hidden_layer_1): Linear(in_features=128, out_features=128, bias=True)
  (hidden_layer_2): Linear(in_features=128, out_features=128, bias=True)
  (output_layer): Linear(in_features=128, out_features=1, bias=True)
  (activation): ReLU()
)

#### Load XGBoost Model

In [6]:
xgb_model_filename = Path('../artifacts/models/xgb_regressor.model')
xgb_model = xgb.XGBRegressor()
xgb_model.load_model(xgb_model_filename)

#### Prepare data

##### Sample week data

In [8]:
data = pd.read_csv('../data/prepared_data_v3.csv')
no_weeks = data['week_number'].max()
for week in range(no_weeks):
    data.loc[data['week_number'] == week, 'next_week_plays'] = data.loc[data['week_number'] == week+1, 'track_plays'].values

data['next_week_plays'] = data['next_week_plays'].fillna(0).astype(int)
data = data.drop(data[data['week_number'] == no_weeks].index)

In [9]:
sample_data = data[data['week_number'] == 50].copy()

In [16]:
reference_data = data[data['week_number'] == 50].copy()

In [10]:
sample_data.drop(columns=['week_number'], inplace=True)

In [11]:
sample_data.drop(columns=['name_track', 'name_artist', 'release_date', 'genres'], inplace=True)

In [12]:
sample_data.head(2)

Unnamed: 0,id_track,popularity,duration_ms,explicit,id_artist,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_popularity,track_plays,artist_plays,next_week_plays
1120600,0RNxWy0PC3AyH4ThH3aGK6,55,201467,0,19eLuQmk9aCobbVDHc6eek,0.673,0.377,0,-14.141,0.0697,0.586,0.0,0.332,0.713,88.973,61.090909,0,0,0
1120601,17gxfuiFUrLhbUKdunxUPJ,58,179867,0,19eLuQmk9aCobbVDHc6eek,0.448,0.12,0,-14.089,0.0355,0.877,0.0135,0.1,0.261,86.407,61.090909,0,0,0


In [13]:
sample_data['id_track'] = le_track.fit_transform(sample_data['id_track'])
sample_data['id_artist'] = le_artist.fit_transform(sample_data['id_artist'])

In [14]:
features_to_normalize = ['popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 
                         'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'artist_popularity', 
                         'track_plays', 'artist_plays']

sample_data[features_to_normalize] = scaler.fit_transform(sample_data[features_to_normalize])

In [35]:
real_plays = sample_data['next_week_plays'].values
input_data = sample_data.drop('next_week_plays', axis=1).values

##### Sample datapoint

In [27]:
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    processed_data = data.copy()
    processed_data['id_track'] = le_track.transform(data['id_track'])
    processed_data['id_artist'] = le_artist.transform(data['id_artist'])
    processed_data[features_to_normalize] = scaler.transform(data[features_to_normalize])
    return processed_data

In [28]:
labels = ['id_track', 'popularity', 'duration_ms', 'explicit', 'id_artist', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'artist_popularity', 'track_plays', 'artist_plays']

In [29]:
datapoint = ['7lPN2DXiMsVn7XUKtOW1CS', 99, 242014, 1, '1McMsnEElThX1knmY4oliG', 0.585, 0.436, 10, -8.761, 0.0601, 0.721, 0.000013, 0.105, 0.132, 143.874, 82.2, 17, 54]

In [30]:
datapoint_df = pd.DataFrame([datapoint], columns=labels)

In [31]:
datapoint_df.head()

Unnamed: 0,id_track,popularity,duration_ms,explicit,id_artist,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_popularity,track_plays,artist_plays
0,7lPN2DXiMsVn7XUKtOW1CS,99,242014,1,1McMsnEElThX1knmY4oliG,0.585,0.436,10,-8.761,0.0601,0.721,1.3e-05,0.105,0.132,143.874,82.2,17,54


#### Inference

##### Torch Predictions

In [17]:
torch_input_data = torch.from_numpy(input_data).float()

In [22]:
predictions = nn_model.forward(torch_input_data).detach().numpy().flatten()

In [23]:
predicted_plays = np.ceil(predictions).astype(int)

In [24]:
indices = np.argsort(predicted_plays)[-50:]

# Sort the array in descending order
sorted_array = np.sort(predicted_plays)[-50:][::-1]

# Print the sorted array and corresponding indices
print(sorted_array)
print(indices)

[40 39 37 36 36 35 35 35 34 34 34 34 34 33 33 33 33 33 32 32 32 32 32 32
 32 32 32 32 31 31 31 31 30 30 29 29 29 29 29 28 28 28 28 28 28 28 28 28
 28 28]
[17963 18532 18510 17972 18039 16894 19870 18964 13396 14463 18995 11306
 15386 10907 14455 18994  8328 18237 14541 15911 16775 18509 17235 13685
 15021 19545 18523 18130 18660 14631 19561 17342 10744 14611 19166 14633
 16276 19141 19153 15917 19057 13860 12955 19751 18002 19558 15916 19152
 15020 18659]


In [25]:
predicted_tracks = reference_data.iloc[indices]['name_track'].values
predicted_tracks

In [32]:
input_datapoint = preprocess_data(datapoint_df)

In [33]:
single_prediction = nn_model.forward(torch.from_numpy(input_datapoint.values).float()).detach().numpy().flatten()

In [34]:
single_prediction

array([27.202284], dtype=float32)

##### XGBoost Predictions

In [38]:
predictions = xgb_model.predict(input_data)

In [39]:
predicted_plays = np.ceil(predictions).astype(int)

In [40]:
indices = np.argsort(predicted_plays)[-50:]

# Sort the array in descending order
sorted_array = np.sort(predicted_plays)[-50:][::-1]

# Print the sorted array and corresponding indices
print(sorted_array)
print(indices)

[29 28 28 28 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27
 27 27 27 27 27 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26
 26 26]
[13815 15917 18994 15911 15398 12776 15021 15020 18682 11944 13859 16584
 18964 13700 16596 19855 17973 13869 16330 19870 17222  8328 14633 14632
 14631 14611 18660 16473 15386 18659 18726 18237 18509 11305 19562 17223
 17235 19858 18523 11306 13154 10907 16441 18002 18001 10744 12955 17224
 11173 19751]


In [42]:
predicted_tracks = reference_data.iloc[indices]['name_track'].values
predicted_tracks

array(['The Nights', '34+35', 'Heather', '7 rings', 'Film out',
       'What’s Next', "Say You Won't Let Go", 'Train Wreck', 'Up',
       'Hayloft', 'Perfect', 'SICKO MODE', 'My Head & My Heart',
       'La Tóxica', 'HIGHEST IN THE ROOM', 'Astronaut In The Ocean',
       'Hope', 'Afterglow', 'Streets', 'Tapão Na Raba', 'bad guy',
       'Yellow', 'In Your Eyes', 'Blinding Lights', 'Save Your Tears',
       'The Hills', 'Before You Go', 'hot girl bummer', 'Dynamite',
       'Someone You Loved', 'ROXANNE', 'you broke me first',
       'Lucid Dreams', 'Do I Wanna Know?', 'Coração Na Cama',
       'everything i wanted', 'Therefore I Am',
       'Batom de Cereja - Ao Vivo', 'Robbery',
       "Why'd You Only Call Me When You're High?", 'Stressed Out',
       "Cupid's Chokehold / Breakfast in America", 'Electric Love',
       'Golden', 'Watermelon Sugar', 'Snowman', 'Hold On',
       "when the party's over", 'All of Me', '911'], dtype=object)