In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine, text as sql_text
from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import GradientBoostingRegressor
from crud.crud import get_yaml
import joblib

from tqdm import tqdm

In [28]:
params = get_yaml('./params/params.yaml')['connection']
engine = create_engine(f"postgresql://{params['user']}:{params['password']}@{params['host']}:{5432}/{params['database']}")
conn = engine.connect()

In [30]:
query = "SELECT * FROM spotify_charts"
df = pd.read_sql(sql=sql_text(query), con=conn)

df.head(10)

Unnamed: 0,title,rank,date,artist,region,chart,trend,streams
0,R.I.P. (feat. Rita Ora & Anitta),182,2019-05-25,Sofía Reyes,Guatemala,top200,MOVE_UP,2363.0
1,One Kiss (with Dua Lipa),183,2019-05-25,Calvin Harris,Guatemala,top200,MOVE_DOWN,2341.0
2,Cuando Nadie Ve,184,2019-05-25,Morat,Guatemala,top200,MOVE_DOWN,2341.0
3,Estamos Bien,185,2019-05-25,Bad Bunny,Guatemala,top200,NEW_ENTRY,2338.0
4,when the partys over,186,2019-05-25,Billie Eilish,Guatemala,top200,MOVE_DOWN,2337.0
5,Cómo Te Voy A Olvidar,187,2019-05-25,Los Ángeles Azules,Guatemala,top200,NEW_ENTRY,2327.0
6,Yo No Sé Mañana (salsa),188,2019-05-25,Luis Enrique,Guatemala,top200,MOVE_UP,2311.0
7,Devuélveme a mi chica,189,2019-05-25,Hombres G,Guatemala,top200,MOVE_UP,2269.0
8,On My Way,190,2019-05-25,"Alan Walker, Sabrina Carpenter, Farruko",Guatemala,top200,MOVE_DOWN,2263.0
9,Procura,191,2019-05-25,ChiChi Peralta,Guatemala,top200,MOVE_UP,2247.0


In [31]:
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['date'] = df.date.values.astype(np.int64) // 10 ** 9

In [32]:
encoder = LabelEncoder()
features = list(df.select_dtypes(['object']).columns)

for feature in features:
    df[feature] = encoder.fit_transform(df[feature])

In [33]:
scaler = StandardScaler()
nums = ["title","date","artist","trend","rank"]

df[nums] = scaler.fit_transform(df[nums])

In [34]:
df.head(10)

Unnamed: 0,title,rank,date,artist,region,chart,trend,streams
0,0.528745,1.47899,-0.175968,1.226363,23,0,0.159332,2363.0
1,0.309748,1.496367,-0.175968,-1.168234,23,0,-0.874867,2341.0
2,-1.151898,1.513744,-0.175968,0.526779,23,0,-0.874867,2341.0
3,-0.882091,1.531121,-0.175968,-1.366021,23,0,1.19353,2338.0
4,1.453651,1.548498,-0.175968,-1.296317,23,0,-0.874867,2337.0
5,-1.139806,1.565874,-0.175968,0.210566,23,0,1.19353,2327.0
6,1.352976,1.583251,-0.175968,0.237254,23,0,0.159332,2311.0
7,-1.060085,1.600628,-0.175968,-0.39324,23,0,0.159332,2269.0
8,0.303013,1.618005,-0.175968,-1.564158,23,0,-0.874867,2263.0
9,0.485497,1.635382,-0.175968,-1.103534,23,0,0.159332,2247.0


In [35]:
X = df.drop(columns=["chart", "title"])
Y = df["streams"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3003)

X_train.head(10)

Unnamed: 0,rank,date,artist,region,trend,streams
586783,0.592771,-0.123343,1.182381,14,0.159332,4307.0
12322788,-0.866884,1.400843,-1.366021,8,-0.874867,79050.0
1809882,-0.206564,0.06182,0.281938,58,2.227729,28509.0
18093706,-1.092783,-0.394266,-1.513066,0,-0.874867,92509.0
4903681,1.305222,0.482823,1.144281,62,0.159332,23928.0
2285759,1.305222,0.174867,1.698224,1,0.159332,30967.0
12186296,-1.214421,-1.306439,-0.230569,10,2.227729,7751.0
18503941,1.0967,-0.269525,1.720873,66,0.159332,220318.0
8615268,-1.475074,1.016873,-0.377351,43,0.159332,52796.0
2353028,-1.457697,0.118344,1.154377,19,2.227729,152960.0


In [36]:
model=GradientBoostingRegressor(
    loss="squared_error",
    learning_rate=0.01,
    n_estimators=100,
    max_depth=4,
    random_state=42,
    verbose=1
)

model.fit(X_train, Y_train)

      Iter       Train Loss   Remaining Time 
         1 43127207423.6072           55.50m
         2 42279835592.2246           54.67m
         3 41449380227.1125           53.75m
         4 40635121292.9125           53.18m
         5 39836932579.5428           53.66m
         6 39054775630.4309           54.51m
         7 38287652246.7020           54.26m
         8 37535653096.4965           54.32m
         9 36798836117.4734           53.74m
        10 36076054405.8011           52.99m
        20 29586680864.7220           47.30m
        30 24264997175.1475           40.83m
        40 19900176852.6346           34.75m
        50 16321643543.5536           28.70m
        60 13387217677.2429           22.70m
        70 10980997115.2730           16.85m
        80  9008311078.7499           11.17m
        90  7390531040.9216            5.56m
       100  6063984119.8707            0.00s


In [37]:
model.score(X_test, Y_test)

0.8629703330995391

In [38]:
joblib.dump(model, './model/model.pkl')
loaded_model = joblib.load("./model/model.pkl")

print(f'R^2 score: {r2_score(Y_test, loaded_model.predict(X_test)):.6f}')

R^2 score: 0.862970
