In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, validation_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sqlalchemy import create_engine, text as sql_text
from sqlalchemy.engine import URL
from tqdm import tqdm

from crud.crud import get_yaml

In [2]:
# Build the PostgreSQL connection from the `connection` section of the YAML config.
# URL.create() takes every credential as a separate field, so characters such as
# "@", ":" or "/" in the user name or password cannot corrupt the URL the way
# they do when interpolated into a "postgresql://user:password@host/db" string.
params = get_yaml('./params/params.yaml')['connection']
db_url = URL.create(
    drivername="postgresql",
    username=str(params['user']),
    password=str(params['password']),
    host=str(params['host']),
    port=5432,  # fixed port, not read from the config
    database=str(params['database']),
)
engine = create_engine(db_url)
# NOTE(review): this connection is shared with the following cells and is not
# closed anywhere in the visible part of the notebook.
conn = engine.connect()

In [3]:
# Read the complete spotify_charts table into a DataFrame through the open
# connection, then preview the first ten rows.
query = "SELECT * FROM spotify_charts"
statement = sql_text(query)
df = pd.read_sql(con=conn, sql=statement)

df.head(n=10)

Unnamed: 0,title,rank,date,artist,region,chart,trend,streams
0,ME! (feat. Brendon Urie of Panic! At The Disco),18,2019-08-27,Taylor Swift,Taiwan,top200,MOVE_UP,21593.0
1,绿色,19,2019-08-27,Shirley Chen,Taiwan,top200,MOVE_UP,21306.0
2,最後一次,20,2019-08-27,高爾宣 OSN,Taiwan,top200,MOVE_UP,19891.0
3,Goodbyes (Feat. Young Thug),21,2019-08-27,Post Malone,Taiwan,top200,MOVE_DOWN,19857.0
4,路過人間 (電視劇《我們與惡的距離》插曲),22,2019-08-27,Yisa Yu,Taiwan,top200,MOVE_UP,19670.0
5,你的酒馆对我打了烊,23,2019-08-27,Shirley Chen,Taiwan,top200,MOVE_UP,18581.0
6,I.F.L.Y.,24,2019-08-27,Bazzi,Taiwan,top200,SAME_POSITION,18175.0
7,i GO,25,2019-08-27,NICKTHEREAL,Taiwan,top200,MOVE_UP,17847.0
8,I Dont Care (with Justin Bieber),26,2019-08-27,Ed Sheeran,Taiwan,top200,MOVE_UP,17790.0
9,Cruel Summer,27,2019-08-27,Taylor Swift,Taiwan,top200,MOVE_DOWN,17785.0


In [4]:
# Turn the chart date into whole seconds since the Unix epoch so the model can
# consume it as a plain number.
# Subtracting the epoch and floor-dividing by one second yields the same value
# whatever resolution the parsed datetimes are stored in; casting the raw values
# to int64 and dividing by 10**9 is only correct for nanosecond storage.
# NOTE: this overwrites df['date'], so the cell is meant to run once per load.
chart_dates = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['date'] = (chart_dates - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')

In [5]:
# Replace every object-dtype column with integer labels.
# Each column gets its own LabelEncoder, stored in `encoders` under the column
# name. Re-fitting one shared encoder would discard every mapping but the last,
# making it impossible to decode labels or to encode new rows the same way.
encoder = LabelEncoder()  # stays defined even when there is nothing to encode
features = list(df.select_dtypes(['object']).columns)
encoders = {}

for feature in features:
    encoder = LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])
    encoders[feature] = encoder

In [6]:
# Standardise the listed columns in place.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split that happens in a later cell.
nums = ["title", "date", "artist", "trend", "rank"]
scaler = StandardScaler()

scaled_values = scaler.fit_transform(df[nums])
df[nums] = scaled_values

In [7]:
# Preview the first ten rows after encoding and scaling.
df.head(n=10)

Unnamed: 0,title,rank,date,artist,region,chart,trend,streams
0,-0.075072,-1.370813,0.007246,1.346633,60,0,0.159332,21593.0
1,1.821338,-1.353436,0.007246,1.162717,60,0,0.159332,21306.0
2,1.784352,-1.336059,0.007246,1.837193,60,0,0.159332,19891.0
3,-0.681055,-1.318682,0.007246,0.845889,60,0,-0.874867,19857.0
4,1.836463,-1.301305,0.007246,1.651609,60,0,0.159332,19670.0
5,1.701204,-1.283928,0.007246,1.162717,60,0,0.159332,18581.0
6,-0.500501,-1.266551,0.007246,-1.341001,60,0,2.227729,18175.0
7,1.418911,-1.249175,0.007246,0.575151,60,0,0.159332,17847.0
8,-0.52181,-1.231798,0.007246,-0.791798,60,0,0.159332,17790.0
9,-1.156428,-1.214421,0.007246,1.346633,60,0,-0.874867,17785.0


In [8]:
# Features are every column except the target ("streams") and the two columns
# excluded from modelling ("chart", "title"); the target is the stream count.
Y = df["streams"]
X = df.drop(columns=["chart", "title", "streams"])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3003,
)

X_train.head(n=10)

Unnamed: 0,rank,date,artist,region,trend
586783,1.235714,0.284016,0.210918,10,0.159332
12322788,0.279988,-1.039414,-0.144888,7,0.159332
1809882,1.600628,0.246983,1.477964,31,-0.874867
18093706,-0.936391,-0.138936,1.618776,64,0.159332
4903681,-0.432463,0.921368,0.160176,35,-0.874867
2285759,-0.154433,0.476976,-0.907152,29,-0.874867
12186296,0.644902,-1.043312,-0.794695,25,0.159332
18503941,0.471133,-0.164274,-0.309754,65,-0.874867
8615268,-0.067549,1.315083,-1.43827,40,0.159332
2353028,-0.310825,0.422401,-1.181665,12,0.159332


In [9]:
# Gradient-boosted regression trees: 100 stages of depth-4 trees with a 0.01
# learning rate, minimising squared error. verbose=1 prints the per-iteration
# training loss and remaining-time estimate while fitting.
gbr_settings = {
    "loss": "squared_error",
    "learning_rate": 0.01,
    "n_estimators": 100,
    "max_depth": 4,
    "random_state": 42,
    "verbose": 1,
}
model = GradientBoostingRegressor(**gbr_settings)

model.fit(X_train, Y_train)

      Iter       Train Loss   Remaining Time 
         1 43210840276.9824           43.16m
         2 42464712252.5725           42.91m
         3 41733775887.2636           42.87m
         4 41016933537.9095           42.95m
         5 40314613416.8638           42.35m
         6 39625203127.0184           41.77m
         7 38950240022.3574           41.10m
         8 38287169225.5720           40.49m
         9 37630549030.6244           40.05m
        10 36993138898.2367           39.52m
        20 31186892825.2647           34.86m
        30 26341248684.7752           30.05m
        40 22351834403.2668           25.51m
        50 19065271835.5012           21.23m
        60 16348122345.4145           16.96m
        70 14108537493.4934           12.69m
        80 12255249938.5103            8.44m
        90 10727561311.2768            4.22m
       100  9474352049.3843            0.00s


In [10]:
# R^2 on the held-out split (the same metric a regressor's .score() reports).
r2_score(Y_test, model.predict(X_test))

0.7824487254091943

In [12]:
# Persist the fitted model, then reload it and re-score the held-out split to
# confirm the round trip reproduces the same R^2.
# A single Path is used for both dump and load, and its parent directory is
# created first so the dump does not fail when ./model/ does not exist yet.
model_path = Path('./model/model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model, model_path)
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
loaded_model = joblib.load(model_path)

print(f'R^2 score: {r2_score(Y_test, loaded_model.predict(X_test)):.6f}')

R^2 score: 0.782449
