In [1]:
import pandas as pd
import numpy as np
import re
import time

import bs4 as bs4
import json

import glob
import tqdm

pd.set_option("max.columns", 131)

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

from lightgbm import LGBMClassifier

#https://strftime.org/
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv("df_model3.csv", index_col=0).dropna(subset=["y"])
df.head()

Unnamed: 0,title,upload_date,view_count,tempo_desde_pub,y
318,Mindset /w Killa Keim (From DownTown) | BBOY.O...,2021-03-11,4728,201.0,0.0
319,BBOY ZOOPREME | MUSIC IN HIS SOUL 🎶,2021-09-22,1356,6.0,1.0
323,BBOY ZOOTY ZOOT | AMAZING FLOW 🌊,2021-09-14,2902,14.0,1.0
326,Bboy music 2021- run it,2021-10-02,43,-4.0,0.0
327,Red Bull BC One B-Boy Cypher Japan 2021 | LIVE...,2021-09-26,50041,2.0,1.0


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
df_limpo = pd.DataFrame(index=df.index)
df_limpo['title'] = df['title']


## 1. Limpeza da data

In [5]:

df_limpo['date'] = pd.to_datetime(df['upload_date'])

## 2. Limpeza de Views

In [6]:
df_limpo['views'] = df['view_count']

## 3. Features

In [7]:
features = pd.DataFrame(index=df_limpo.index)
y = df['y'].copy()

In [8]:
features['tempo_desde_pub'] = (pd.to_datetime("2019-12-03") - df_limpo['date']) / np.timedelta64(1, 'D')
features['views'] = df_limpo['views']
features['views_por_dia'] = features['views'] / features['tempo_desde_pub']
features = features.drop(['tempo_desde_pub'], axis=1)

In [9]:
features.head()

Unnamed: 0,views,views_por_dia
318,4728,-10.189655
319,1356,-2.057663
323,2902,-4.457757
326,43,-0.064275
327,50041,-75.476621


In [10]:
base = "2021-09-27"
mask_train = (df_limpo['date'] < base)

mask_val = (df_limpo['date'] >= base)

Xtrain, Xval = features[mask_train], features[mask_val]
ytrain, yval = y[mask_train], y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((447, 2), (102, 2), (447,), (102,))

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

title_train = df_limpo[mask_train]['title']
title_val = df_limpo[mask_val]['title']

title_vec = TfidfVectorizer(min_df=2, ngram_range=(1,3))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)


In [12]:
title_bow_train.shape

(447, 961)

In [13]:
from scipy.sparse import hstack, vstack

Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

In [14]:
Xtrain_wtitle.shape, Xval_wtitle.shape

((447, 963), (102, 963))

# 4 RF

In [15]:
mdl_rf = RandomForestClassifier(n_estimators=1000, random_state=0, min_samples_leaf=1, class_weight="balanced", n_jobs=6)
mdl_rf.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [16]:
p_rf = mdl_rf.predict_proba(Xval_wtitle)[:, 1]

In [17]:
average_precision_score(yval, p_rf), roc_auc_score(yval, p_rf)

(0.3849717481537236, 0.7400568181818182)

# 5 LGBM

In [18]:
params = [0.0040495256441752305,
 6,
 2,
 0.9181734790929957,
 0.44546924957870876,
 931,
 2,
 3]

lr = params[0]
max_depth = params[1]
min_child_samples = params[2]
subsample = params[3]
colsample_bytree = params[4]
n_estimators = params[5]

min_df = params[6]
ngram_range = (1, params[7])

title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

mdl_lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, 
                     min_child_samples=min_child_samples, subsample=subsample,
                     colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0, 
                     class_weight="balanced", n_jobs=6)
mdl_lgbm.fit(Xtrain_wtitle, ytrain)

p_lgbm = mdl_lgbm.predict_proba(Xval_wtitle)[:, 1]






In [19]:
average_precision_score(yval, p_lgbm), roc_auc_score(yval, p_lgbm)

(0.3525916501169773, 0.7482954545454545)

# 7 Logistic Reg

In [20]:
from sklearn.pipeline import make_pipeline

In [21]:
Xtrain_wtitle2 = csr_matrix(Xtrain_wtitle.copy())
Xval_wtitle2 = csr_matrix(Xval_wtitle.copy())

#scaler = StandardScaler()
#scaler = MaxAbsScaler()

#Xtrain_wtitle2[:, :2] = scaler.fit_transform(Xtrain_wtitle2[:, :2].todense())
#Xval_wtitle2[:, :2] = scaler.transform(Xval_wtitle2[:, :2].todense())
#Xtrain_wtitle2 = scaler.fit_transform(Xtrain_wtitle2)
#Xval_wtitle2 = scaler.transform(Xval_wtitle2)

lr_pipeline = make_pipeline(MaxAbsScaler(), LogisticRegression(C=0.5, penalty='l2',n_jobs=6, random_state=0))
lr_pipeline.fit(Xtrain_wtitle2, ytrain)

Pipeline(steps=[('maxabsscaler', MaxAbsScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.5, n_jobs=6, random_state=0))])

In [22]:
p_lr = lr_pipeline.predict_proba(Xval_wtitle2)[:, 1]

In [23]:
average_precision_score(yval, p_lr), roc_auc_score(yval, p_lr)

(0.45126792035074414, 0.7338068181818181)

# 8 Ensemble

(0.22228951304206077, 0.6914990859232175) RF  
(0.23779186526938, 0.6883293035324645) LGBM  
(0.2124987281512838, 0.6808987438815827) LR  

(0.247808743128664, 0.6717874624049065) LGBM ngram 1,3

In [24]:
p = (p_lr + p_rf + p_lgbm)/3
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.37496533286523115, 0.7525568181818182)

In [25]:
pd.DataFrame({"LR": p_lr, "RF": p_rf, "LGBM": p_lgbm}).corr()

Unnamed: 0,LR,RF,LGBM
LR,1.0,0.891282,0.856458
RF,0.891282,1.0,0.930603
LGBM,0.856458,0.930603,1.0


In [26]:
weights=[0.2,0.3,0.5,0.6,0.7,0.8]

for weight in weights:
    print(weight,end=' ')
    p = weight*p_rf + (1-weight)*p_lgbm
    
    print(average_precision_score(yval, p), roc_auc_score(yval, p))

0.2 0.34716893755505385 0.7460227272727273
0.3 0.35479812408534467 0.7488636363636363
0.5 0.3532281640548225 0.74375
0.6 0.3540943565829663 0.74375
0.7 0.35944819482143664 0.7471590909090908
0.8 0.37674454058754936 0.7477272727272728


# 9 Salvar modelos

In [27]:
import joblib as jb

In [29]:
import sklearn

sklearn.__version__

'0.24.2'

In [28]:
jb.dump(mdl_lgbm, "lgbm_20200208.pkl.z")
jb.dump(mdl_rf, "random_forest_20200208.pkl.z")
#jb.dump(lr_pipeline, "logistic_reg_20200208.pkl.z")
jb.dump(title_vec, "title_vectorizer_20200208.pkl.z")

['title_vectorizer_20200208.pkl.z']