In [2]:
import pandas as pd
import requests as rq
import numpy as np
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup

In [None]:
# Render every column of a DataFrame: None removes the column display limit.
pd.options.display.max_columns = None

In [6]:
# Load the labelled feature table (first CSV column is the row index), keep
# only the rows that have an "interested" label, and renumber them from 0.
df = (
    pd.read_csv("features_selecionadas(com labels-full).csv", index_col=0)
    .dropna(subset=["interested"])
    .reset_index(drop=True)
)
df.shape

(1650, 16)

In [29]:
# Count rows that exactly repeat an earlier row (all columns are compared).
df.duplicated().sum()

31

In [33]:
# De-duplicate on the video title alone, keeping the first occurrence; rows
# that share a title but differ in other columns are dropped as well.
unique_titles = df.drop_duplicates(subset=["watch-title"])
df = unique_titles.reset_index(drop=True)
df.shape

(1573, 16)

In [34]:
df.head(2)

Unnamed: 0,watch-title,interested,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:type,channel_link_0
0,O que é Data Science? #HipstersPontoTube,1.0,63.761 visualizações,Publicado em 3 de abr. de 2019,Ciência e tecnologia,O que é Data Science? #HipstersPontoTube,Alura Cursos Online\n\n\n\n\n\n\n\n\n\n\n\n\n\...,63.761 visualizações\n\n\n\n\n\n\n\n6.837\n\nG...,https://i.ytimg.com/vi/5b9Z8toVaAU/maxresdefau...,1280,720,"Python é o novo Excel? Para que serve o R, Pan...",1280.0,720.0,text/html,/channel/UCo7EHzKF2zDFWszw7Dg4mPw
1,Data Science: Introdução a Ciência de Dados (P...,1.0,85.461 visualizações,Publicado em 23 de set. de 2019,Ciência e tecnologia,Data Science: Introdução a Ciência de Dados (P...,Filipe Deschamps\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,85.461 visualizações\n\n\n\n\n\n\n\n6.604\n\nG...,https://i.ytimg.com/vi/F608hzn_ygo/maxresdefau...,1280,720,✅ALURA COM 10% DE DESCONTO: https://www.alura....,1280.0,720.0,text/html,/channel/UCU5JicSrEM5A63jkJ2QvGYw


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Empty frame that will collect the cleaned columns; it shares df's row index.
df_clean = pd.DataFrame(index=df.index)

In [37]:
df["watch-title"].head(3)

0             O que é Data Science? #HipstersPontoTube
1    Data Science: Introdução a Ciência de Dados (P...
2                  Como é ser data scientist no Nubank
Name: watch-title, dtype: object

In [38]:
# The title is carried over unchanged.
df_clean["title"] = df["watch-title"]

## Limpeza de Datas

In [39]:
# Pull (day, month abbreviation, year) out of text such as
# "Publicado em 3 de abr. de 2019". Rows that do not match yield NaN in all
# three columns and end up as NaT in the final date.
date_clean = df["watch-time-text"].str.extract(r"(\d+) de ([a-z]+)\. de (\d+)")

# Left-pad single-digit days so every day is two characters wide.
date_clean[0] = date_clean[0].str.zfill(2)

# Portuguese month abbreviations in calendar order, mapped to "01".."12".
month_abbrs = ["jan", "fev", "mar", "abr", "mai", "jun",
               "jul", "ago", "set", "out", "nov", "dez"]
months_replace = {abbr: f"{number:02d}" for number, abbr in enumerate(month_abbrs, start=1)}

date_clean[1] = date_clean[1].replace(months_replace)

# Assemble "dd/mm/YYYY" strings and parse them with an explicit format.
day_month_year = date_clean[0] + "/" + date_clean[1] + "/" + date_clean[2]
df_clean["date"] = pd.to_datetime(day_month_year, format="%d/%m/%Y")

## Limpeza de Views

In [214]:
# Preview of the parsed view counts.
# The raw text looks like "63.761 visualizações", where "." is the thousands
# separator. The pattern captures the whole dot-grouped number, so counts with
# several separators ("1.234.567") are not truncated and single-digit counts
# ("5") still match. Rows without any number fall back to 0.
(
    df["watch-view-count"]
    .str.extract(r"(\d+(?:\.\d+)*)", expand=False)
    # regex=False: "." must be removed literally, not treated as "any character".
    .str.replace(".", "", regex=False)
    .fillna(0)
    .astype(int)
)

0        63761
1        85461
2        57726
3       272406
4        10467
         ...  
1568        55
1569       541
1570    238395
1571    136141
1572      7270
Name: watch-view-count, Length: 1573, dtype: int32

In [40]:
# Parse view counts from text such as "63.761 visualizações", where "." is the
# thousands separator. The pattern captures the whole dot-grouped number, so
# counts with several separators ("1.234.567") are not truncated to their first
# two groups and single-digit counts ("5") are no longer turned into 0.
# Rows without any number fall back to 0.
df_clean["views"] = (
    df["watch-view-count"]
    .str.extract(r"(\d+(?:\.\d+)*)", expand=False)
    # regex=False: "." must be removed literally, not treated as "any character".
    .str.replace(".", "", regex=False)
    .fillna(0)
    .astype(int)
)

In [41]:
df_clean.head()

Unnamed: 0,title,date,views
0,O que é Data Science? #HipstersPontoTube,2019-04-03,63761
1,Data Science: Introdução a Ciência de Dados (P...,2019-09-23,85461
2,Como é ser data scientist no Nubank,2019-01-16,57726
3,O que é ciência de dados | Nerdologia Tech,2018-07-26,272406
4,Como ingressar no MERCADO de DATA SCIENCE | Bi...,2019-10-21,10467


## Features

In [42]:
# Feature matrix starts empty, with the same row index as df_clean.
features = pd.DataFrame(index=df_clean.index)
# Target vector: an independent copy of the "interested" label column.
y = df["interested"].copy()

In [43]:
# Age of each video in (fractional) days, measured from the fixed reference
# date 2020-04-01. It is only an intermediate value and is not kept as a feature.
days_since_pub = (pd.to_datetime("2020-04-01") - df_clean["date"]) / np.timedelta64(1, 'D')

features["views"] = df_clean["views"]
# Average number of views per day since publication.
features["views_por_dia"] = features["views"] / days_since_pub

In [44]:
features.head()

Unnamed: 0,views,views_por_dia
0,63761,175.167582
1,85461,447.439791
2,57726,130.897959
3,272406,442.936585
4,10467,64.214724


In [46]:
# Time-based split: videos published before the cutoff train the model, videos
# published on or after it validate it. Rows whose date is NaT fail both
# comparisons and therefore land in neither set.
split_date = "2018-10-01"
mask_train = df_clean["date"] < split_date
mask_val = df_clean["date"] >= split_date

Xtrain = features[mask_train]
Xval = features[mask_val]
ytrain = y[mask_train]
yval = y[mask_val]

Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((798, 2), (767, 2), (798,), (767,))

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [48]:
# Titles for each side of the split, selected with a single .loc lookup
# (row mask + column label) rather than chained indexing.
title_train = df_clean.loc[mask_train, "title"]
title_val = df_clean.loc[mask_val, "title"]

In [184]:
# Unigram TF-IDF over the titles; min_df=2 drops terms that occur in fewer
# than two training titles.
title_vec = TfidfVectorizer(
    min_df=2,
    ngram_range=(1, 1),
)

# Fit on the training titles only; the validation titles are merely
# transformed with the already-fitted vectorizer.
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [185]:
title_bow_train.shape

(798, 619)

In [186]:
title_bow_train

<798x619 sparse matrix of type '<class 'numpy.float64'>'
	with 5211 stored elements in Compressed Sparse Row format>

In [187]:
from scipy.sparse import hstack, vstack

In [188]:
# Numeric feature columns first, TF-IDF title columns after them; the same
# layout is used for the training and validation matrices.
train_blocks = [Xtrain, title_bow_train]
val_blocks = [Xval, title_bow_val]

Xtrain_wtitle = hstack(train_blocks)
Xval_wtitle = hstack(val_blocks)

In [189]:
Xtrain_wtitle.shape, Xval_wtitle.shape

((798, 621), (767, 621))

In [190]:
# Random forest baseline on numeric + title features.
rf_params = dict(
    n_estimators=1000,
    min_samples_leaf=1,
    class_weight="balanced",
    random_state=0,  # fixed seed
    n_jobs=6,
)
mdl = RandomForestClassifier(**rf_params)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=6, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [191]:
# Validation scores: second column of predict_proba
# (presumably the positive class, interested == 1 — confirm with mdl.classes_).
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [192]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [193]:
# Previous: 0.68, 0.7923196847300129, 0.7547768517352968
average_precision_score(yval, p)

0.6026857612699679

In [194]:
# Previous: 0.63, 0.7243598428397237, 0.7123174113140667
roc_auc_score(yval, p)

0.7457445542197358

In [65]:
# avg: 0.6026857612699679, auc: 0.7457445542197358

In [None]:
# min_df=2 performed better than min_df=1 and 3
# ap 0.7558327673464535, auc 0.6818859233166237 - mindf=1
# ap 0.7923196847300129, auc 0.7243598428397237 - mindf=2
# ap 0.7837520730469235, auc 0.7177663369913743 - mindf=3

## LightGBM

In [196]:
from lightgbm import LGBMClassifier

In [198]:
# LightGBM with library defaults apart from class weighting, seed and thread
# count. Rebinding `mdl` replaces whatever model the name held before.
lgbm_params = dict(
    class_weight="balanced",
    random_state=0,
    n_jobs=6,
)
mdl = LGBMClassifier(**lgbm_params)
mdl.fit(Xtrain_wtitle, ytrain)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=100, n_jobs=6, num_leaves=31,
               objective=None, random_state=0, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [201]:
# Validation scores from the current `mdl`: second column of predict_proba
# (presumably the positive class, interested == 1 — confirm with mdl.classes_).
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [204]:
# Average precision and ROC AUC of the scores in `p` on the validation labels.
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.5405935778899976, 0.7078919176288537)

In [205]:
import joblib as jb

In [216]:
# Persist the model currently bound to `mdl` with joblib.
# NOTE(review): joblib is expected to choose the compression from the ".z" extension — confirm.
jb.dump(mdl, "lgbm_20200408.pkl.z")

['lgbm_20200408.pkl.z']

In [217]:
# Persist the fitted title TF-IDF vectorizer with joblib.
jb.dump(title_vec, "titlevec_20200408.pkl.z")

['titlevec_20200408.pkl.z']