In [125]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer

from typing import Dict, Any
from xgboost import XGBRegressor

In [126]:
test = pd.read_parquet('test')
train = pd.read_parquet('train')

In [127]:
model = XGBRegressor()
one_hot_encoder = OneHotEncoder()
multilabel_binarizer = MultiLabelBinarizer()
count_vectorizer = CountVectorizer(analyzer='word', stop_words='english', max_features=10)

In [128]:
X_FEATURES = [
    'timedelta', 'n_tokens_title', 'n_tokens_content',
    'n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens',
    'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
    'average_token_length', 'num_keywords', 'data_channel_is_lifestyle',
    'data_channel_is_entertainment', 'data_channel_is_bus',
    'data_channel_is_socmed', 'data_channel_is_tech',
    'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
    'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
    'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
    'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
    'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
    'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
    'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
    'global_sentiment_polarity', 'global_rate_positive_words',
    'global_rate_negative_words', 'rate_positive_words',
    'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity',
    'max_positive_polarity', 'avg_negative_polarity',
    'min_negative_polarity', 'max_negative_polarity', 'title_subjectivity',
    'title_sentiment_polarity', 'abs_title_subjectivity',
    'abs_title_sentiment_polarity', 'content', 'surprise1', 'surprise2',
]

Y_FEATURES = ['popular']

In [129]:
train = train.sort_values('timedelta', ascending=False)

In [130]:
x_train, x_valid, y_train, y_valid = train_test_split(train[X_FEATURES], train[Y_FEATURES], test_size=0.2, random_state=10)

def assert_proportion_of_dataset_partition_is(expected_proportion, dataset_to_check, complete_dataset):
    assert(round((len(dataset_to_check) / len(complete_dataset)), 1) == expected_proportion)

assert_proportion_of_dataset_partition_is(expected_proportion=.8, dataset_to_check=x_train, complete_dataset=train[X_FEATURES])
assert_proportion_of_dataset_partition_is(expected_proportion=.2, dataset_to_check=x_valid, complete_dataset=train[X_FEATURES])
assert_proportion_of_dataset_partition_is(expected_proportion=.8, dataset_to_check=y_train, complete_dataset=train[Y_FEATURES])
assert_proportion_of_dataset_partition_is(expected_proportion=.2, dataset_to_check=y_valid, complete_dataset=train[Y_FEATURES])

In [131]:
x_train = x_train.drop('timedelta', axis=1)
x_valid = x_valid.drop('timedelta', axis=1)

In [132]:
encoded_surprise1_train = one_hot_encoder.fit_transform(x_train[['surprise1']]).todense().astype(int)
encoded_surprise1_valid = one_hot_encoder.transform(x_valid[['surprise1']]).todense().astype(int)

x_train['surprise1'] = encoded_surprise1_train
x_valid['surprise1'] = encoded_surprise1_valid

In [133]:
x_train['surprise2'] = x_train['surprise2'].map(lambda x: 'unknown' if x == '' else x)
x_valid['surprise2'] = x_valid['surprise2'].map(lambda x: 'unknown' if x == '' else x)

encoded_surprise2_train = multilabel_binarizer.fit_transform(x_train['surprise2'])
encoded_surprise2_valid = multilabel_binarizer.transform(x_valid['surprise2'])

x_train['surprise2'] = encoded_surprise2_train
x_valid['surprise2'] = encoded_surprise2_valid

In [134]:
encoded_y_train = one_hot_encoder.fit_transform(y_train[['popular']]).todense().astype(int)
encoded_y_valid = one_hot_encoder.transform(y_valid[['popular']]).todense().astype(int)
y_train['popular'] = encoded_y_train
y_valid['popular'] = encoded_y_valid

In [135]:
x_train['content'] = x_train['content'].fillna('')
x_valid['content'] = x_valid['content'].fillna('')

vectorized_x_train_content = count_vectorizer.fit_transform(x_train['content'].tolist()).toarray()
x_train_content_word_count = pd.DataFrame(vectorized_x_train_content, columns=count_vectorizer.get_feature_names())

vectorized_x_valid_content = count_vectorizer.transform(x_valid['content'].tolist()).toarray()
x_valid_content_word_count = pd.DataFrame(vectorized_x_valid_content, columns=count_vectorizer.get_feature_names())

x_train['content'] = vectorized_x_train_content
x_valid['content'] = vectorized_x_valid_content

x_train = pd.concat([x_train.reset_index(), x_train_content_word_count.reset_index()], axis=1)
x_valid = pd.concat([x_valid.reset_index(), x_valid_content_word_count.reset_index()], axis=1)

In [136]:
x_train = x_train.fillna(0.0)
x_valid = x_valid.fillna(0.0)

In [137]:
x_train = x_train.drop('index', axis=1)
x_valid = x_valid.drop('index', axis=1)

In [138]:
model.fit(x_train, y_train['popular'])

In [139]:
roc_auc_score(y_valid['popular'], model.predict(x_valid))

0.6747864205106353

In [140]:
from catboost import CatBoostRegressor

model2 = CatBoostRegressor()

In [141]:
model2.fit(x_train, y_train['popular'])

Learning rate set to 0.069521
0:	learn: 0.3990703	total: 10.1ms	remaining: 10.1s
1:	learn: 0.3976640	total: 15.4ms	remaining: 7.68s
2:	learn: 0.3963300	total: 23.2ms	remaining: 7.72s
3:	learn: 0.3951063	total: 30.1ms	remaining: 7.5s
4:	learn: 0.3941327	total: 37ms	remaining: 7.37s
5:	learn: 0.3932164	total: 44.2ms	remaining: 7.32s
6:	learn: 0.3923380	total: 51.9ms	remaining: 7.36s
7:	learn: 0.3915546	total: 57.8ms	remaining: 7.17s
8:	learn: 0.3909172	total: 63.2ms	remaining: 6.96s
9:	learn: 0.3902469	total: 145ms	remaining: 14.3s
10:	learn: 0.3895681	total: 156ms	remaining: 14s
11:	learn: 0.3890479	total: 168ms	remaining: 13.8s
12:	learn: 0.3885411	total: 175ms	remaining: 13.3s
13:	learn: 0.3881216	total: 180ms	remaining: 12.7s
14:	learn: 0.3876741	total: 185ms	remaining: 12.2s
15:	learn: 0.3872665	total: 192ms	remaining: 11.8s
16:	learn: 0.3868642	total: 198ms	remaining: 11.5s
17:	learn: 0.3865822	total: 205ms	remaining: 11.2s
18:	learn: 0.3862585	total: 210ms	remaining: 10.8s
19:	lea

<catboost.core.CatBoostRegressor at 0x151e06910>

In [142]:
roc_auc_score(y_valid['popular'], model2.predict(x_valid))

0.7106264536258837

# Búsqueda de Hiperparámetros

In [143]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

## GridSearchCV

In [144]:
hyperparameter_grid = {
    'n_estimators': [100, 500, 900, 1100, 1500],
    'max_depth': [2, 3, 5, 10, 15],
    'learning_rate': [0.05, 0.1, 0.15, 0.20],
    'min_child_weight': [1, 2, 3, 4]
}

clf = GridSearchCV(
    estimator=model, 
    param_grid=hyperparameter_grid,
    scoring='neg_mean_squared_error', 
    verbose=1
)
clf.fit(x_train, y_train)

print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


KeyboardInterrupt: 

## RandomizedSearchCV