In [81]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, classification_report

from typing import Dict, Any

In [82]:
test = pd.read_parquet('test')
train = pd.read_parquet('train')

In [83]:
model = LogisticRegression()

In [84]:
complete_dataset_length = len(train) + len(test)
train_proportion = len(train) / complete_dataset_length
test_proportion = len(test) / complete_dataset_length

train_proportion, test_proportion 

(0.8995812733326607, 0.10041872666733932)

In [85]:
train = train.sort_values('timedelta', ascending=False).dropna()

In [86]:
X_FEATURES = [
    'timedelta', 'n_tokens_title', 'n_tokens_content',
    'n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens',
    'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
    'average_token_length', 'num_keywords', 'data_channel_is_lifestyle',
    'data_channel_is_entertainment', 'data_channel_is_bus',
    'data_channel_is_socmed', 'data_channel_is_tech',
    'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
    'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
    'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
    'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
    'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
    'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
    'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
    'global_sentiment_polarity', 'global_rate_positive_words',
    'global_rate_negative_words', 'rate_positive_words',
    'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity',
    'max_positive_polarity', 'avg_negative_polarity',
    'min_negative_polarity', 'max_negative_polarity', 'title_subjectivity',
    'title_sentiment_polarity', 'abs_title_subjectivity',
    'abs_title_sentiment_polarity', 'content', 'surprise1', 'surprise2',
]

Y_FEATURES = ['popular']

In [87]:
x_test = test[X_FEATURES]
y_test = test[Y_FEATURES]

In [88]:
x_train, x_valid, y_train, y_valid = train_test_split(train[X_FEATURES], train[Y_FEATURES], test_size=0.2, random_state=42)

def assert_proportion_of_dataset_partition_is(expected_proportion, dataset_to_check, complete_dataset):
    assert(round((len(dataset_to_check) / len(complete_dataset)), 1) == expected_proportion)

assert_proportion_of_dataset_partition_is(expected_proportion=.8, dataset_to_check=x_train, complete_dataset=train[X_FEATURES])
assert_proportion_of_dataset_partition_is(expected_proportion=.2, dataset_to_check=x_valid, complete_dataset=train[X_FEATURES])
assert_proportion_of_dataset_partition_is(expected_proportion=.8, dataset_to_check=y_train, complete_dataset=train[Y_FEATURES])
assert_proportion_of_dataset_partition_is(expected_proportion=.2, dataset_to_check=y_valid, complete_dataset=train[Y_FEATURES])

In [89]:
def oneHotEncoding(dataframe: pd.DataFrame, feature: str, encodings: Dict[str, Any]):
    encodings[feature] = one_hot_encoder.fit_transform(dataframe[feature])

In [90]:
one_hot_encoder = OneHotEncoder()
encoded_surprise1 = one_hot_encoder.fit_transform(x_train[['surprise1']]).todense().astype(int)
encoded_surprise1_valid = one_hot_encoder.transform(x_valid[['surprise1']]).todense().astype(int)

In [91]:
x_train['surprise1'] = encoded_surprise1
x_valid['surprise1'] = encoded_surprise1_valid

In [92]:
x_train['surprise2'] = x_train['surprise2'].map(lambda x: 'unknown' if x == '' else x)
x_valid['surprise2'] = x_valid['surprise2'].map(lambda x: 'unknown' if x == '' else x)

In [93]:
x_train = x_train.join(x_train['surprise2'].str.get_dummies(',')).drop('surprise2', axis=1).drop('annoyance', axis=1)
x_valid = x_valid.join(x_valid['surprise2'].str.get_dummies(',')).drop('surprise2', axis=1)

In [94]:
encoded_y_train = one_hot_encoder.fit_transform(y_train[['popular']]).todense().astype(int)
encoded_y_valid = one_hot_encoder.transform(y_valid[['popular']]).todense().astype(int)

In [95]:
# y_train['popular'] = encoded_y_train
# y_valid['popular'] = encoded_y_valid

In [96]:
def total_characters_in_content(dataframe):
    return sum(dataframe['content'].map(lambda x: len(x) if x else 0))

In [97]:
total_train = total_characters_in_content(x_train)
x_train['content'] = x_train['content'].map(lambda x: len(x)/total_train if x else 0)

total_valid = total_characters_in_content(x_valid)
x_valid['content'] = x_valid['content'].map(lambda x: len(x)/total_valid if x else 0)

In [98]:
model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [99]:
binary_predictions = model.predict(x_valid)
binary_predictions

array([False, False, False, ..., False, False, False])

In [100]:
print(classification_report(y_valid, binary_predictions))

              precision    recall  f1-score   support

       False       0.79      0.98      0.87      5622
        True       0.23      0.02      0.04      1495

    accuracy                           0.78      7117
   macro avg       0.51      0.50      0.46      7117
weighted avg       0.67      0.78      0.70      7117



In [101]:
probability_predictions = model.predict_proba(x_valid)[:,1]
probability_predictions

array([0.16436206, 0.1589858 , 0.20199375, ..., 0.20533415, 0.27888945,
       0.17507736])

In [102]:
roc_auc_score(y_valid, probability_predictions)

0.585462272557999