In [40]:
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import ElasticNet 

In [None]:
# Load the raw Online News Popularity table from the project's raw-data folder.
df = pd.read_csv('../data/raw/OnlineNewsPopularity.csv')
print(df.shape)
# Summary statistics for the numeric columns (rendered as the cell's output).
df.describe()

In [None]:
# Build a cleaned frame without touching `df`: remove the 'url' column, then
# strip surrounding whitespace from every header label.
df_cleaned = (
    df
    .drop(columns='url')
    .rename(columns=str.strip)
)

# The correlation matrix is computed *before* the target is popped so that it
# still contains the 'shares' row/column used by the feature-selection cell.
cor = df_cleaned.corr()

# Detach the target; after this `df_cleaned` holds only the remaining columns.
target = df_cleaned.pop('shares')

In [None]:
# Rank every column by the absolute value of its correlation with the target,
# strongest first. `order` still includes 'shares' itself (|r| == 1).
order = cor['shares'].abs().sort_values(ascending=False)

# Remove the target by label rather than by position, so the result stays
# correct even if another column ties with it at the top of the ranking.
candidates = order.drop('shares')

# Keep only the features whose |correlation| with 'shares' exceeds 0.02.
selection = candidates[candidates > 0.02].index
selection

In [None]:
# Restrict the cleaned frame to the features chosen in `selection`.
df_strong_features = df_cleaned[selection]

# Pairwise correlations between the selected features.
cor_selected = df_strong_features.corr()

# Show only the strongly correlated pairs (everything else becomes NaN).
# The mask uses the absolute value so that strongly *negative* pairs, which are
# just as collinear as positive ones, are displayed too; the sign is preserved
# in the values that are shown.
cor_selected[cor_selected.abs() > 0.5]

In [None]:
# Display the column labels of the feature-to-feature correlation matrix.
# NOTE(review): presumably used as the starting point for the hand-picked
# `independent_cols` list in the next cell — confirm.
cor_selected.columns 

In [None]:
# Hand-picked subset of the selected features that is carried forward to
# scaling, splitting and modelling.
independent_cols = [
    'kw_avg_avg',
    'LDA_03',
    'LDA_02',
    'self_reference_min_shares',
    'self_reference_max_shares',
    'num_hrefs',
    'kw_avg_max',
    'kw_min_avg',
    'num_imgs',
    'avg_negative_polarity',
    'global_subjectivity',
    'kw_max_min',
    'abs_title_sentiment_polarity',
    'num_videos',
    'num_keywords',
]

# Column-wise selection; the listed order defines the feature order downstream.
df_independent_features = df_strong_features.loc[:, independent_cols]
df_independent_features.shape

In [34]:
# Fit a standardiser on the chosen feature columns, then apply it to the same
# frame to obtain the scaled matrix.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/val/test
# split performed in a later cell, so its statistics include held-out rows.
scaler = StandardScaler().fit(df_independent_features)
# NOTE(review): `df_scaled` is not referenced by the other cells visible in this
# notebook; the split and the model use the unscaled `df_independent_features`.
df_scaled = scaler.transform(df_independent_features)

# Persist the fitted scaler; the returned list of written paths is the cell output.
dump(scaler, '../models/scaler.joblib')

['../models/scaler.joblib']

In [37]:
# First hold out 20% of all rows as the test set ...
X_data, X_test, y_data, y_test = train_test_split(
    df_independent_features,
    target,
    test_size=0.2,
    random_state=8,
)
# ... then hold out 20% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=8,
)

# Write every split to the processed-data folder (np.save appends ".npy").
for path, split in (
    ('../data/processed/X_train', X_train),
    ('../data/processed/X_val', X_val),
    ('../data/processed/X_test', X_test),
    ('../data/processed/y_train', y_train),
    ('../data/processed/y_val', y_val),
    ('../data/processed/y_test', y_test),
):
    np.save(path, split)

In [39]:
# Baseline: predict the training-set mean of the target for every training row.
y_mean = y_train.mean()
y_base = np.full((len(y_train), 1), y_mean)
print(y_mean)
print(y_base)
# RMSE is computed as sqrt(MSE) instead of `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form yields the
# same value on every version.
# NOTE(review): these baseline scores are measured on the training split,
# whereas the model cell below reports validation scores.
print(np.sqrt(mse(y_train, y_base)))
print(mae(y_train, y_base))

3393.0807189027273
[[3393.0807189]
 [3393.0807189]
 [3393.0807189]
 ...
 [3393.0807189]
 [3393.0807189]
 [3393.0807189]]
11661.716125848263
3209.78959340725


In [41]:
# Fit an ElasticNet with default hyper-parameters on the training split.
reg = ElasticNet()
reg.fit(X_train, y_train)
y_train_preds = reg.predict(X_train)
y_val_preds = reg.predict(X_val)
# Validation RMSE, computed as sqrt(MSE) instead of `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6; the value is identical.
print(np.sqrt(mse(y_val, y_val_preds)))
# Validation MAE.
print(mae(y_val, y_val_preds))

8543.122344494523
2891.1692077224748


In [44]:
# Persist the fitted ElasticNet model with joblib; the list of written paths
# returned by dump() is shown as the cell output.
dump(reg,  '../models/elasticnet_default.joblib')

['../models/elasticnet_default.joblib']