# Entry 11 notebook - Consolidate Pre-processing - Online News Popularity

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import category_encoders as ce

### Custom functions

In [2]:
def split_data(df, target, train_size):
    y = df[[target]]
    X = df.drop(target, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, random_state=12)
    return X_train, X_test, y_train, y_test

def define_reserve(X_test, y_test):
    X_test, X_reserve, y_test, y_reserve = train_test_split(X_test, y_test, test_size = 0.5, random_state=12)
    return X_test, X_reserve, y_test, y_reserve

In [3]:
def feature_corr_coll(train_df, target, test_df, corr_type='spearman'):
    target_corr = pd.DataFrame(train_df.corrwith(target, axis=0, method=corr_type).reset_index()).rename(columns={0:corr_type})
    target_corr['abs'] = target_corr[corr_type].abs()
    top_features = target_corr[target_corr['abs'] >= 0.5].sort_values('abs', ascending=False)['index'].tolist()
    df_top = train_df[top_features]
    feature_corr = df_top.corr(method=corr_type)
    
    collinear_features = set()
    for i in range(len(feature_corr.columns)):
        sliced_matrix = feature_corr.iloc[i, :i]
        if sum(sliced_matrix[abs(sliced_matrix) > 0.9]):
            colname = feature_corr.columns[i]
            collinear_features.add(colname)
    collinear_features = list(collinear_features)
    print(collinear_features)
    df_train = df_top.drop(collinear_features, axis=1)
    
    select_features = df_train.columns.tolist()
    df_test = test_df[select_features]
    
    return df_train, df_test

### Broken function

The above function only works if there are absolute spearman correlations over 0.5. This can be seen in the following code.

In [4]:
raw_df = pd.read_csv('../data/OnlineNewsPopularity.csv')
df = raw_df.drop(['url', ' timedelta'], axis=1)
corr_type='spearman'

X_train, X_test, y_train, y_test = split_data(df, ' shares', 0.6)
train_df = X_train
X_train.head()

Unnamed: 0,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
28664,10.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.3,0.5,0.3
22537,15.0,878.0,0.456446,1.0,0.646245,5.0,2.0,2.0,0.0,4.895216,...,0.360095,0.1,1.0,-0.268576,-0.6,-0.1,0.0,0.0,0.5,0.0
2557,8.0,275.0,0.636029,1.0,0.813665,3.0,1.0,0.0,0.0,4.603636,...,0.370667,0.1,0.85,-0.275,-0.3,-0.25,0.0,0.0,0.5,0.0
25030,10.0,319.0,0.591195,1.0,0.734694,5.0,3.0,6.0,2.0,4.545455,...,0.360606,0.1,1.0,-0.321478,-0.5,-0.166667,0.4,-0.4,0.1,0.4
6636,10.0,217.0,0.599034,1.0,0.738462,3.0,2.0,1.0,0.0,4.78341,...,0.435714,0.1,0.6,-0.0375,-0.05,-0.025,0.0,0.0,0.5,0.0


In [5]:
y_train

Unnamed: 0,shares
28664,4100
22537,1100
2557,5100
25030,907
6636,1100
...,...
9475,2100
36482,1200
19709,1100
38555,811


In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15857 entries, 28664 to 14155
Data columns (total 58 columns):
 n_tokens_title                   15857 non-null float64
 n_tokens_content                 15857 non-null float64
 n_unique_tokens                  15857 non-null float64
 n_non_stop_words                 15857 non-null float64
 n_non_stop_unique_tokens         15857 non-null float64
 num_hrefs                        15857 non-null float64
 num_self_hrefs                   15857 non-null float64
 num_imgs                         15857 non-null float64
 num_videos                       15857 non-null float64
 average_token_length             15857 non-null float64
 num_keywords                     15857 non-null float64
 data_channel_is_lifestyle        15857 non-null float64
 data_channel_is_entertainment    15857 non-null float64
 data_channel_is_bus              15857 non-null float64
 data_channel_is_socmed           15857 non-null float64
 data_channel_is_tech           

In [7]:
target = y_train
target_corr = pd.DataFrame(train_df.corrwith(target[' shares'], axis=0, method=corr_type).reset_index()).rename(columns={0:corr_type})
target_corr['abs'] = target_corr[corr_type].abs()
top_features = target_corr[target_corr['abs'] >= 0.5].sort_values('abs', ascending=False)['index'].tolist()
df_top = train_df[top_features]
feature_corr = df_top.corr(method=corr_type)

target_corr.head()

Unnamed: 0,index,spearman,abs
0,n_tokens_title,-0.043478,0.043478
1,n_tokens_content,0.012237,0.012237
2,n_unique_tokens,-0.047743,0.047743
3,n_non_stop_words,0.012353,0.012353
4,n_non_stop_unique_tokens,-0.070412,0.070412


In [8]:
target_corr.sort_values('abs', ascending=False)

Unnamed: 0,index,spearman,abs
25,kw_avg_avg,0.25042,0.25042
24,kw_max_avg,0.215194,0.215194
28,self_reference_avg_sharess,0.185206,0.185206
26,self_reference_min_shares,0.176022,0.176022
16,data_channel_is_world,-0.172506,0.172506
39,LDA_02,-0.162267,0.162267
27,self_reference_max_shares,0.162251,0.162251
36,is_weekend,0.150291,0.150291
12,data_channel_is_entertainment,-0.113471,0.113471
35,weekday_is_sunday,0.109095,0.109095


In [13]:
X_train_f, X_test_f = feature_corr_coll(X_train, y_train[' shares'], X_test)
X_train_f.head()

[]


28664
22537
2557
25030
6636


In [9]:
def preprocess_data(train_df, test_df, scaler=StandardScaler(), encoder=ce.OrdinalEncoder):
    index = train_df.index.tolist()
    test_index = test_df.index.tolist()
    
    num_features = train_df.select_dtypes('number').columns.tolist()    
    num_scale = scaler.fit_transform(train_df[num_features])
    train_num_df = pd.DataFrame(num_scale, columns=num_features, index=index)
    
    test_num_scale = scaler.transform(test_df[num_features])
    test_num_df = pd.DataFrame(test_num_scale, columns=num_features, index=test_index)
    
    cat_features = train_df.select_dtypes('object').columns.tolist()
    cat_encoder = encoder(cols=cat_features)
    cat_encode = cat_encoder.fit_transform(train_df[cat_features])
    train_cat_df = pd.DataFrame(cat_encode, columns=cat_features, index=index)
    
    test_cat_encode = cat_encoder.transform(test_df[cat_features])
    test_cat_df = pd.DataFrame(test_cat_encode, columns=cat_features, index=test_index)
    
    train_df = pd.concat([train_num_df, train_cat_df], axis=1).fillna(-1)
    test_df = pd.concat([test_num_df, test_cat_df], axis=1).fillna(-1)
    
    return train_df, test_df

In [10]:
def train_and_predict(X_train, y_train, X_test, model=LinearRegression()):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return model, preds

# Get the Data

## Data

[Online News Popularity](http://archive.ics.uci.edu/ml/datasets/Computer+Hardware) from UCI Machine Learning Repository.

Number of observations: 39,797

Number of Attributes (all integers): 61

0. url: URL of the article (non-predictive) 
1. timedelta: Days between the article publication and the dataset acquisition (non-predictive) 
2. n_tokens_title: Number of words in the title 
3. n_tokens_content: Number of words in the content 
4. n_unique_tokens: Rate of unique words in the content 
5. n_non_stop_words: Rate of non-stop words in the content 
6. n_non_stop_unique_tokens: Rate of unique non-stop words in the content 
7. num_hrefs: Number of links 
8. num_self_hrefs: Number of links to other articles published by Mashable 
9. num_imgs: Number of images 
10. num_videos: Number of videos 
11. average_token_length: Average length of the words in the content 
12. num_keywords: Number of keywords in the metadata 
13. data_channel_is_lifestyle: Is data channel 'Lifestyle'? 
14. data_channel_is_entertainment: Is data channel 'Entertainment'? 
15. data_channel_is_bus: Is data channel 'Business'? 
16. data_channel_is_socmed: Is data channel 'Social Media'? 
17. data_channel_is_tech: Is data channel 'Tech'? 
18. data_channel_is_world: Is data channel 'World'? 
19. kw_min_min: Worst keyword (min. shares) 
20. kw_max_min: Worst keyword (max. shares) 
21. kw_avg_min: Worst keyword (avg. shares) 
22. kw_min_max: Best keyword (min. shares) 
23. kw_max_max: Best keyword (max. shares) 
24. kw_avg_max: Best keyword (avg. shares) 
25. kw_min_avg: Avg. keyword (min. shares) 
26. kw_max_avg: Avg. keyword (max. shares) 
27. kw_avg_avg: Avg. keyword (avg. shares) 
28. self_reference_min_shares: Min. shares of referenced articles in Mashable 
29. self_reference_max_shares: Max. shares of referenced articles in Mashable 
30. self_reference_avg_sharess: Avg. shares of referenced articles in Mashable 
31. weekday_is_monday: Was the article published on a Monday? 
32. weekday_is_tuesday: Was the article published on a Tuesday? 
33. weekday_is_wednesday: Was the article published on a Wednesday? 
34. weekday_is_thursday: Was the article published on a Thursday? 
35. weekday_is_friday: Was the article published on a Friday? 
36. weekday_is_saturday: Was the article published on a Saturday? 
37. weekday_is_sunday: Was the article published on a Sunday? 
38. is_weekend: Was the article published on the weekend? 
39. LDA_00: Closeness to LDA topic 0 
40. LDA_01: Closeness to LDA topic 1 
41. LDA_02: Closeness to LDA topic 2 
42. LDA_03: Closeness to LDA topic 3 
43. LDA_04: Closeness to LDA topic 4 
44. global_subjectivity: Text subjectivity 
45. global_sentiment_polarity: Text sentiment polarity 
46. global_rate_positive_words: Rate of positive words in the content 
47. global_rate_negative_words: Rate of negative words in the content 
48. rate_positive_words: Rate of positive words among non-neutral tokens 
49. rate_negative_words: Rate of negative words among non-neutral tokens 
50. avg_positive_polarity: Avg. polarity of positive words 
51. min_positive_polarity: Min. polarity of positive words 
52. max_positive_polarity: Max. polarity of positive words 
53. avg_negative_polarity: Avg. polarity of negative words 
54. min_negative_polarity: Min. polarity of negative words 
55. max_negative_polarity: Max. polarity of negative words 
56. title_subjectivity: Title subjectivity 
57. title_sentiment_polarity: Title polarity 
58. abs_title_subjectivity: Absolute subjectivity level 
59. abs_title_sentiment_polarity: Absolute polarity level 
60. shares: Number of shares (target)

In [5]:
raw_df = pd.read_csv('../data/OnlineNewsPopularity.csv')
raw_df.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505


### Drop features that are marked as non-predictive, then run process

In [12]:
df = raw_df.drop(['url', ' timedelta'], axis=1)

X_train, X_test, y_train, y_test = split_data(df, ' shares', 0.6)
# X_train, X_test = feature_corr_coll(X_train, y_train[' shares'], X_test)
X_train, X_test = preprocess_data(X_train, X_test)
X_test, X_reserve, y_test, y_reserve = define_reserve(X_test, y_test)
model, preds = train_and_predict(X_train, y_train, X_test)
preds[:5]

array([[2138.73223702],
       [4997.43738997],
       [2873.69405981],
       [1376.3593833 ],
       [2784.37940822]])