In [1]:
# Libraries to remove warnings
import warnings 
warnings.filterwarnings('ignore')

# Libraries for regex parsing, date handling and tabular data manipulation
import re
import datetime as dt
import pandas as pd


In [2]:
# Load the Online News Popularity workbook into a DataFrame and preview
# its first five rows (the default for head()).
df_orig = pd.read_excel(io='../data/input/OnlineNewsPopularity.xlsx')
df_orig.head()

Unnamed: 0,Id,url,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,shares
0,1,http://mashable.com/2014/09/01/americans-held-...,10,261,0.661355,1.0,0.7875,7,3,1,1,4.873563,7,0,0,0,0,0,1,1100
1,2,http://mashable.com/2014/09/01/apple-visa-mast...,7,1791,0.370242,1.0,0.535038,74,3,50,0,4.554439,8,0,0,0,0,0,0,1100
2,3,http://mashable.com/2014/09/01/aussie-football...,7,503,0.524291,1.0,0.704918,3,3,1,0,5.003976,5,0,0,0,0,0,1,1000
3,4,http://mashable.com/2014/09/01/australia-gover...,10,526,0.536204,1.0,0.654867,17,1,1,0,4.998099,9,0,0,0,0,0,1,822
4,5,http://mashable.com/2014/09/01/australia-jane-...,13,237,0.619048,1.0,0.807143,5,3,1,0,5.046414,9,0,0,0,0,0,1,841


In [3]:
# Derive each article's publication date from the YYYY/MM/DD segment of its URL.
# The pattern is a raw string: "\d" inside a plain string literal is an invalid
# escape sequence. Extraction is vectorised instead of a per-row lambda.
url_dates = df_orig['url'].str.extract(r'(\d{4}/\d{2}/\d{2})', expand=False)

# Fail loudly and descriptively when a URL carries no date, instead of the
# opaque AttributeError that calling .group() on a failed match would raise.
if url_dates.isna().any():
    missing_urls = df_orig.loc[url_dates.isna(), 'url'].head().tolist()
    raise ValueError(f"No YYYY/MM/DD date found in url(s): {missing_urls}")

# .dt.date keeps the column as datetime.date objects, as before.
df_orig['date'] = pd.to_datetime(url_dates, format='%Y/%m/%d').dt.date

In [4]:
# Convert the extracted dates to pandas datetimes, then derive the weekday
# name (e.g. 'Monday') for every article.
parsed_dates = pd.to_datetime(df_orig['date'])
df_orig['date'] = parsed_dates
df_orig['day_of_week'] = parsed_dates.dt.day_name()

# Preview the first five rows with the two new columns.
df_orig.head()

Unnamed: 0,Id,url,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,shares,date,day_of_week
0,1,http://mashable.com/2014/09/01/americans-held-...,10,261,0.661355,1.0,0.7875,7,3,1,...,7,0,0,0,0,0,1,1100,2014-09-01,Monday
1,2,http://mashable.com/2014/09/01/apple-visa-mast...,7,1791,0.370242,1.0,0.535038,74,3,50,...,8,0,0,0,0,0,0,1100,2014-09-01,Monday
2,3,http://mashable.com/2014/09/01/aussie-football...,7,503,0.524291,1.0,0.704918,3,3,1,...,5,0,0,0,0,0,1,1000,2014-09-01,Monday
3,4,http://mashable.com/2014/09/01/australia-gover...,10,526,0.536204,1.0,0.654867,17,1,1,...,9,0,0,0,0,0,1,822,2014-09-01,Monday
4,5,http://mashable.com/2014/09/01/australia-jane-...,13,237,0.619048,1.0,0.807143,5,3,1,...,9,0,0,0,0,0,1,841,2014-09-01,Monday


In [5]:
# One-hot encode the weekday name. prefix='is_' combined with the default '_'
# separator yields columns named 'is__Monday', 'is__Tuesday', ... (double
# underscore); later cells reference exactly these names, so they are kept.
# dtype=int pins the indicators to 0/1: without it, pandas >= 2.0 returns
# boolean columns and the exported sheet would hold True/False instead.
day_dummies = pd.get_dummies(df_orig['day_of_week'], prefix='is_', dtype=int)
df_orig = pd.concat([df_orig, day_dummies], axis=1)

# The helper columns are no longer needed once the indicators exist.
# Rebinding (rather than inplace=True) matches the concat above.
df_orig = df_orig.drop(columns=['date', 'day_of_week'])
df_orig.head()

Unnamed: 0,Id,url,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,data_channel_is_tech,data_channel_is_world,shares,is__Friday,is__Monday,is__Saturday,is__Sunday,is__Thursday,is__Tuesday,is__Wednesday
0,1,http://mashable.com/2014/09/01/americans-held-...,10,261,0.661355,1.0,0.7875,7,3,1,...,0,1,1100,0,1,0,0,0,0,0
1,2,http://mashable.com/2014/09/01/apple-visa-mast...,7,1791,0.370242,1.0,0.535038,74,3,50,...,0,0,1100,0,1,0,0,0,0,0
2,3,http://mashable.com/2014/09/01/aussie-football...,7,503,0.524291,1.0,0.704918,3,3,1,...,0,1,1000,0,1,0,0,0,0,0
3,4,http://mashable.com/2014/09/01/australia-gover...,10,526,0.536204,1.0,0.654867,17,1,1,...,0,1,822,0,1,0,0,0,0,0
4,5,http://mashable.com/2014/09/01/australia-jane-...,13,237,0.619048,1.0,0.807143,5,3,1,...,0,1,841,0,1,0,0,0,0,0


In [6]:
def weekend(row):
    """Return 1 when the row is flagged as Saturday or Sunday, otherwise 0.

    ``row`` is any object supporting key lookup (for example a DataFrame row)
    that exposes the 'is__Saturday' and 'is__Sunday' indicator values.
    """
    # Both flags are looked up before combining them, so a missing key fails
    # the same way regardless of the other flag's value.
    saturday_flag = row['is__Saturday'] == 1
    sunday_flag = row['is__Sunday'] == 1
    return 1 if (saturday_flag | sunday_flag) else 0

# Flag weekend articles: 1 when either the Saturday or the Sunday indicator is
# set, else 0. Computed column-wise instead of DataFrame.apply(axis=1), which
# would call a Python function once per row; the result is the same int64
# 0/1 column.
df_orig['is_Weekend'] = (
    df_orig['is__Saturday'].eq(1) | df_orig['is__Sunday'].eq(1)
).astype('int64')

# Display only weekend == 1 rows
df_orig[df_orig['is_Weekend'] == 1].head(100)

Unnamed: 0,Id,url,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,data_channel_is_world,shares,is__Friday,is__Monday,is__Saturday,is__Sunday,is__Thursday,is__Tuesday,is__Wednesday,is_Weekend
364,365,http://mashable.com/2014/09/06/alternate-endin...,9,892,0.510181,1.0,0.667780,15,1,1,...,0,1000,0,0,1,0,0,0,0,1
365,366,http://mashable.com/2014/09/06/apple-iphone-6-...,11,99,0.704082,1.0,0.824561,7,7,0,...,0,3600,0,0,1,0,0,0,0,1
366,367,http://mashable.com/2014/09/06/apple-looks-to-...,14,1069,0.401328,1.0,0.599332,2,2,1,...,0,1100,0,0,1,0,0,0,0,1
367,368,http://mashable.com/2014/09/06/app-roundup-nfl...,9,1074,0.405328,1.0,0.601351,16,13,2,...,0,9300,0,0,1,0,0,0,0,1
368,369,http://mashable.com/2014/09/06/bill-murray-jak...,10,229,0.581498,1.0,0.786325,2,2,1,...,0,2600,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
877,878,http://mashable.com/2014/09/14/amsterdam-tomat...,9,417,0.550617,1.0,0.705882,5,4,1,...,0,1400,0,0,0,1,0,0,0,1
878,879,http://mashable.com/2014/09/14/amy-winehouse-l...,11,743,0.488435,1.0,0.672897,8,8,1,...,0,881,0,0,0,1,0,0,0,1
879,880,http://mashable.com/2014/09/14/animal-selfie-t...,13,387,0.551532,1.0,0.658120,3,1,1,...,0,2000,0,0,0,1,0,0,0,1
880,881,http://mashable.com/2014/09/14/anthropologie-diy/,13,294,0.585366,1.0,0.719101,8,5,1,...,0,2300,0,0,0,1,0,0,0,1


In [7]:
# Keep only the article id plus the weekday and weekend indicator columns,
# ordered Monday through Sunday.
df_days = df_orig.loc[:, [
    'Id',
    'is__Monday',
    'is__Tuesday',
    'is__Wednesday',
    'is__Thursday',
    'is__Friday',
    'is__Saturday',
    'is__Sunday',
    'is_Weekend',
]]

In [12]:
# Preview the first 400 rows of the indicator table.
df_days.iloc[:400]

Unnamed: 0,Id,is__Monday,is__Tuesday,is__Wednesday,is__Thursday,is__Friday,is__Saturday,is__Sunday,is_Weekend
0,1,1,0,0,0,0,0,0,0
1,2,1,0,0,0,0,0,0,0
2,3,1,0,0,0,0,0,0,0
3,4,1,0,0,0,0,0,0,0
4,5,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
395,396,0,0,0,0,0,1,0,1
396,397,0,0,0,0,0,0,1,1
397,398,0,0,0,0,0,0,1,1
398,399,0,0,0,0,0,0,1,1


In [15]:
# Persist the indicator table to the output workbook; the DataFrame index is
# not written as a column.
df_days.to_excel(
    excel_writer='../data/output/2_Day_of_Week_Extraction.xlsx',
    index=False,
)