 Setting up Workbook to run 

In [None]:
import sys, os

In [None]:
import configparser
import subprocess
import warnings
import pprint

In [None]:
import math
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error

or reproducibility

In [None]:
random.seed(540)

oading the Dataset

In [None]:
from config.definitions import ROOT_DIR
path_to_data = os.path.join(ROOT_DIR, 'data', 'OnlineNewsPopularity.csv')

In [156]:
df = pd.read_csv(path_to_data)
df['id'] = df.index + 1 

In [157]:
first_column = df.pop('id')
df.insert(0,'id', first_column)

In [None]:
print(df.head(3))

DA and Visualizations

In [None]:
df.info()

In [None]:
df.describe()
warnings.filterwarnings('ignore')

In [None]:
df.hist(figsize=(20,20))
plt.show()

In [158]:
df.columns=df.columns.str.replace(" ","")

In [159]:
df = df.drop('url',axis=1)

In [None]:
cor=df.corr()
#ns.heatmap(cor)
plt.figure(figsize=(15,15))
df_lt = cor.where(np.tril(np.ones(cor.shape)).astype(bool))
sns.heatmap(df_lt,cmap='BrBG')

In [None]:
num_of_nowords=df[df['n_tokens_content']==0].index
print('number of news with no words',num_of_nowords.size)

hortening weekday and channel column header titles

In [160]:
df.columns = df.columns.str.replace('weekday_is_', '')
df.columns = df.columns.str.replace('data_channel_is_', '')

In [None]:
df.info()

In [161]:
df = df[df['n_tokens_content'] != 0]

ropping columns that are not valuable for the analysis

In [162]:
df = df.drop('timedelta',axis=1)
df= df.drop(["n_non_stop_unique_tokens","n_non_stop_words","kw_avg_min"],axis=1)

In [None]:
df['shares'].describe()

etermine the appropriate threshold for number of shares for feature engineering.

In [None]:
df['shares'].median()

In [163]:
df['popularity'] = df['shares'].apply(lambda x: 0 if x <1400 else 1)

In [None]:
plt.figure(figsize=(10,5))
ax = sns.scatterplot(y='shares', x='n_tokens_content', data=df)

In [None]:
a,b = df['shares'].mean(),df['shares'].median()

In [None]:
weekday = df.columns.values[27:34]
weekday

isual of popular vs. unpopular news across the week

In [None]:
Unpop=df[df['shares']<b]
Pop=df[df['shares']>=b]
Unpop_day = Unpop[weekday].sum().values
Pop_day = Pop[weekday].sum().values

In [None]:
fig = plt.figure(figsize = (13,5))
plt.title("Count of popular vs unpopular news over different days of the week", fontsize = 16)

In [None]:
plt.bar(np.arange(len(weekday)),Pop_day,width=0.3,align='center',color='b',label='Popular')
plt.bar(np.arange(len(weekday))-0.3,Unpop_day,width=0.3,align='center',color='y',label='Unpopular')

In [None]:
plt.xticks(np.arange(len(weekday)),weekday)
plt.ylabel('Count',fontsize=15)
plt.xlabel('Days of the Week',fontsize=17)

In [None]:
plt.legend(loc = 'upper right')
plt.tight_layout()
plt.show()

In [None]:
channel=df.columns[11:16]
channel

ount of popular vs. unpopular news over different channels

In [None]:
Unpop2=df[df['shares']<b]
Pop2=df[df['shares']>=b]
Unpop_day2 = Unpop2[channel].sum().values
Pop_day2 = Pop2[channel].sum().values
fig = plt.figure(figsize = (13,5))
plt.title("Count of popular vs unpopular news over different channels", fontsize = 16)
plt.bar(np.arange(len(channel)), Pop_day2, width = 0.3, align="center", color = 'r', \
          label = "popular")
plt.bar(np.arange(len(channel)) - 0.3, Unpop_day2, width = 0.3, align = "center", color = 'g', \
          label = "unpopular")
plt.xticks(np.arange(len(channel)),channel)
plt.ylabel("Count", fontsize = 12)
plt.xlabel("Channel", fontsize = 12)
    
plt.legend(loc = 'upper right')
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(15,5))
plt.plot(df.shares,color='g')
plt.xlabel('Records')
plt.ylabel('No of Shares')
plt.title('Shares Plot')
plt.show()

In [None]:
plt.figure(figsize=(13,5))
df.shares.hist(bins=50)

reating subplots for outliers

In [164]:
num_cols = df.select_dtypes(['int64','float64']).columns
num_plots = len(num_cols)
num_rows = math.ceil(num_plots / 6)  
num_cols_subplot = 6

In [None]:
fig, axes = plt.subplots(num_rows, num_cols_subplot, figsize=(15, 5 * num_rows))
axes = axes.flatten()

utliers

In [None]:
for i in range(len(num_cols)):
    sns.boxplot(df[num_cols[i]])
    plt.show()

In [None]:
for column in num_cols:    
    q1 = df[column].quantile(0.25)    # First Quartile
    q3 = df[column].quantile(0.75)    # Third Quartile
    IQR = q3 - q1                            # Inter Quartile Range
    llimit = q1 - 1.5*IQR                       # Lower Limit
    ulimit = q3 + 1.5*IQR                        # Upper Limit
    outliers = df[(df[column] < llimit) | (df[column] > ulimit)]
    print('Number of outliers in "' + column + '" : ' + str(len(outliers)))
    print(llimit)
    print(ulimit)
    print(IQR)

caling the dataset df2_num contains the numerical features while df2_cat contains the categorical features.

In [165]:
df2_num=df.drop(["monday","tuesday","wednesday","thursday",
                  "friday","saturday","sunday","is_weekend",                  
                  "lifestyle","entertainment","bus",
                  "socmed","tech","world"],axis=1)

In [166]:
df2_cat=df[["monday","tuesday","wednesday","thursday",
             "friday","saturday","sunday","is_weekend",            
             "lifestyle","entertainment","bus",
                  "socmed","tech","world"]]

rop the target variable from df2_num

In [167]:
df2_num = df2_num.drop(['shares','id'],axis=1)
#I think we should be dropping id col as well. Doesn't make sense to transform

In [None]:
df2_num.columns

Finding negative values

egcols=df2_num.columns[(df2_num<=0).any()]<br>
egcols

onverting negative values to positive values

or i in negcols:

In [None]:
 #   m=df2_num[i].min()
  #  name=i +'_new'
   # df2_num[name]=((df2_num[i]+1)-m)

f2_num.columns

or i in negcols:

In [None]:
 #   df2_num.drop(i,axis=1,inplace=True)

egcols=df2_num.columns[(df2_num<=0).any()]<br>
egcols

In [168]:
pt=preprocessing.PowerTransformer(method='yeo-johnson',standardize=False)
df2_num_add=pt.fit_transform(df2_num)
df2_num_add=(pd.DataFrame(df2_num_add,columns=df2_num.columns))
df2_num_add.shape

(38463, 42)

Treating outliers by capping values to a predefined range

In [None]:
for col in df2_num_add.columns:
    percentiles = df2_num_add[col].quantile([0.01,0.99]).values
    df2_num_add[col][df2_num_add[col] <= percentiles[0]] = percentiles[0]
    df2_num_add[col][df2_num_add[col] >= percentiles[1]] = percentiles[1]

In [None]:
num_cols = df2_num_add.select_dtypes(['int64','float64']).columns

In [None]:
for column in num_cols:    
    q1 = df2_num_add[column].quantile(0.25)    # First Quartile
    q3 = df2_num_add[column].quantile(0.75)    # Third Quartile
    IQR = q3 - q1                            # Inter Quartile Range
    llimit = q1 - 1.5*IQR                       # Lower Limit
    ulimit = q3 + 1.5*IQR                        # Upper Limit
    outliers = df2_num_add[(df2_num_add[column] < llimit) | (df2_num_add[column] > ulimit)]
    print('Number of outliers in "' + column + '" : ' + str(len(outliers)))
    print(llimit)
    print(ulimit)
    print(IQR)

creating subplots for transformation

In [None]:
num_cols = df2_num_add.select_dtypes(['int64','float64']).columns
num_plots = len(num_cols)
num_rows = math.ceil(num_plots / 13)  # You can adjust the number of columns as per your preference
num_cols_subplot = 5

In [None]:
fig, axes = plt.subplots(num_rows, num_cols_subplot, figsize=(15, 5 * num_rows))
axes = axes.flatten()

oxplot transformation

In [None]:
for i in range(len(num_cols)):
    sns.boxplot(df2_num_add[num_cols[i]])
    plt.show()

In [None]:
df2_num_add.columns

In [None]:
df2_cat.columns

Create a 'top_data_channel' column based on the data_channel columns

In [None]:
df2_cat['top_data_channel'] = df2_cat[['lifestyle', 'entertainment',
                             'bus', 'socmed',
                             'tech', 'world']].idxmax(axis=1)
#What are we using this for? Some entries in the df have 0 for all channel cols.. And will return lifestlye bc it is the first
# occurance of the max, which happens to be 0

Print the unique values in the 'top_data_channel' column

In [None]:
unique_top_data_channels = df2_cat['top_data_channel'].unique()
print(f"Unique values in 'top_data_channel': {unique_top_data_channels}\n")

Define a function to extract the last word from a string

In [None]:
def extract_last_word(channel):
    words = channel.split('_')
    return words[-1]

Apply the function to the 'top_data_channel' column and create a new 'top_data_channel_last_word' column

In [None]:
df2_cat['top_data_channel_last_word'] = df2_cat['top_data_channel'].apply(extract_last_word)

Print the unique values in the 'top_data_channel_last_word' column

In [None]:
unique_last_words = df2_cat['top_data_channel_last_word'].unique()
print(f"Unique last words in 'top_data_channel_last_word': {unique_last_words}\n")
df2_cat = df2_cat.drop('top_data_channel',axis=1)

In [None]:
df_final=pd.concat([df2_num_add,df2_cat],axis=1)

In [None]:
df_final.shape

In [None]:
df_final['popularity'] = df['shares'].apply(lambda x: 0 if x <1400 else 1)

In [None]:
df_final.isnull().sum()

In [None]:
df_final=df_final.dropna()

In [None]:
df_final.columns
print(df_final['popularity'])
df_final.shape

In [171]:
df_RF = df.drop('id',axis =1)
X = df_RF.drop('shares', axis =1)
y = df_RF['shares'].values
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20, random_state=2)

rf = RandomForestRegressor(n_estimators=100,
                           max_depth=15,
                           min_samples_leaf=df.shape[1],
                           max_features=0.25,
                           n_jobs=-1)

In [172]:
print("---Training Begin---")
start_time = datetime.now()
rf.fit(X_train, y_train)
print("---Training Done---")
print("Execution Time: ", datetime.now() - start_time)

preds = rf.predict(X_test)
warnings.filterwarnings("ignore")

print('Mean Absolute Error (MAE):', mean_absolute_error(y_test, preds))
print('Mean Squared Error (MSE):', mean_squared_error(y_test, preds))
print('Root Mean Squared Error (RMSE):', np.sqrt(mean_squared_error(y_test, preds)))
mape = np.mean(np.abs((y_test - preds) / np.abs(y_test)))
print('Mean Absolute Percentage Error (MAPE):', round(mape * 100, 2))
print('Accuracy:', round(100*(1 - mape), 2))

---Training Begin---


---Training Done---
Execution Time:  0:00:35.683597
Mean Absolute Error (MAE): 2398.056772592973
Mean Squared Error (MSE): 102181148.3796637
Root Mean Squared Error (RMSE): 10108.469141252977
Mean Absolute Percentage Error (MAPE): 84.86
Accuracy: 15.14
