In [100]:
#Import Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Importing Dependencies
import pandas as pd
import numpy as np
from numpy import nan

import csv
import nltk
import pickle

In [101]:
#Pre-Processing + Custom Class Integration
from sklearn.preprocessing import StandardScaler
from sklearn.base import TransformerMixin, BaseEstimator


#Model Imports
from sklearn.linear_model import LinearRegression, ElasticNetCV, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor

#Model Ensembling (Bagging)
from sklearn.ensemble import BaggingRegressor

#Model Metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error


#GridSearch
from sklearn.model_selection import GridSearchCV

#Scipy Integration for Sparse Matrices
from scipy import sparse

#Additional Feature Engineering - NLP Text Data Import
from sklearn.feature_extraction.text import TfidfVectorizer

#PipeLine Imports
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline


#Multi-threaded Processing
# NOTE(review): `sklearn.externals.joblib` is a vendored copy that newer
# scikit-learn releases no longer ship (believed removed in 0.23) - confirm
# against the pinned version; on newer installs the equivalent is `import joblib`.
from sklearn.externals import joblib

In [102]:
# Let wide frames render up to 500 columns instead of being elided.
pd.options.display.max_columns = 500

In [103]:
# Load the pre-cleaned article data and keep only rows with no missing values.
cleaned_df = pd.read_pickle('../Ignore/cleaned_data.pkl').dropna(axis=0)

In [104]:
# cleaned_df.drop(axis=1, columns = ['Url', 'Published_Date', 'Setences_Text', 'Meta_Description', 'Title_Text' ], inplace=True)

In [105]:
# Cleaning The Web_Speed_Data
web_speed_data = pd.read_csv('../Google_Page_Speed_Insights/Data/pagespeed-results-merged.csv')

# The two timing columns are text carrying an 's' marker (the original code
# strips it before calling float). The vectorised string accessor does the same
# stripping but leaves missing cells as NaN, where a per-row
# `x.replace(...)` would raise on a float NaN.
for timing_column in ['First Contentful Paint', 'First Interactive']:
    web_speed_data[timing_column] = (
        web_speed_data[timing_column].str.replace('s', '').astype(float)
    )

# Positional rename: relies on the CSV holding exactly these three columns in
# this order (URL, first contentful paint, first interactive).
web_speed_data.columns = ['Url', 'First_Contentful_Paint', 'First_Interactive']

In [93]:
# (rows, columns) of the cleaned article data after incomplete rows were dropped.
cleaned_df.shape

(15743, 67)

In [92]:
# (rows, columns) of the page-speed table after its columns were renamed.
web_speed_data.shape

(16115, 3)

In [94]:
# NOTE(review): within the cells visible here, `df` is first assigned by the
# merge in the next cell, so on a fresh top-to-bottom run this line raises
# NameError. The recorded output comes from an earlier, out-of-order execution.
df.shape

(14469, 62)

In [106]:
# Inner-join article features with page-speed timings on the shared Url key;
# articles without a page-speed row (and vice versa) are dropped.
df = cleaned_df.merge(
    right=web_speed_data,
    how='inner',
    left_on='Url',
    right_on='Url',
)

In [49]:
# Remove the free-text / identifier columns so only model-ready features remain.
df = df.drop(
    columns=[
        'Article_Text',
        'Url',
        'Published_Date',
        'Setences_Text',
        'Meta_Description',
        'Title_Text',
    ]
)

In [50]:
# Replace the share counts with log(1 + shares), element by element.
# NOTE: running this cell twice logs the target twice.
df['Total_Shares'] = df['Total_Shares'].map(np.log1p)

In [51]:
# Separate target and features without touching `df`.
# The previous `X = df; y = df.pop(...)` made X an alias of df, so the pop
# silently removed the target from df as well and the cell failed with a
# KeyError when re-run. Selecting/dropping returns new objects instead.
y = df['Total_Shares']
X = df.drop(columns=['Total_Shares'])

In [52]:
# Standardise the features, then fit a cross-validated Lasso that picks its
# penalty from 50 evenly spaced alphas between 0.01 and 10.
lasso_steps = [
    ('sc1', StandardScaler()),
    ('Model', LassoCV(alphas=np.linspace(0.01, 10, 50), n_jobs=1)),
]
pipe_lasso = Pipeline(lasso_steps)

In [53]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [56]:
# Fit the Lasso pipeline, then report its score on the data it was trained on
# (`fit` returns the pipeline itself, so the calls can be chained).
pipe_lasso.fit(X_train, y_train).score(X_train, y_train)



0.7009549817602769

In [57]:
# Score of the fitted Lasso pipeline on the held-out test split.
pipe_lasso.score(X_test, y_test)

0.686617417856081

-----------------------------------------------------------------------------------------------------------------------------

In [58]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

In [59]:
# 100-tree random forest. A fixed random_state (same seed as the train/test
# split) makes the bootstrap samples and feature draws repeatable, so the
# scores below can be reproduced on a re-run.
RFR = RandomForestRegressor(n_estimators=100, random_state=3)

In [60]:
# Fit the forest and report its score on the training split
# (`fit` returns the estimator, so the calls can be chained).
RFR.fit(X_train, y_train).score(X_train, y_train)

0.9806500609285872

In [61]:
# Score of the fitted random forest on the held-out test split.
RFR.score(X_test, y_test)

0.8576716523470397

In [62]:
# Average of the five cross-validation fold scores on the training split.
cross_val_score(RFR, X_train, y_train, cv=5).mean()

0.8559860571616762

In [64]:
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor

In [65]:
# AdaBoost over ten 100-tree random forests.
# The base learner is passed positionally because its keyword was renamed
# between scikit-learn releases (base_estimator -> estimator); the first
# positional slot is the base learner in both spellings.
# random_state makes the boosting (and the forests it seeds) repeatable.
ada_regressor = AdaBoostRegressor(
    RandomForestRegressor(n_estimators=100),
    n_estimators=10,
    loss='linear',
    learning_rate=0.2,
    random_state=3,
)

In [66]:
# Fit the boosted ensemble on the training split; `fit` returns the estimator,
# whose repr is what the cell displays.
ada_regressor.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True,
                                                       criterion='mse',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       n_estimators=100,
                                                       n_jobs=None,
                                                       oob_score=False,
                       

In [67]:
# Score of the fitted AdaBoost ensemble on the data it was trained on.
ada_regressor.score(X_train, y_train)

0.9797394758496945

In [68]:
# Score of the fitted AdaBoost ensemble on the held-out test split.
ada_regressor.score(X_test, y_test)

0.8582622482337301

In [69]:
# Average of the five cross-validation fold scores on the training split.
cross_val_score(ada_regressor, X_train, y_train, cv=5).mean()

0.8568573273340506

In [70]:
def _innermost_estimator(fitted_model):
    """Return the estimator that actually holds the learned parameters.

    Unwraps a search object (via ``best_estimator_``) and then a pipeline
    (via the last entry of ``steps``); any other model is returned unchanged.
    """
    inner = getattr(fitted_model, 'best_estimator_', fitted_model)
    steps = getattr(inner, 'steps', None)
    if steps:
        inner = steps[-1][1]
    return inner


def _learned_weights(fitted_model, feature_names):
    """Describe the model's coefficients or feature importances, if it has any.

    The model itself is inspected first and its innermost estimator second, so
    plain estimators report the same dict as before while pipelines and search
    objects no longer raise. Returns a placeholder string when neither
    attribute is available.
    """
    for candidate in (fitted_model, _innermost_estimator(fitted_model)):
        coefficients = getattr(candidate, 'coef_', None)
        if coefficients is not None:
            return dict(coefficient_values=coefficients, indexes=feature_names)
        importances = getattr(candidate, 'feature_importances_', None)
        if importances is not None:
            return dict(feature_importance_values=importances, indexes=feature_names)
    return 'No Coefficients / Feature Importances Available'


def _tuning_summary(fitted_model):
    """Return grid-search best params, else a fitted ``alpha_``, else a placeholder."""
    best_params = getattr(fitted_model, 'best_params_', None)
    if best_params is not None:
        return best_params
    for candidate in (fitted_model, _innermost_estimator(fitted_model)):
        alpha = getattr(candidate, 'alpha_', None)
        if alpha is not None:
            return alpha
    return 'No Grid Search Used / Regularization Used'


def model_results_new(data, data_type, model, X_train_num, X_test_num, y_train, y_test, cv=5):
    """Fit ``model`` and summarise the run as a one-row DataFrame.

    Parameters
    ----------
    data : int
        ``1`` labels the run 'Text + Numerical Data'; any other value labels
        it 'Numerical'.
    data_type : str
        ``'logged'`` labels the run 'Logged'; anything else 'Non-Logged Data'.
    model : estimator
        Object exposing ``fit``, ``score`` and ``predict``. It is fitted in
        place on the training data.
    X_train_num, X_test_num :
        Training / test features. When the training features expose
        ``.columns`` those labels are stored next to the learned weights;
        otherwise ``None`` is stored.
    y_train, y_test :
        Training / test targets.
    cv : int, default 5
        Number of folds passed to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns Data_Used, Data_Type, Model_Name,
        Model_Training_Score, Model_Test_Score, Mean_Squared_Error,
        Model_Cross_Val_Score, Coefficients/Feature_Importances,
        Grid_Search_Best_Params / Regularization_Params and Notes.
    """
    fitted_model = model.fit(X_train_num, y_train)
    predictions = fitted_model.predict(X_test_num)

    # Inputs such as numpy arrays or sparse matrices carry no column labels.
    # NOTE(review): for pipelines whose steps change the feature set, these
    # labels may not line up with the final estimator's weights - verify.
    feature_names = getattr(X_train_num, 'columns', None)

    summary = {
        'Data_Used': 'Text + Numerical Data' if data == 1 else 'Numerical',
        'Data_Type': 'Logged' if data_type == 'logged' else 'Non-Logged Data',
        'Model_Name': str(fitted_model),
        'Model_Training_Score': fitted_model.score(X_train_num, y_train),
        'Model_Test_Score': fitted_model.score(X_test_num, y_test),
        'Mean_Squared_Error': mean_squared_error(y_test, predictions),
        'Model_Cross_Val_Score': np.mean(
            cross_val_score(fitted_model, X_train_num, y_train, cv=cv)
        ),
        'Coefficients/Feature_Importances': _learned_weights(fitted_model, feature_names),
        'Grid_Search_Best_Params / Regularization_Params': _tuning_summary(fitted_model),
        'Notes': str(fitted_model),
    }

    # Wrap every value in a one-element list so dict/array values occupy a
    # single cell instead of being expanded by the DataFrame constructor.
    return pd.DataFrame({column: [value] for column, value in summary.items()})

In [73]:
# Summarise the AdaBoost run: data=0 is labelled 'Numerical' and 'logged' is
# labelled 'Logged'. Note the helper calls fit again, so ada_regressor is refitted here.
new_df = model_results_new(0, 'logged', ada_regressor, X_train, X_test, y_train, y_test)

In [74]:
# Show the one-row summary produced for the AdaBoost run.
new_df

Unnamed: 0,Data_Used,Data_Type,Model_Name,Model_Training_Score,Model_Test_Score,Mean_Squared_Error,Model_Cross_Val_Score,Coefficients/Feature_Importances,Grid_Search_Best_Params / Regularization_Params,Notes
0,Numerical,Logged,AdaBoostRegressor(base_estimator=RandomForestR...,0.979795,0.85963,0.439756,0.856803,{'feature_importance_values': [0.5520762574707...,No Grid Search Used / Regularization Used,AdaBoostRegressor(base_estimator=RandomForestR...


In [75]:
import pickle

In [85]:
# Load the running table of model results. The context manager closes the file
# handle, which the bare open(...) call left open.
# NOTE: unpickling can execute arbitrary code - only load pickles produced by
# this project.
with open('../results.pkl', 'rb') as results_file:
    results = pickle.load(results_file)

In [87]:
# Append this run's summary row. ignore_index renumbers the rows so the new
# row does not duplicate an existing index label (new_df is always labelled 0).
# NOTE: re-running this cell appends the same row again.
results = pd.concat([results, new_df], ignore_index=True)

In [89]:
# Write the updated results table back. The context manager guarantees the
# file is flushed and closed, which the bare open(...) call did not.
with open('../results.pkl', 'wb') as results_file:
    pickle.dump(results, results_file)

In [108]:
# Persist the full merged frame (article features + page-speed timings).
# `df` cannot be dumped directly: run top-to-bottom, the modelling cells above
# have already dropped its text columns and popped its target, so the file
# would not contain the merged data its name promises. Rebuild the merge from
# its two inputs, which no later cell modifies.
merged_web_page_data = pd.merge(
    left=cleaned_df, right=web_speed_data, left_on='Url', right_on='Url'
)
with open('merged_web_page_data.pkl', 'wb') as merged_file:
    pickle.dump(merged_web_page_data, merged_file)

In [109]:
# Round-trip check: read the file back and preview the first rows rather than
# rendering the whole frame. The context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code - only load pickles produced by
# this project.
with open('merged_web_page_data.pkl', 'rb') as merged_file:
    reloaded_merged = pickle.load(merged_file)
reloaded_merged.head(10)

Unnamed: 0,Url,Evergreen_Score,Total_Shares,Published_Date,Word_Count,num_linking_domains,Article_Text,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Setences_Text,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Text,Title_Tag_Length,Meta_Description,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']",First_Contentful_Paint,First_Interactive
0,https://gatheringdreams.com/affiliate-marketin...,1.54,8021,2018-08-23,4767,1.0,"Some of the links below are affiliate links, s...",27301,1,0,0,42,1,"['Some of the links below are affiliate links,...",261,4810,46.98,16.8,17.43,15.1,21.2,Affiliate Marketing for Dummies: A Smart Guide...,79,A step-by-step affiliate marketing for dummies...,151,1,186434,27301,14.643788,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.5,16.1
1,https://itsclaudiag.com/2018/09/how-to-use-aff...,1.44,2569,2018-09-16,1181,2.0,Would you like to make money while you sleep?\...,6519,1,0,0,24,1,['Would you like to make money while you sleep...,65,1154,53.99,14.2,15.29,13.2,17.6,HOW TO USE AFFILIATE MARKETING TO MAKE MONEY B...,69,Do you want to make money while you sleep? Her...,150,1,96481,6519,6.756771,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.2,9.2
2,https://www.entrepreneur.com/article/319017,5.68,844,2018-09-12,996,12.0,Learn three simple strategies to help you stac...,5916,1,0,0,12,1,['Learn three simple strategies to help you st...,65,1001,46.03,15.1,16.04,14.5,18.6,How to Build a Profitable Business Online by S...,60,Learn three simple strategies to help you stac...,122,1,85973,5916,6.881230,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.2,13.0
3,https://onlinemediamasters.com/how-to-make-mon...,1.30,775,2018-09-22,8996,3.0,I never thought this would happen to me.\n\nIn...,39589,1,0,0,113,1,"['I never thought this would happen to me.', '...",353,6529,50.70,13.3,13.01,13.5,17.0,How To Make Money With Affiliate Marketing In ...,63,I made $150k last year in affiliate marketing ...,114,1,421899,39589,9.383525,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.5,18.3
4,https://www.finsavvypanda.com/how-to-make-mone...,1.53,614,2018-10-09,2226,1.0,This post may contain affiliate links. Please ...,12657,1,0,0,28,1,"['This post may contain affiliate links.', 'Pl...",140,2227,59.06,12.2,12.84,13.1,15.2,How To Make Money With Affiliate Marketing For...,73,No Meta_Description,19,1,149458,12657,8.468600,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.7,21.9
5,https://heartmylife.com/affiliate-marketing-ne...,0.18,538,2018-08-27,1489,0.0,Affiliate marketing is one of the easiest ways...,9050,1,0,0,10,1,['Affiliate marketing is one of the easiest wa...,82,1481,44.11,15.9,16.42,14.7,20.5,The 5 BEST Affiliate Marketing Networks for Ne...,72,Affiliate marketing networks offer bloggers ac...,159,1,80949,9050,11.179879,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.7,9.6
6,https://blogambitious.com/monetize-with-affili...,2.05,533,2018-09-20,698,0.0,Making a full-time income with affiliate marke...,4741,1,0,0,10,1,['Making a full-time income with affiliate mar...,48,811,62.72,10.8,12.83,13.0,14.2,8 Creative Ways to Monetize a Blog with Affili...,59,No Meta_Description,19,1,72053,4741,6.579879,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,4.9,18.5
7,https://juleskalpauli.com/get-paid-pinterest-a...,2.43,514,2018-09-22,2364,0.0,Okay here is the deal my friend… I am simply a...,13014,1,0,0,87,1,['Okay here is the deal my friend… I am simply...,125,2326,42.11,18.7,19.70,14.9,23.2,You Too Can Crush it with Pinterest Affiliate ...,76,Ever Considered that You Too Can Rock Pinteres...,149,1,247706,13014,5.253809,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.2,48.8
8,https://wakeupandblog.co.uk/blog/5-affiliate-m...,2.07,486,2018-10-01,1188,0.0,Privacy & Cookies: This site uses cookies. By ...,176,1,0,0,15,0,"['Privacy & Cookies: This site uses cookies.',...",2,29,56.76,8.9,11.32,0.0,9.7,5 affiliate marketing programs for bloggers | ...,62,If it's time you started earning money from yo...,154,1,92646,176,0.189970,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3.0,16.3
9,https://blog.hubspot.com/marketing/multi-step-...,2.93,389,2018-10-12,1629,15.0,One time I tried signing up for a spin class a...,9606,1,0,0,20,1,['One time I tried signing up for a spin class...,83,1599,50.40,13.5,13.57,13.7,16.8,Why You Should Create Multi-Step Forms and How...,76,Learn how multi-step forms can help you increa...,137,1,148794,9606,6.455905,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3.9,28.5


In [107]:
# NOTE(review): the recorded (14469, 69) is the shape right after the merge.
# Run top-to-bottom, the modelling cells above have already removed columns
# from `df`, so this cell would report fewer columns at this position.
df.shape

(14469, 69)