### Debugging The Pipelines vs Standard Code For The Text Data

In [242]:
#Import Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Importing Dependencies
import pandas as pd
import numpy as np
from numpy import nan

import csv
import nltk
import pickle

In [243]:
#Pre-Processing + Custom Class Integration
from sklearn.preprocessing import StandardScaler
from sklearn.base import TransformerMixin, BaseEstimator


#Model Imports
from sklearn.linear_model import LinearRegression, ElasticNetCV, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor

#Model Ensembling (Bagging)
from sklearn.ensemble import BaggingRegressor

#Model Metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error


#GridSearch
from sklearn.model_selection import GridSearchCV

#Scipy Integration for Sparse Matrices
from scipy import sparse

#Additional Feature Engineering - NLP Text Data Import
from sklearn.feature_extraction.text import TfidfVectorizer

#PipeLine Imports
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline


#Multi-threaded Processing
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone joblib package; fall back to the vendored copy only on
# old environments where the standalone package is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [244]:
# Allow up to 500 columns to be rendered when a wide frame is displayed.
pd.options.display.max_columns = 500

In [245]:
class TFID_Vectorizer(BaseEstimator,TransformerMixin):
    """Pipeline step that replaces the 'Article_Text' column with TF-IDF features.

    ``fit`` learns a ``TfidfVectorizer`` (English stop words) from
    ``X['Article_Text']``. ``transform`` returns a DataFrame whose leading
    columns are the dense TF-IDF features (labelled 0..k-1), followed by every
    other column of ``X``.

    Parameters
    ----------
    X : ignored
        Accepted only so existing calls such as ``TFID_Vectorizer(df, ...)``
        keep working. The data is never read or stored by the constructor.
    ngram_range, min_df, max_df, max_features
        Forwarded to ``TfidfVectorizer``. Passing ``None`` for ``ngram_range``,
        ``min_df`` or ``max_df`` selects (1, 1), 1 and 1.0 respectively, instead
        of handing ``None`` to the vectorizer.
    """

    # Name of the free-text column that gets vectorized.
    _TEXT_COLUMN = 'Article_Text'

    def __init__(self, X = None, ngram_range = None, min_df = None, max_df = None,
                 max_features= None):
        # Every __init__ argument needs a matching attribute so that
        # BaseEstimator.get_params()/clone() can read it back. The data itself
        # is deliberately not kept, so the estimator stays light to copy.
        self.X = None
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_df = max_df
        self.max_features = max_features
        # Kept for callers that read `.tvec` before fitting; fit() rebuilds it.
        self.tvec = self._build_vectorizer()

    def _build_vectorizer(self):
        """Create an unfitted TfidfVectorizer from the current hyper-parameters."""
        return TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 1) if self.ngram_range is None else self.ngram_range,
            min_df=1 if self.min_df is None else self.min_df,
            max_df=1.0 if self.max_df is None else self.max_df,
            max_features=self.max_features,
        )

    def fit(self, X, *args):
        """Learn the TF-IDF vocabulary and weights from ``X['Article_Text']``.

        Raises KeyError if ``X`` has no 'Article_Text' column. Returns ``self``.
        """
        # Rebuild here so values changed through set_params() (for example by
        # GridSearchCV) are actually used rather than the ones seen at __init__.
        self.tvec = self._build_vectorizer()
        self.tvec.fit(X[self._TEXT_COLUMN])
        return self


    def transform(self, X, *args):
        """Return the TF-IDF features of the text column joined to the other columns.

        The sparse TF-IDF matrix is converted to a dense array, so memory use
        grows with rows x vocabulary size.
        """
        text = X[self._TEXT_COLUMN]
        # Index.difference attempts to sort its result, so the pass-through
        # columns may come out in sorted rather than original order.
        passthrough = X[X.columns.difference([self._TEXT_COLUMN])]

        # Reuse X's index: concat(axis=1) aligns on index labels, so a fresh
        # 0..n-1 index would misalign rows whenever X's index was not reset.
        tfidf = pd.DataFrame(self.tvec.transform(text).toarray(), index=X.index)
        return pd.concat([tfidf, passthrough], axis = 1)

In [246]:
# NOTE(review): `Standard_Scaler` is neither defined nor imported in the visible part of
# this notebook, and `cleaned_df` is only loaded in a later cell. This cell appears to rely
# on leftover kernel state -- confirm it survives Restart Kernel -> Run All.
New_Scaler = Standard_Scaler(cleaned_df)
# NOTE(review): an ngram_range starting at 0 looks unintended. `TFID` is re-created with
# (1,4) further down before it is placed in the pipeline, so this instance is never fitted here.
TFID = TFID_Vectorizer(cleaned_df, (0,4), 0.01, 0.5, None)

------------------------------------------------------------------------------------------------------------------------------------------------

## 1. Testing That The TVEC + StandardScaler Give The Same Scores As The Manual Approach (To Do)

In [247]:
# Load the cleaned dataset, drop the listed columns, then discard every row
# that still has a missing value.
cleaned_df = (
    pd.read_pickle('Ignore/cleaned_data.pkl')
    .drop(columns=['Url', 'Published_Date', 'Setences_Text', 'Meta_Description', 'Title_Text'])
    .dropna(axis=0)
)

In [248]:
#1 Separate The Target From The Features And Create The Scaler
# pop() removes 'Total_Shares' from cleaned_df in place; X is the same object as cleaned_df.
y = cleaned_df.pop('Total_Shares')
X = cleaned_df
scaler = StandardScaler()

In [249]:
#2 Hold Out 20% Of The Rows For Testing (Fixed Seed So The Split Is Repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3, test_size=0.2)

In [250]:
X_train.head(3)

Unnamed: 0,Evergreen_Score,Word_Count,num_linking_domains,Article_Text,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']"
2334,0.18,864,0.0,"""Am I Wasting Money on SEO Copywriting?""\n\nWh...",3649,1,0,0,3,1,34,594,46.3,13.0,13.08,14.6,15.2,56,19,1,75416,3649,4.838496,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
12357,0.17,829,2.0,Italian restaurant chain Carluccio’s has manag...,4749,1,0,0,3,1,43,828,58.35,12.5,14.3,12.7,15.7,101,19,1,54594,4749,8.698758,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10744,0.0,87,0.0,"In today's episode, John Di Lemme Reveals the ...",511,1,0,0,3,0,4,86,53.21,16.5,19.06,0.0,23.5,132,516,1,39372,511,1.297877,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [251]:
#3 Give Every Split A Fresh 0..n-1 Index (Old Labels Are Discarded)
# The TF-IDF frames built later carry a default index, so the column-wise
# concat only lines up row for row once these indexes are reset.
for _split in (X_train, X_test, y_train, y_test):
    _split.reset_index(drop=True, inplace=True)
del _split

In [252]:
X_train.head(3)

Unnamed: 0,Evergreen_Score,Word_Count,num_linking_domains,Article_Text,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']"
0,0.18,864,0.0,"""Am I Wasting Money on SEO Copywriting?""\n\nWh...",3649,1,0,0,3,1,34,594,46.3,13.0,13.08,14.6,15.2,56,19,1,75416,3649,4.838496,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.17,829,2.0,Italian restaurant chain Carluccio’s has manag...,4749,1,0,0,3,1,43,828,58.35,12.5,14.3,12.7,15.7,101,19,1,54594,4749,8.698758,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0,87,0.0,"In today's episode, John Di Lemme Reveals the ...",511,1,0,0,3,0,4,86,53.21,16.5,19.06,0.0,23.5,132,516,1,39372,511,1.297877,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [253]:
#3b Configure The TF-IDF Vectorizer For The Article Text
# Terms are 1- to 4-grams that occur in at least 3 documents; at most 1000 features are kept.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 4),
    min_df=3,
    max_df=1.0,
    max_features=1000,
)


In [254]:
# Learn the vocabulary and IDF weights from the training text only.
fitted_vectorizer = vectorizer.fit(X_train.loc[:, 'Article_Text'])

In [255]:
# Sparse TF-IDF matrix for the training articles.
X_train_Article_Text = fitted_vectorizer.transform(X_train.loc[:, 'Article_Text'])

In [256]:
X_train_Article_Text

<12740x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 1176517 stored elements in Compressed Sparse Row format>

In [257]:
# Sparse TF-IDF matrix for the test articles, using the vocabulary learned on the training text.
X_test_Article_Text = fitted_vectorizer.transform(X_test.loc[:, 'Article_Text'])

In [258]:
X_test_Article_Text

<3186x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 297425 stored elements in Compressed Sparse Row format>

In [259]:
# Storing Both Items To Pickle Objects

In [260]:
# f = open('X_train_Article_Text.pkl', 'wb')
# pickle.dump(X_train_Article_Text, f)
# f.close()

In [261]:
# f = open('X_test_Article_Text.pkl', 'wb')
# pickle.dump(X_test_Article_Text, f)
# f.close()

--------------------------------------------------------------------------------------------------------------------------------

In [262]:
# Drop The Raw Text Column Now That It Has Been Vectorized - Potentially Revert This :)
# Rebinding the result of drop() instead of using inplace=True avoids mutating
# the frames returned by train_test_split in place, which is what raised the
# SettingWithCopyWarning recorded under this cell.
X_train = X_train.drop(columns=['Article_Text'])
X_test = X_test.drop(columns=['Article_Text'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [263]:
# Dense DataFrame copies of the train / test TF-IDF matrices (default 0..n-1 index and column labels).
test_one, test_two = (
    pd.DataFrame(tfidf_matrix.toarray())
    for tfidf_matrix in (X_train_Article_Text, X_test_Article_Text)
)

In [264]:
# Append the TF-IDF columns to the right of the remaining feature columns (rows are matched on index).
X_train = pd.concat((X_train, test_one), axis='columns')
X_test = pd.concat((X_test, test_two), axis='columns')

In [265]:
X_train.shape

(12740, 1060)

In [266]:
X_test.shape

(3186, 1060)

In [267]:
#4 Standardize Every Column: Statistics Come From The Training Split And Are Reused For The Test Split.
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [268]:
# Lasso with a cross-validated alpha, searched over a 50-value path (seeded).
Lasso_Test = LassoCV(random_state=3, n_alphas=50)

In [269]:
# Fit on the standardized training features; the fitted estimator is the cell's output.
Lasso_Test.fit(X=X_train_std, y=y_train)



LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=50, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=3, selection='cyclic',
        tol=0.0001, verbose=False)

In [270]:
Lasso_Test.score(X_train_std, y_train)

0.2754593818819344

In [271]:
Lasso_Test.score(X_test_std, y_test)

0.24910368045352982

In [272]:
# Mean cross-validated R^2 on the training split.
# cv is pinned to 3 folds: the LassoCV repr above shows cv='warn', i.e. a
# scikit-learn release where the implicit default was 3 folds. The default
# became 5 in scikit-learn 0.22, so leaving it implicit would change this number.
np.mean(cross_val_score(Lasso_Test, X_train_std, y_train, cv=3))



0.3776949522969966

## 2. Testing The Pipeline Vs The Manual Approach Above

In [273]:
# Same TF-IDF settings as the manual run above, so the two scores can be compared directly.
TFID = TFID_Vectorizer(
    cleaned_df,
    ngram_range=(1, 4),
    min_df=3,
    max_df=1.0,
    max_features=1000,
)

In [274]:
# Reload the cleaned dataset from disk, drop the listed columns, then discard
# every row that still has a missing value.
cleaned_df = (
    pd.read_pickle('Ignore/cleaned_data.pkl')
    .drop(columns=['Url', 'Published_Date', 'Setences_Text', 'Meta_Description', 'Title_Text'])
    .dropna(axis=0)
)

In [275]:
# pop() removes 'Total_Shares' from cleaned_df in place; X is the same object as cleaned_df.
y = cleaned_df.pop('Total_Shares')
X = cleaned_df

In [276]:
# Same 80/20 split and seed as the manual run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3, test_size=0.2)

In [277]:
# Give every split a fresh 0..n-1 index; the old labels are discarded.
for _split in (X_train, X_test, y_train, y_test):
    _split.reset_index(drop=True, inplace=True)
del _split

In [278]:
X_train.head(3)

Unnamed: 0,Evergreen_Score,Word_Count,num_linking_domains,Article_Text,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']"
0,0.18,864,0.0,"""Am I Wasting Money on SEO Copywriting?""\n\nWh...",3649,1,0,0,3,1,34,594,46.3,13.0,13.08,14.6,15.2,56,19,1,75416,3649,4.838496,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.17,829,2.0,Italian restaurant chain Carluccio’s has manag...,4749,1,0,0,3,1,43,828,58.35,12.5,14.3,12.7,15.7,101,19,1,54594,4749,8.698758,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.0,87,0.0,"In today's episode, John Di Lemme Reveals the ...",511,1,0,0,3,0,4,86,53.21,16.5,19.06,0.0,23.5,132,516,1,39372,511,1.297877,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [279]:
# Chain the same three stages as the manual run: TF-IDF expansion, scaling, LassoCV.
pipe_lasso = Pipeline(steps=[
    ('TFID_vec', TFID),
    ('StandardScaler', StandardScaler()),
    ('Model', LassoCV(n_alphas=50, random_state=3)),
])

In [280]:
# Fit the whole pipeline on the training split and report its training R^2
# (Pipeline.fit returns the pipeline itself, so the calls can be chained).
pipe_lasso.fit(X_train, y_train).score(X_train, y_train)



0.2754593818394203

In [281]:
pipe_lasso.score(X_test, y_test)

0.24910368042593178

------------------------------------------------------------------------------------------------------------------------------------------------