# Module Imports

In [1]:
#Import Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Importing Dependencies
import pandas as pd
import numpy as np
from numpy import nan

import csv
import nltk
import pickle

In [2]:
#Pre-Processing + Custom Class Integration
from sklearn.preprocessing import StandardScaler
from sklearn.base import TransformerMixin, BaseEstimator


#Model Imports
from sklearn.linear_model import LinearRegression, ElasticNetCV, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor

#Model Boosting
from sklearn.ensemble import BaggingRegressor

#Model Metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error


#GridSearch
from sklearn.model_selection import GridSearchCV

#Scipy Integration for Sparse Matrixes
from scipy import sparse

#Additional Feature Engineering - NLP Text Data Import
from sklearn.feature_extraction.text import TfidfVectorizer

#PipeLine Imports
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline


#Multi-threaded Processing
from sklearn.externals import joblib


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

In [3]:
pd.set_option('display.max_columns', 500)

In [4]:
cleaned_df = pd.read_pickle('../Ignore/merged_web_page_data.pkl')
cleaned_df.dropna(axis= 0, inplace = True)
cleaned_df.drop(axis=1, columns = ['Url', 'Published_Date', 'Setences_Text', 'Meta_Description', 'Title_Text' ], inplace=True)

In [5]:
X = cleaned_df
y = cleaned_df.pop('Total_Shares')

# Articles:
- https://medium.com/@dobko_m/nlp-text-data-cleaning-and-preprocessing-ea3ffe0406c1

# Tokenizing Text

In [6]:
#1. Importing Spacy Packages
import spacy
from spacy.lang.en import English
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA

In [226]:
#2. Importing Textblob Packages
from textblob import TextBlob

In [227]:
nlp = spacy.load("en_core_web_lg")
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [206]:
text_1 = 'Paris is a nice city'
text_2 = 'London is a large city'

In [207]:
text =  " Big thanks for posting this tune! It's nice to bring out some new music and hope all of you here can enjoy the new album, thank you :) "


In [219]:
nlp(text_1).similarity(nlp(text_2))

0.8950880863365216

In [220]:
nlp(text_1).similarity(nlp(text))

0.7374955765189001

In [None]:
LDA - Latent Dirichlet Allocation

In [17]:
# np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
# np_array
# np_array.shape

In [228]:
def tokenize_and_parse(cell):
    my_doc = nlp(cell)
    text_blob = TextBlob(cell)
    token_list = []
    token_lemma = []
    pos_tagging =[]
    entity_list = {}
    
    for word in my_doc:
        if word.is_stop==False:
            token_list.append(word.text)
            token_lemma.append(word.lemma_)
            pos_tagging.append(word.pos_)
            
        else:
            pass
        
    # Entity Recognition
    for ent in my_doc.ents:
        entity_list[ent.text] = [ent.label_, ent.start_char, ent.end_char]
        
    sentiment_polarity = text_blob.sentiment[0]
    sentiment_subjectivity = text_blob.sentiment[1]
        
    return (token_list, token_lemma, pos_tagging, entity_list, sentiment_polarity, sentiment_subjectivity)

In [229]:
cleaned_df

Unnamed: 0,Evergreen_Score,Word_Count,num_linking_domains,Article_Text,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']",First_Contentful_Paint,First_Interactive
0,1.54,4767,1.0,"Some of the links below are affiliate links, s...",27301,1,0,0,42,1,261,4810,46.98,16.8,17.43,15.1,21.2,79,151,1,186434,27301,14.643788,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.5,16.1
1,1.44,1181,2.0,Would you like to make money while you sleep?\...,6519,1,0,0,24,1,65,1154,53.99,14.2,15.29,13.2,17.6,69,150,1,96481,6519,6.756771,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.2,9.2
2,5.68,996,12.0,Learn three simple strategies to help you stac...,5916,1,0,0,12,1,65,1001,46.03,15.1,16.04,14.5,18.6,60,122,1,85973,5916,6.881230,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.2,13.0
3,1.30,8996,3.0,I never thought this would happen to me.\n\nIn...,39589,1,0,0,113,1,353,6529,50.70,13.3,13.01,13.5,17.0,63,114,1,421899,39589,9.383525,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.5,18.3
4,1.53,2226,1.0,This post may contain affiliate links. Please ...,12657,1,0,0,28,1,140,2227,59.06,12.2,12.84,13.1,15.2,73,19,1,149458,12657,8.468600,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.7,21.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,0.00,454,0.0,Relationships matter more than ever in busines...,2742,1,0,0,17,1,25,447,53.24,12.4,13.74,13.8,16.0,68,19,1,25294,2742,10.840516,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.4,7.2
14465,0.00,82,0.0,This week we are featuring the school website ...,506,1,0,0,3,0,5,82,63.29,8.5,10.46,12.0,11.2,49,371,1,48317,506,1.047250,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,11.7
14466,0.00,565,0.0,Content curation involves searching and gather...,3399,1,0,0,17,1,24,557,33.82,17.8,18.43,17.4,21.3,70,19,1,26073,3399,13.036475,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.3,7.3
14467,0.00,1024,0.0,"When it comes to communication, email is still...",6079,1,0,0,17,1,86,1016,60.45,9.6,10.91,11.8,11.8,77,19,1,29614,6079,20.527453,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.5,7.1


In [230]:
awesome = cleaned_df['Article_Text'].apply(tokenize_and_parse)

In [232]:
awesome.shape

(14469,)

In [54]:
len(awesome[0])

6

In [237]:
def descriptive_stats(dataframe, i):
    print('The first row of your dataframe contains the following:')
    print('Text',len(dataframe[i][0]),'Entries')
    print('Text Lemma',len(dataframe[i][1]),'Entries')
    print('POS Tagging',len(dataframe[i][2]),'Entries')
    print(len(dataframe[0][3]),'Matched Entities')
    print('The Sentiment Polarity Score Is:',round(dataframe[i][4], 3))
    print('The Sentiment Subjectivity Score Is:',round(dataframe[i][5], 3))

In [241]:
descriptive_stats(awesome, 20)

The first row of your dataframe contains the following:
Text 21 Entries
Text Lemma 21 Entries
POS Tagging 21 Entries
90 Matched Entities
The Sentiment Polarity Score Is: 0.167
The Sentiment Subjectivity Score Is: 0.833


## Processing The Additional NLP Data

In [13]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import collections

In [15]:
dicts =  []

for i in range(nlp_data.shape[0]):
    temp_list = []
    
    for key, value in nlp_data[i][3].items():
        temp_list.append(value[0])
    dicts.append((dict(collections.Counter(temp_list))))


In [20]:
cleaned_df['Entities'].shape

(14469,)

In [33]:
v = DictVectorizer(sparse=True)
X = v.fit_transform(cleaned_df['Entities'])
entities = pd.DataFrame(X.toarray(), columns = v.get_feature_names())

In [34]:
entities

Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0
1,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0
2,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0
3,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0
4,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0
14465,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14466,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0
14467,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0


In [24]:
### The Sentiment Polarity Score
sentiment_polarity_score = nlp_data.map(lambda x: x[4])

### The Sentiment Subjectivity Score
subjectivity_score = nlp_data.map(lambda x: x[5])

In [11]:
# Counter For Different Types of Punctuation 
cleaned_df['Punctuation'] = nlp_data.map(lambda x: dict(collections.Counter(x[2])))

0        ([links, affiliate, links, ,, receive, commiss...
1        ([like, money, sleep, ?, \n\n, answer, yes, ,,...
2        ([Learn, simple, strategies, help, stack, reve...
3        ([thought, happen, ., \n\n, 2, years, went, ma...
4        ([post, contain, affiliate, links, ., read, di...
                               ...                        
14464    ([Relationships, matter, business, ,, believe,...
14465    ([week, featuring, school, website, design, Ea...
14466    ([Content, curation, involves, searching, gath...
14467    ([comes, communication, ,, email, prevalent, c...
14468    ([client, story, ., job, tell, ., \n\n, websit...
Name: Article_Text, Length: 14469, dtype: object

In [52]:
punct_vec = DictVectorizer(sparse=True)
X = punct_vec.fit_transform(cleaned_df['Punctuation'])
punctuation = pd.DataFrame(X.toarray(), columns = punct_vec.get_feature_names())

In [53]:
# Concatenate All Of The DataFrames Together
cleaned_df = pd.concat([cleaned_df, entities, punctuation,], axis = 1)

In [54]:
cleaned_df['sentiment_polarity_score'] = nlp_data.map(lambda x: x[4])
cleaned_df['subjectivity_score'] = nlp_data.map(lambda x: x[5])

------------------------------------------------------------------------------------------------------------------------------------

In [57]:
# Dropping Irrelevant Rows + Columns For The DataFrame
cleaned_df.dropna(axis= 0, inplace = True)

In [61]:
cleaned_df.select_dtypes('object')

Unnamed: 0,Entities,Punctuation
0,"{'ORG': 22, 'ORDINAL': 2, 'DATE': 26, 'MONEY':...","{'NOUN': 1006, 'PUNCT': 508, 'VERB': 603, 'SPA..."
1,"{'DATE': 7, 'CARDINAL': 9, 'MONEY': 3, 'ORG': ...","{'VERB': 156, 'NOUN': 250, 'PUNCT': 145, 'SPAC..."
2,"{'CARDINAL': 10, 'DATE': 10, 'PERCENT': 3, 'PE...","{'VERB': 121, 'ADJ': 50, 'NOUN': 222, 'PUNCT':..."
3,"{'DATE': 22, 'MONEY': 29, 'ORG': 124, 'PERCENT...","{'VERB': 779, 'PUNCT': 1202, 'SPACE': 241, 'NU..."
4,"{'PERSON': 3, 'WORK_OF_ART': 3, 'MONEY': 16, '...","{'NOUN': 472, 'VERB': 276, 'PUNCT': 286, 'SPAC..."
...,...,...
14464,"{'ORG': 5, 'CARDINAL': 4, 'GPE': 1, 'LANGUAGE'...","{'NOUN': 112, 'VERB': 42, 'PUNCT': 55, 'PROPN'..."
14465,"{'DATE': 1, 'GPE': 1}","{'NOUN': 28, 'VERB': 10, 'PROPN': 3, 'PUNCT': ..."
14466,"{'ORDINAL': 1, 'DATE': 2, 'PERCENT': 4, 'ORG':...","{'NOUN': 126, 'VERB': 76, 'ADJ': 38, 'PUNCT': ..."
14467,"{'ORG': 9, 'CARDINAL': 5, 'DATE': 10, 'PERCENT...","{'VERB': 118, 'NOUN': 287, 'PUNCT': 132, 'ADJ'..."


In [60]:
cleaned_df

Unnamed: 0,Evergreen_Score,Total_Shares,Word_Count,num_linking_domains,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']",First_Contentful_Paint,First_Interactive,Entities,Punctuation,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X,CARDINAL.1,DATE.1,EVENT.1,FAC.1,GPE.1,LANGUAGE.1,LAW.1,LOC.1,MONEY.1,NORP.1,ORDINAL.1,ORG.1,PERCENT.1,PERSON.1,PRODUCT.1,QUANTITY.1,TIME.1,WORK_OF_ART.1,ADJ.1,ADP.1,ADV.1,AUX.1,CCONJ.1,DET.1,INTJ.1,NOUN.1,NUM.1,PART.1,PRON.1,PROPN.1,PUNCT.1,SPACE.1,SYM.1,VERB.1,X.1,sentiment_polarity_score,subjectivity_score
0,1.54,8021,4767,1.0,27301,1,0,0,42,1,261,4810,46.98,16.8,17.43,15.1,21.2,79,151,1,186434,27301,14.643788,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.5,16.1,"{'ORG': 22, 'ORDINAL': 2, 'DATE': 26, 'MONEY':...","{'NOUN': 1006, 'PUNCT': 508, 'VERB': 603, 'SPA...",9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,0.281762,0.559670
1,1.44,2569,1181,2.0,6519,1,0,0,24,1,65,1154,53.99,14.2,15.29,13.2,17.6,69,150,1,96481,6519,6.756771,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.2,9.2,"{'DATE': 7, 'CARDINAL': 9, 'MONEY': 3, 'ORG': ...","{'VERB': 156, 'NOUN': 250, 'PUNCT': 145, 'SPAC...",9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,0.269613,0.476775
2,5.68,844,996,12.0,5916,1,0,0,12,1,65,1001,46.03,15.1,16.04,14.5,18.6,60,122,1,85973,5916,6.881230,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.2,13.0,"{'CARDINAL': 10, 'DATE': 10, 'PERCENT': 3, 'PE...","{'VERB': 121, 'ADJ': 50, 'NOUN': 222, 'PUNCT':...",10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,0.175732,0.485097
3,1.30,775,8996,3.0,39589,1,0,0,113,1,353,6529,50.70,13.3,13.01,13.5,17.0,63,114,1,421899,39589,9.383525,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.5,18.3,"{'DATE': 22, 'MONEY': 29, 'ORG': 124, 'PERCENT...","{'VERB': 779, 'PUNCT': 1202, 'SPACE': 241, 'NU...",27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,0.195131,0.489689
4,1.53,614,2226,1.0,12657,1,0,0,28,1,140,2227,59.06,12.2,12.84,13.1,15.2,73,19,1,149458,12657,8.468600,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.7,21.9,"{'PERSON': 3, 'WORK_OF_ART': 3, 'MONEY': 16, '...","{'NOUN': 472, 'VERB': 276, 'PUNCT': 286, 'SPAC...",5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,0.252478,0.494659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,0.00,0,454,0.0,2742,1,0,0,17,1,25,447,53.24,12.4,13.74,13.8,16.0,68,19,1,25294,2742,10.840516,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.4,7.2,"{'ORG': 5, 'CARDINAL': 4, 'GPE': 1, 'LANGUAGE'...","{'NOUN': 112, 'VERB': 42, 'PUNCT': 55, 'PROPN'...",4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,0.194975,0.392125
14465,0.00,0,82,0.0,506,1,0,0,3,0,5,82,63.29,8.5,10.46,12.0,11.2,49,371,1,48317,506,1.047250,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,11.7,"{'DATE': 1, 'GPE': 1}","{'NOUN': 28, 'VERB': 10, 'PROPN': 3, 'PUNCT': ...",0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.303939,0.737576
14466,0.00,0,565,0.0,3399,1,0,0,17,1,24,557,33.82,17.8,18.43,17.4,21.3,70,19,1,26073,3399,13.036475,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.3,7.3,"{'ORDINAL': 1, 'DATE': 2, 'PERCENT': 4, 'ORG':...","{'NOUN': 126, 'VERB': 76, 'ADJ': 38, 'PUNCT': ...",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.189242,0.476787
14467,0.00,0,1024,0.0,6079,1,0,0,17,1,86,1016,60.45,9.6,10.91,11.8,11.8,77,19,1,29614,6079,20.527453,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.5,7.1,"{'ORG': 9, 'CARDINAL': 5, 'DATE': 10, 'PERCENT...","{'VERB': 118, 'NOUN': 287, 'PUNCT': 132, 'ADJ'...",5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,0.113672,0.452116


In [62]:
# Cleaning The New Merged DataFrame
cleaned_df.dropna(axis= 0, inplace = True)
cleaned_df.drop(axis=1, columns = ['Entities', 'Punctuation'], inplace=True)

In [63]:
cleaned_df

Unnamed: 0,Evergreen_Score,Total_Shares,Word_Count,num_linking_domains,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']",First_Contentful_Paint,First_Interactive,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X,CARDINAL.1,DATE.1,EVENT.1,FAC.1,GPE.1,LANGUAGE.1,LAW.1,LOC.1,MONEY.1,NORP.1,ORDINAL.1,ORG.1,PERCENT.1,PERSON.1,PRODUCT.1,QUANTITY.1,TIME.1,WORK_OF_ART.1,ADJ.1,ADP.1,ADV.1,AUX.1,CCONJ.1,DET.1,INTJ.1,NOUN.1,NUM.1,PART.1,PRON.1,PROPN.1,PUNCT.1,SPACE.1,SYM.1,VERB.1,X.1,sentiment_polarity_score,subjectivity_score
0,1.54,8021,4767,1.0,27301,1,0,0,42,1,261,4810,46.98,16.8,17.43,15.1,21.2,79,151,1,186434,27301,14.643788,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.5,16.1,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,0.281762,0.559670
1,1.44,2569,1181,2.0,6519,1,0,0,24,1,65,1154,53.99,14.2,15.29,13.2,17.6,69,150,1,96481,6519,6.756771,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.2,9.2,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,0.269613,0.476775
2,5.68,844,996,12.0,5916,1,0,0,12,1,65,1001,46.03,15.1,16.04,14.5,18.6,60,122,1,85973,5916,6.881230,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.2,13.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,0.175732,0.485097
3,1.30,775,8996,3.0,39589,1,0,0,113,1,353,6529,50.70,13.3,13.01,13.5,17.0,63,114,1,421899,39589,9.383525,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.5,18.3,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,0.195131,0.489689
4,1.53,614,2226,1.0,12657,1,0,0,28,1,140,2227,59.06,12.2,12.84,13.1,15.2,73,19,1,149458,12657,8.468600,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.7,21.9,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,0.252478,0.494659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,0.00,0,454,0.0,2742,1,0,0,17,1,25,447,53.24,12.4,13.74,13.8,16.0,68,19,1,25294,2742,10.840516,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.4,7.2,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,0.194975,0.392125
14465,0.00,0,82,0.0,506,1,0,0,3,0,5,82,63.29,8.5,10.46,12.0,11.2,49,371,1,48317,506,1.047250,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,11.7,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.303939,0.737576
14466,0.00,0,565,0.0,3399,1,0,0,17,1,24,557,33.82,17.8,18.43,17.4,21.3,70,19,1,26073,3399,13.036475,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.3,7.3,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.189242,0.476787
14467,0.00,0,1024,0.0,6079,1,0,0,17,1,86,1016,60.45,9.6,10.91,11.8,11.8,77,19,1,29614,6079,20.527453,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.5,7.1,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,0.113672,0.452116


In [64]:
pickle.dump(cleaned_df ,open('final_df.pkl', 'wb'))

In [65]:
cleaned_df

Unnamed: 0,Evergreen_Score,Total_Shares,Word_Count,num_linking_domains,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']",First_Contentful_Paint,First_Interactive,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X,CARDINAL.1,DATE.1,EVENT.1,FAC.1,GPE.1,LANGUAGE.1,LAW.1,LOC.1,MONEY.1,NORP.1,ORDINAL.1,ORG.1,PERCENT.1,PERSON.1,PRODUCT.1,QUANTITY.1,TIME.1,WORK_OF_ART.1,ADJ.1,ADP.1,ADV.1,AUX.1,CCONJ.1,DET.1,INTJ.1,NOUN.1,NUM.1,PART.1,PRON.1,PROPN.1,PUNCT.1,SPACE.1,SYM.1,VERB.1,X.1,sentiment_polarity_score,subjectivity_score
0,1.54,8021,4767,1.0,27301,1,0,0,42,1,261,4810,46.98,16.8,17.43,15.1,21.2,79,151,1,186434,27301,14.643788,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.5,16.1,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,0.281762,0.559670
1,1.44,2569,1181,2.0,6519,1,0,0,24,1,65,1154,53.99,14.2,15.29,13.2,17.6,69,150,1,96481,6519,6.756771,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.2,9.2,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,0.269613,0.476775
2,5.68,844,996,12.0,5916,1,0,0,12,1,65,1001,46.03,15.1,16.04,14.5,18.6,60,122,1,85973,5916,6.881230,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.2,13.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,0.175732,0.485097
3,1.30,775,8996,3.0,39589,1,0,0,113,1,353,6529,50.70,13.3,13.01,13.5,17.0,63,114,1,421899,39589,9.383525,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.5,18.3,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,0.195131,0.489689
4,1.53,614,2226,1.0,12657,1,0,0,28,1,140,2227,59.06,12.2,12.84,13.1,15.2,73,19,1,149458,12657,8.468600,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.7,21.9,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,0.252478,0.494659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,0.00,0,454,0.0,2742,1,0,0,17,1,25,447,53.24,12.4,13.74,13.8,16.0,68,19,1,25294,2742,10.840516,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.4,7.2,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,0.194975,0.392125
14465,0.00,0,82,0.0,506,1,0,0,3,0,5,82,63.29,8.5,10.46,12.0,11.2,49,371,1,48317,506,1.047250,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,11.7,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.303939,0.737576
14466,0.00,0,565,0.0,3399,1,0,0,17,1,24,557,33.82,17.8,18.43,17.4,21.3,70,19,1,26073,3399,13.036475,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.3,7.3,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.189242,0.476787
14467,0.00,0,1024,0.0,6079,1,0,0,17,1,86,1016,60.45,9.6,10.91,11.8,11.8,77,19,1,29614,6079,20.527453,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.5,7.1,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,0.113672,0.452116


## # Modelling 

In [95]:
cleaned_df = pickle.load(open('../Ignore/final_df.pkl', 'rb'))

In [77]:
# cleaned_df['Total_Shares'] = cleaned_df['Total_Shares'].apply(lambda x: np.log1p(x))

In [96]:
X = cleaned_df

In [97]:
y = cleaned_df.pop('Total_Shares')

In [98]:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size = 0.2, random_state= 3)

In [99]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


In [100]:
RFR = RandomForestRegressor(n_estimators = 300)

In [101]:
RFR.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [102]:
RFR.score(X_train, y_train)

0.8748533235948093

In [103]:
RFR.score(X_test, y_test)

0.33388301343875015

In [None]:
# Bad Scores. Too Many Features!!! Let's Apply PCA - Principaal Component Analysis : ) 

In [104]:
from sklearn.decomposition import PCA

In [108]:
# instantiate PCA with n_components
pca = PCA(n_components=25)

In [109]:
# project on the principal components and insert results into a dataframe
cols = ['PC_{}'.format(i) for i in range(1, pca.n_components+1)]
X_pca = pd.DataFrame(pca.fit_transform(X), columns=cols)
X_pca.head()

Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,PC_11,PC_12,PC_13,PC_14,PC_15,PC_16,PC_17,PC_18,PC_19,PC_20,PC_21,PC_22,PC_23,PC_24,PC_25
0,73444.414564,33486.756228,420.589183,-120.62979,-239.859511,-403.002634,-213.136989,-85.709559,178.382246,57.359087,146.69804,38.400175,117.726772,-64.85507,-2.591715,6.797683,0.866789,4.157726,-11.01934,-12.478155,-41.301597,-15.888602,14.332977,-0.134874,1.025856
1,-16588.538907,3866.78995,105.60448,-26.551845,-52.540055,-92.226396,-54.432674,-42.680749,46.334042,38.431676,4.235707,-2.246526,11.822161,-24.747586,-34.049869,11.46615,-0.997613,-0.926819,-6.55095,13.976771,9.689635,3.087363,-0.401275,-8.298081,-7.101586
2,-27098.888838,3013.979969,13.161667,-20.394422,-47.901729,-77.225615,-45.426666,-20.466734,25.05349,3.362172,-36.369049,-13.365386,32.223235,1.179616,-10.14525,-0.762076,3.386341,11.204156,0.462617,2.60528,-12.066543,6.025377,11.546058,-1.177945,3.631001
3,308957.00315,50714.071187,2519.578374,-339.819873,-934.986992,-156.55096,496.013164,-81.362258,-16.749171,273.42978,-380.945585,-5.267215,39.440836,57.667224,-71.649189,81.216295,65.371023,-52.878036,-28.319195,-40.381698,10.41966,7.651971,-24.339574,-11.773064,-64.321144
4,36411.957608,12542.314144,139.85655,-46.583405,-228.183244,-169.483771,-82.644406,-67.842578,80.107779,50.617161,3.248679,9.844598,21.843976,10.431158,-36.203608,13.281353,15.827679,20.869091,3.908937,-0.009262,-5.982935,-4.943645,3.984123,-1.450159,4.543561


In [110]:
# Get the most important pieces of information
print("Number of principal components:\t\t", pca.n_components_)
print("Explained variance:\t\t\t", np.round(pca.explained_variance_, 3))
print("Explained variance ratio:\t\t", np.round(
    pca.explained_variance_ratio_, 3))
print("Cumulative explained variance ratio:\t", np.round(
    np.cumsum(pca.explained_variance_ratio_), 3))

Number of principal components:		 25
Explained variance:			 [2.53503399e+10 4.44315686e+07 2.73837256e+05 1.71589490e+05
 5.50967740e+04 4.05583240e+04 1.67179160e+04 5.70872900e+03
 4.51462000e+03 2.24975200e+03 1.99783900e+03 1.18960700e+03
 7.20188000e+02 5.17464000e+02 3.63759000e+02 2.82283000e+02
 2.37939000e+02 1.33338000e+02 8.81980000e+01 7.17390000e+01
 5.64580000e+01 5.06800000e+01 4.89480000e+01 4.28750000e+01
 3.73570000e+01]
Explained variance ratio:		 [0.998 0.002 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
 0.   ]
Cumulative explained variance ratio:	 [0.998 1.    1.    1.    1.    1.    1.    1.    1.    1.    1.    1.
 1.    1.    1.    1.    1.    1.    1.    1.    1.    1.    1.    1.
 1.   ]


In [None]:
### Let's 