# Module Imports

In [1]:
#Import Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Importing Dependencies
import pandas as pd
import numpy as np
from numpy import nan

import csv
import nltk
import pickle

In [2]:
#Pre-Processing + Custom Class Integration
from sklearn.preprocessing import StandardScaler
from sklearn.base import TransformerMixin, BaseEstimator


#Model Imports
from sklearn.linear_model import LinearRegression, ElasticNetCV, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor

#Model Boosting
from sklearn.ensemble import BaggingRegressor

#Model Metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error


#GridSearch
from sklearn.model_selection import GridSearchCV

#Scipy Integration for Sparse Matrixes
from scipy import sparse

#Additional Feature Engineering - NLP Text Data Import
from sklearn.feature_extraction.text import TfidfVectorizer

#PipeLine Imports
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline


#Multi-threaded Processing
from sklearn.externals import joblib


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

In [22]:
pd.set_option('display.max_columns', 500)

In [4]:
cleaned_df = pd.read_pickle('../Ignore/merged_web_page_data.pkl')
cleaned_df.dropna(axis= 0, inplace = True)
cleaned_df.drop(axis=1, columns = ['Url', 'Published_Date', 'Setences_Text', 'Meta_Description', 'Title_Text' ], inplace=True)

In [5]:
X = cleaned_df
y = cleaned_df.pop('Total_Shares')

# Articles:
- https://medium.com/@dobko_m/nlp-text-data-cleaning-and-preprocessing-ea3ffe0406c1

# Tokenizing Text

In [6]:
#1. Importing Spacy Packages
import spacy
from spacy.lang.en import English
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA

In [226]:
#2. Importing Textblob Packages
from textblob import TextBlob

In [227]:
nlp = spacy.load("en_core_web_lg")
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [206]:
text_1 = 'Paris is a nice city'
text_2 = 'London is a large city'

In [207]:
text =  " Big thanks for posting this tune! It's nice to bring out some new music and hope all of you here can enjoy the new album, thank you :) "


In [219]:
nlp(text_1).similarity(nlp(text_2))

0.8950880863365216

In [220]:
nlp(text_1).similarity(nlp(text))

0.7374955765189001

In [None]:
LDA - Latent Dirichlet Allocation

In [17]:
# np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
# np_array
# np_array.shape

In [228]:
def tokenize_and_parse(cell):
    my_doc = nlp(cell)
    text_blob = TextBlob(cell)
    token_list = []
    token_lemma = []
    pos_tagging =[]
    entity_list = {}
    
    for word in my_doc:
        if word.is_stop==False:
            token_list.append(word.text)
            token_lemma.append(word.lemma_)
            pos_tagging.append(word.pos_)
            
        else:
            pass
        
    # Entity Recognition
    for ent in my_doc.ents:
        entity_list[ent.text] = [ent.label_, ent.start_char, ent.end_char]
        
    sentiment_polarity = text_blob.sentiment[0]
    sentiment_subjectivity = text_blob.sentiment[1]
        
    return (token_list, token_lemma, pos_tagging, entity_list, sentiment_polarity, sentiment_subjectivity)

In [229]:
cleaned_df

Unnamed: 0,Evergreen_Score,Word_Count,num_linking_domains,Article_Text,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']",First_Contentful_Paint,First_Interactive
0,1.54,4767,1.0,"Some of the links below are affiliate links, s...",27301,1,0,0,42,1,261,4810,46.98,16.8,17.43,15.1,21.2,79,151,1,186434,27301,14.643788,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.5,16.1
1,1.44,1181,2.0,Would you like to make money while you sleep?\...,6519,1,0,0,24,1,65,1154,53.99,14.2,15.29,13.2,17.6,69,150,1,96481,6519,6.756771,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.2,9.2
2,5.68,996,12.0,Learn three simple strategies to help you stac...,5916,1,0,0,12,1,65,1001,46.03,15.1,16.04,14.5,18.6,60,122,1,85973,5916,6.881230,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.2,13.0
3,1.30,8996,3.0,I never thought this would happen to me.\n\nIn...,39589,1,0,0,113,1,353,6529,50.70,13.3,13.01,13.5,17.0,63,114,1,421899,39589,9.383525,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.5,18.3
4,1.53,2226,1.0,This post may contain affiliate links. Please ...,12657,1,0,0,28,1,140,2227,59.06,12.2,12.84,13.1,15.2,73,19,1,149458,12657,8.468600,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.7,21.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,0.00,454,0.0,Relationships matter more than ever in busines...,2742,1,0,0,17,1,25,447,53.24,12.4,13.74,13.8,16.0,68,19,1,25294,2742,10.840516,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.4,7.2
14465,0.00,82,0.0,This week we are featuring the school website ...,506,1,0,0,3,0,5,82,63.29,8.5,10.46,12.0,11.2,49,371,1,48317,506,1.047250,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,11.7
14466,0.00,565,0.0,Content curation involves searching and gather...,3399,1,0,0,17,1,24,557,33.82,17.8,18.43,17.4,21.3,70,19,1,26073,3399,13.036475,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.3,7.3
14467,0.00,1024,0.0,"When it comes to communication, email is still...",6079,1,0,0,17,1,86,1016,60.45,9.6,10.91,11.8,11.8,77,19,1,29614,6079,20.527453,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.5,7.1


In [230]:
awesome = cleaned_df['Article_Text'].apply(tokenize_and_parse)

In [232]:
awesome.shape

(14469,)

In [54]:
len(awesome[0])

6

In [237]:
def descriptive_stats(dataframe, i):
    print('The first row of your dataframe contains the following:')
    print('Text',len(dataframe[i][0]),'Entries')
    print('Text Lemma',len(dataframe[i][1]),'Entries')
    print('POS Tagging',len(dataframe[i][2]),'Entries')
    print(len(dataframe[0][3]),'Matched Entities')
    print('The Sentiment Polarity Score Is:',round(dataframe[i][4], 3))
    print('The Sentiment Subjectivity Score Is:',round(dataframe[i][5], 3))

In [241]:
descriptive_stats(awesome, 20)

The first row of your dataframe contains the following:
Text 21 Entries
Text Lemma 21 Entries
POS Tagging 21 Entries
90 Matched Entities
The Sentiment Polarity Score Is: 0.167
The Sentiment Subjectivity Score Is: 0.833


## Processing The Additional NLP Data

In [13]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import collections

In [15]:
dicts =  []

for i in range(nlp_data.shape[0]):
    temp_list = []
    
    for key, value in nlp_data[i][3].items():
        temp_list.append(value[0])
    dicts.append((dict(collections.Counter(temp_list))))


In [20]:
cleaned_df['Entities'].shape

(14469,)

In [33]:
v = DictVectorizer(sparse=True)
X = v.fit_transform(cleaned_df['Entities'])
entities = pd.DataFrame(X.toarray(), columns = v.get_feature_names())

In [34]:
entities

Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0
1,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0
2,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0
3,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0
4,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0
14465,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14466,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0
14467,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0


In [24]:
### The Sentiment Polarity Score
sentiment_polarity_score = nlp_data.map(lambda x: x[4])

### The Sentiment Subjectivity Score
subjectivity_score = nlp_data.map(lambda x: x[5])

In [11]:
# Counter For Different Types of Punctuation 
cleaned_df['Punctuation'] = nlp_data.map(lambda x: dict(collections.Counter(x[2])))

0        ([links, affiliate, links, ,, receive, commiss...
1        ([like, money, sleep, ?, \n\n, answer, yes, ,,...
2        ([Learn, simple, strategies, help, stack, reve...
3        ([thought, happen, ., \n\n, 2, years, went, ma...
4        ([post, contain, affiliate, links, ., read, di...
                               ...                        
14464    ([Relationships, matter, business, ,, believe,...
14465    ([week, featuring, school, website, design, Ea...
14466    ([Content, curation, involves, searching, gath...
14467    ([comes, communication, ,, email, prevalent, c...
14468    ([client, story, ., job, tell, ., \n\n, websit...
Name: Article_Text, Length: 14469, dtype: object

In [52]:
punct_vec = DictVectorizer(sparse=True)
X = punct_vec.fit_transform(cleaned_df['Punctuation'])
punctuation = pd.DataFrame(X.toarray(), columns = punct_vec.get_feature_names())

In [53]:
# Concatenate All Of The DataFrames Together
cleaned_df = pd.concat([cleaned_df, entities, punctuation,], axis = 1)

In [54]:
cleaned_df['sentiment_polarity_score'] = nlp_data.map(lambda x: x[4])
cleaned_df['subjectivity_score'] = nlp_data.map(lambda x: x[5])

------------------------------------------------------------------------------------------------------------------------------------

In [57]:
# Dropping Irrelevant Rows + Columns For The DataFrame
cleaned_df.dropna(axis= 0, inplace = True)

In [61]:
cleaned_df.select_dtypes('object')

Unnamed: 0,Entities,Punctuation
0,"{'ORG': 22, 'ORDINAL': 2, 'DATE': 26, 'MONEY':...","{'NOUN': 1006, 'PUNCT': 508, 'VERB': 603, 'SPA..."
1,"{'DATE': 7, 'CARDINAL': 9, 'MONEY': 3, 'ORG': ...","{'VERB': 156, 'NOUN': 250, 'PUNCT': 145, 'SPAC..."
2,"{'CARDINAL': 10, 'DATE': 10, 'PERCENT': 3, 'PE...","{'VERB': 121, 'ADJ': 50, 'NOUN': 222, 'PUNCT':..."
3,"{'DATE': 22, 'MONEY': 29, 'ORG': 124, 'PERCENT...","{'VERB': 779, 'PUNCT': 1202, 'SPACE': 241, 'NU..."
4,"{'PERSON': 3, 'WORK_OF_ART': 3, 'MONEY': 16, '...","{'NOUN': 472, 'VERB': 276, 'PUNCT': 286, 'SPAC..."
...,...,...
14464,"{'ORG': 5, 'CARDINAL': 4, 'GPE': 1, 'LANGUAGE'...","{'NOUN': 112, 'VERB': 42, 'PUNCT': 55, 'PROPN'..."
14465,"{'DATE': 1, 'GPE': 1}","{'NOUN': 28, 'VERB': 10, 'PROPN': 3, 'PUNCT': ..."
14466,"{'ORDINAL': 1, 'DATE': 2, 'PERCENT': 4, 'ORG':...","{'NOUN': 126, 'VERB': 76, 'ADJ': 38, 'PUNCT': ..."
14467,"{'ORG': 9, 'CARDINAL': 5, 'DATE': 10, 'PERCENT...","{'VERB': 118, 'NOUN': 287, 'PUNCT': 132, 'ADJ'..."


In [60]:
cleaned_df

Unnamed: 0,Evergreen_Score,Total_Shares,Word_Count,num_linking_domains,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']",First_Contentful_Paint,First_Interactive,Entities,Punctuation,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X,CARDINAL.1,DATE.1,EVENT.1,FAC.1,GPE.1,LANGUAGE.1,LAW.1,LOC.1,MONEY.1,NORP.1,ORDINAL.1,ORG.1,PERCENT.1,PERSON.1,PRODUCT.1,QUANTITY.1,TIME.1,WORK_OF_ART.1,ADJ.1,ADP.1,ADV.1,AUX.1,CCONJ.1,DET.1,INTJ.1,NOUN.1,NUM.1,PART.1,PRON.1,PROPN.1,PUNCT.1,SPACE.1,SYM.1,VERB.1,X.1,sentiment_polarity_score,subjectivity_score
0,1.54,8021,4767,1.0,27301,1,0,0,42,1,261,4810,46.98,16.8,17.43,15.1,21.2,79,151,1,186434,27301,14.643788,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.5,16.1,"{'ORG': 22, 'ORDINAL': 2, 'DATE': 26, 'MONEY':...","{'NOUN': 1006, 'PUNCT': 508, 'VERB': 603, 'SPA...",9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,0.281762,0.559670
1,1.44,2569,1181,2.0,6519,1,0,0,24,1,65,1154,53.99,14.2,15.29,13.2,17.6,69,150,1,96481,6519,6.756771,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.2,9.2,"{'DATE': 7, 'CARDINAL': 9, 'MONEY': 3, 'ORG': ...","{'VERB': 156, 'NOUN': 250, 'PUNCT': 145, 'SPAC...",9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,0.269613,0.476775
2,5.68,844,996,12.0,5916,1,0,0,12,1,65,1001,46.03,15.1,16.04,14.5,18.6,60,122,1,85973,5916,6.881230,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.2,13.0,"{'CARDINAL': 10, 'DATE': 10, 'PERCENT': 3, 'PE...","{'VERB': 121, 'ADJ': 50, 'NOUN': 222, 'PUNCT':...",10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,0.175732,0.485097
3,1.30,775,8996,3.0,39589,1,0,0,113,1,353,6529,50.70,13.3,13.01,13.5,17.0,63,114,1,421899,39589,9.383525,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.5,18.3,"{'DATE': 22, 'MONEY': 29, 'ORG': 124, 'PERCENT...","{'VERB': 779, 'PUNCT': 1202, 'SPACE': 241, 'NU...",27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,0.195131,0.489689
4,1.53,614,2226,1.0,12657,1,0,0,28,1,140,2227,59.06,12.2,12.84,13.1,15.2,73,19,1,149458,12657,8.468600,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.7,21.9,"{'PERSON': 3, 'WORK_OF_ART': 3, 'MONEY': 16, '...","{'NOUN': 472, 'VERB': 276, 'PUNCT': 286, 'SPAC...",5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,0.252478,0.494659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,0.00,0,454,0.0,2742,1,0,0,17,1,25,447,53.24,12.4,13.74,13.8,16.0,68,19,1,25294,2742,10.840516,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.4,7.2,"{'ORG': 5, 'CARDINAL': 4, 'GPE': 1, 'LANGUAGE'...","{'NOUN': 112, 'VERB': 42, 'PUNCT': 55, 'PROPN'...",4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,0.194975,0.392125
14465,0.00,0,82,0.0,506,1,0,0,3,0,5,82,63.29,8.5,10.46,12.0,11.2,49,371,1,48317,506,1.047250,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,11.7,"{'DATE': 1, 'GPE': 1}","{'NOUN': 28, 'VERB': 10, 'PROPN': 3, 'PUNCT': ...",0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.303939,0.737576
14466,0.00,0,565,0.0,3399,1,0,0,17,1,24,557,33.82,17.8,18.43,17.4,21.3,70,19,1,26073,3399,13.036475,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.3,7.3,"{'ORDINAL': 1, 'DATE': 2, 'PERCENT': 4, 'ORG':...","{'NOUN': 126, 'VERB': 76, 'ADJ': 38, 'PUNCT': ...",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.189242,0.476787
14467,0.00,0,1024,0.0,6079,1,0,0,17,1,86,1016,60.45,9.6,10.91,11.8,11.8,77,19,1,29614,6079,20.527453,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.5,7.1,"{'ORG': 9, 'CARDINAL': 5, 'DATE': 10, 'PERCENT...","{'VERB': 118, 'NOUN': 287, 'PUNCT': 132, 'ADJ'...",5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,0.113672,0.452116


In [62]:
# Cleaning The New Merged DataFrame
cleaned_df.dropna(axis= 0, inplace = True)
cleaned_df.drop(axis=1, columns = ['Entities', 'Punctuation'], inplace=True)

In [63]:
cleaned_df

Unnamed: 0,Evergreen_Score,Total_Shares,Word_Count,num_linking_domains,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']",First_Contentful_Paint,First_Interactive,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X,CARDINAL.1,DATE.1,EVENT.1,FAC.1,GPE.1,LANGUAGE.1,LAW.1,LOC.1,MONEY.1,NORP.1,ORDINAL.1,ORG.1,PERCENT.1,PERSON.1,PRODUCT.1,QUANTITY.1,TIME.1,WORK_OF_ART.1,ADJ.1,ADP.1,ADV.1,AUX.1,CCONJ.1,DET.1,INTJ.1,NOUN.1,NUM.1,PART.1,PRON.1,PROPN.1,PUNCT.1,SPACE.1,SYM.1,VERB.1,X.1,sentiment_polarity_score,subjectivity_score
0,1.54,8021,4767,1.0,27301,1,0,0,42,1,261,4810,46.98,16.8,17.43,15.1,21.2,79,151,1,186434,27301,14.643788,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.5,16.1,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,0.281762,0.559670
1,1.44,2569,1181,2.0,6519,1,0,0,24,1,65,1154,53.99,14.2,15.29,13.2,17.6,69,150,1,96481,6519,6.756771,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.2,9.2,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,0.269613,0.476775
2,5.68,844,996,12.0,5916,1,0,0,12,1,65,1001,46.03,15.1,16.04,14.5,18.6,60,122,1,85973,5916,6.881230,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.2,13.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,0.175732,0.485097
3,1.30,775,8996,3.0,39589,1,0,0,113,1,353,6529,50.70,13.3,13.01,13.5,17.0,63,114,1,421899,39589,9.383525,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.5,18.3,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,0.195131,0.489689
4,1.53,614,2226,1.0,12657,1,0,0,28,1,140,2227,59.06,12.2,12.84,13.1,15.2,73,19,1,149458,12657,8.468600,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.7,21.9,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,0.252478,0.494659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,0.00,0,454,0.0,2742,1,0,0,17,1,25,447,53.24,12.4,13.74,13.8,16.0,68,19,1,25294,2742,10.840516,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.4,7.2,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,0.194975,0.392125
14465,0.00,0,82,0.0,506,1,0,0,3,0,5,82,63.29,8.5,10.46,12.0,11.2,49,371,1,48317,506,1.047250,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,11.7,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.303939,0.737576
14466,0.00,0,565,0.0,3399,1,0,0,17,1,24,557,33.82,17.8,18.43,17.4,21.3,70,19,1,26073,3399,13.036475,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.3,7.3,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.189242,0.476787
14467,0.00,0,1024,0.0,6079,1,0,0,17,1,86,1016,60.45,9.6,10.91,11.8,11.8,77,19,1,29614,6079,20.527453,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.5,7.1,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,0.113672,0.452116


In [64]:
pickle.dump(cleaned_df ,open('final_df.pkl', 'wb'))

In [65]:
cleaned_df

Unnamed: 0,Evergreen_Score,Total_Shares,Word_Count,num_linking_domains,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']",First_Contentful_Paint,First_Interactive,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X,CARDINAL.1,DATE.1,EVENT.1,FAC.1,GPE.1,LANGUAGE.1,LAW.1,LOC.1,MONEY.1,NORP.1,ORDINAL.1,ORG.1,PERCENT.1,PERSON.1,PRODUCT.1,QUANTITY.1,TIME.1,WORK_OF_ART.1,ADJ.1,ADP.1,ADV.1,AUX.1,CCONJ.1,DET.1,INTJ.1,NOUN.1,NUM.1,PART.1,PRON.1,PROPN.1,PUNCT.1,SPACE.1,SYM.1,VERB.1,X.1,sentiment_polarity_score,subjectivity_score
0,1.54,8021,4767,1.0,27301,1,0,0,42,1,261,4810,46.98,16.8,17.43,15.1,21.2,79,151,1,186434,27301,14.643788,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.5,16.1,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,0.281762,0.559670
1,1.44,2569,1181,2.0,6519,1,0,0,24,1,65,1154,53.99,14.2,15.29,13.2,17.6,69,150,1,96481,6519,6.756771,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.2,9.2,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,0.269613,0.476775
2,5.68,844,996,12.0,5916,1,0,0,12,1,65,1001,46.03,15.1,16.04,14.5,18.6,60,122,1,85973,5916,6.881230,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.2,13.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,0.175732,0.485097
3,1.30,775,8996,3.0,39589,1,0,0,113,1,353,6529,50.70,13.3,13.01,13.5,17.0,63,114,1,421899,39589,9.383525,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.5,18.3,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,0.195131,0.489689
4,1.53,614,2226,1.0,12657,1,0,0,28,1,140,2227,59.06,12.2,12.84,13.1,15.2,73,19,1,149458,12657,8.468600,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.7,21.9,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,0.252478,0.494659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,0.00,0,454,0.0,2742,1,0,0,17,1,25,447,53.24,12.4,13.74,13.8,16.0,68,19,1,25294,2742,10.840516,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.4,7.2,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,0.194975,0.392125
14465,0.00,0,82,0.0,506,1,0,0,3,0,5,82,63.29,8.5,10.46,12.0,11.2,49,371,1,48317,506,1.047250,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,11.7,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.303939,0.737576
14466,0.00,0,565,0.0,3399,1,0,0,17,1,24,557,33.82,17.8,18.43,17.4,21.3,70,19,1,26073,3399,13.036475,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.3,7.3,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.189242,0.476787
14467,0.00,0,1024,0.0,6079,1,0,0,17,1,86,1016,60.45,9.6,10.91,11.8,11.8,77,19,1,29614,6079,20.527453,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.5,7.1,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,0.113672,0.452116


## # Modelling 

In [23]:
cleaned_df = pickle.load(open('../Ignore/final_df.pkl', 'rb'))

In [24]:
cleaned_df

Unnamed: 0,Evergreen_Score,Total_Shares,Word_Count,num_linking_domains,Article_Text_Length,Has_Top_Image,Number_of_Movies,Article_Is_Media_News,Number_Of_Images,Is_Valid_Body,Number_Of_Sentences,Lexicon_Count,Flesch_Reading_Ease_formula,Flesch_Kincaid_Grade_Level,FOG_Scale,SMOG_Index,ARI_Index,Title_Tag_Length,Meta_Description_Length,SSL,Page_Size_In_Bytes,Plain_Text_Size,Plain_Text_Rate,Has_Referring_Domains,Has_Article_Amplifiers,Has_Author_Name,Topic_Content Marketing,Topic_Copywriting,Topic_Display Advertising,Topic_Email Marketing,Topic_Growth Marketing,Topic_Influencer Marketing,Topic_Link Building,Topic_Marketing Automation,Topic_Podcast Marketing,Topic_Search Engine Marketing,Topic_Social Media Marketing,Topic_Video Marketing,Topic_Website Design,Encoding_ISO-8859-1,Encoding_UTF-8,Encoding_iso-8859-1,Encoding_utf-8,Encoding_windows-1252,Number_Of_Article_Amplifiers,"article_types_['how_to_article', 'general_article']","article_types_['how_to_article', 'infographic', 'general_article']","article_types_['how_to_article', 'list', 'general_article']","article_types_['how_to_article', 'newsletter', 'general_article']","article_types_['how_to_article', 'what_post', 'general_article']","article_types_['how_to_article', 'why_post', 'general_article']","article_types_['infographic', 'general_article']","article_types_['list', 'general_article']","article_types_['list', 'infographic', 'general_article']","article_types_['list', 'newsletter', 'general_article']","article_types_['newsletter', 'general_article']","article_types_['what_post', 'general_article']","article_types_['what_post', 'infographic', 'general_article']","article_types_['what_post', 'newsletter', 'general_article']","article_types_['why_post', 'general_article']","article_types_['why_post', 'newsletter', 'general_article']",First_Contentful_Paint,First_Interactive,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,PRON,PROPN,PUNCT,SPACE,SYM,VERB,X,CARDINAL.1,DATE.1,EVENT.1,FAC.1,GPE.1,LANGUAGE.1,LAW.1,LOC.1,MONEY.1,NORP.1,ORDINAL.1,ORG.1,PERCENT.1,PERSON.1,PRODUCT.1,QUANTITY.1,TIME.1,WORK_OF_ART.1,ADJ.1,ADP.1,ADV.1,AUX.1,CCONJ.1,DET.1,INTJ.1,NOUN.1,NUM.1,PART.1,PRON.1,PROPN.1,PUNCT.1,SPACE.1,SYM.1,VERB.1,X.1,sentiment_polarity_score,subjectivity_score
0,1.54,8021,4767,1.0,27301,1,0,0,42,1,261,4810,46.98,16.8,17.43,15.1,21.2,79,151,1,186434,27301,14.643788,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.5,16.1,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,9.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,2.0,22.0,2.0,5.0,1.0,0.0,3.0,3.0,254.0,8.0,63.0,4.0,6.0,0.0,3.0,1006.0,70.0,6.0,0.0,172.0,508.0,227.0,27.0,603.0,0.0,0.281762,0.559670
1,1.44,2569,1181,2.0,6519,1,0,0,24,1,65,1154,53.99,14.2,15.29,13.2,17.6,69,150,1,96481,6519,6.756771,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.2,9.2,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,9.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,10.0,0.0,1.0,0.0,0.0,2.0,0.0,36.0,1.0,35.0,3.0,0.0,0.0,1.0,250.0,15.0,0.0,0.0,42.0,145.0,46.0,4.0,156.0,1.0,0.269613,0.476775
2,5.68,844,996,12.0,5916,1,0,0,12,1,65,1001,46.03,15.1,16.04,14.5,18.6,60,122,1,85973,5916,6.881230,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.2,13.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,3.0,2.0,3.0,2.0,2.0,1.0,0.0,0.0,50.0,2.0,12.0,0.0,0.0,0.0,2.0,222.0,26.0,0.0,0.0,24.0,149.0,46.0,5.0,121.0,1.0,0.175732,0.485097
3,1.30,775,8996,3.0,39589,1,0,0,113,1,353,6529,50.70,13.3,13.01,13.5,17.0,63,114,1,421899,39589,9.383525,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.5,18.3,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,27.0,22.0,3.0,1.0,4.0,1.0,2.0,1.0,29.0,0.0,2.0,124.0,23.0,17.0,11.0,1.0,2.0,8.0,342.0,17.0,120.0,4.0,16.0,0.0,8.0,1544.0,204.0,21.0,2.0,706.0,1202.0,241.0,118.0,779.0,40.0,0.195131,0.489689
4,1.53,614,2226,1.0,12657,1,0,0,28,1,140,2227,59.06,12.2,12.84,13.1,15.2,73,19,1,149458,12657,8.468600,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.7,21.9,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,5.0,16.0,1.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,6.0,0.0,3.0,1.0,1.0,0.0,3.0,101.0,3.0,40.0,3.0,3.0,0.0,4.0,472.0,48.0,4.0,1.0,85.0,286.0,66.0,16.0,276.0,0.0,0.252478,0.494659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,0.00,0,454,0.0,2742,1,0,0,17,1,25,447,53.24,12.4,13.74,13.8,16.0,68,19,1,25294,2742,10.840516,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.4,7.2,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,4.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1.0,8.0,0.0,0.0,0.0,0.0,112.0,2.0,1.0,0.0,28.0,55.0,10.0,0.0,42.0,0.0,0.194975,0.392125
14465,0.00,0,82,0.0,506,1,0,0,3,0,5,82,63.29,8.5,10.46,12.0,11.2,49,371,1,48317,506,1.047250,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,11.7,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,3.0,9.0,0.0,0.0,10.0,0.0,0.303939,0.737576
14466,0.00,0,565,0.0,3399,1,0,0,17,1,24,557,33.82,17.8,18.43,17.4,21.3,70,19,1,26073,3399,13.036475,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.3,7.3,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,4.0,0.0,0.0,0.0,0.0,1.0,38.0,1.0,8.0,0.0,1.0,0.0,0.0,126.0,5.0,1.0,0.0,26.0,49.0,12.0,0.0,76.0,2.0,0.189242,0.476787
14467,0.00,0,1024,0.0,6079,1,0,0,17,1,86,1016,60.45,9.6,10.91,11.8,11.8,77,19,1,29614,6079,20.527453,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.5,7.1,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,5.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,17.0,0.0,0.0,1.0,0.0,0.0,60.0,0.0,17.0,0.0,0.0,0.0,0.0,287.0,37.0,4.0,0.0,29.0,132.0,47.0,1.0,118.0,1.0,0.113672,0.452116


In [77]:
# cleaned_df['Total_Shares'] = cleaned_df['Total_Shares'].apply(lambda x: np.log1p(x))

In [4]:
X = cleaned_df

In [6]:
scaler = StandardScaler()
scaled_X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [26]:
y = cleaned_df.pop('Total_Shares')

In [98]:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size = 0.2, random_state= 3)

In [99]:
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Bad Scores. Too Many Features!!! Let's Apply PCA - Principaal Component Analysis : ) 

In [7]:
from sklearn.decomposition import PCA

In [17]:
# instantiate PCA with n_components
pca = PCA(n_components=70)

In [18]:
# project on the principal components and insert results into a dataframe
cols = ['PC_{}'.format(i) for i in range(1, pca.n_components+1)]
X_pca = pd.DataFrame(pca.fit_transform(scaled_X), columns=cols)
X_pca.head()

Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,...,PC_61,PC_62,PC_63,PC_64,PC_65,PC_66,PC_67,PC_68,PC_69,PC_70
0,22.897322,-9.85602,2.206062,-3.041198,9.183829,4.579194,-3.098844,-4.140557,0.896826,2.211858,...,-2.489614,-3.806036,-0.74733,9.627082,0.088569,-2.039439,9.762247,-5.040529,2.43089,-3.001091
1,3.430385,-3.790349,-0.010185,-0.469734,2.431999,0.014316,-0.494589,-0.744057,-0.021049,1.918573,...,0.569737,-0.963253,-1.662566,1.493749,1.456585,-1.495876,2.101804,-1.441439,1.663702,-2.219556
2,4.162953,-3.104037,0.421728,-0.463257,6.133329,-1.418618,-0.194001,-0.599075,-0.875544,-0.658393,...,-0.344356,-0.941823,-1.337478,0.215083,0.058984,-0.377935,-0.599013,0.864261,0.165494,-0.485595
3,51.581996,-6.272972,15.855378,-18.035964,18.603284,16.985091,-7.199524,-13.224074,0.152729,-6.87373,...,-0.829928,-0.061753,5.211358,1.773822,-5.714347,4.635215,2.272061,2.988511,1.906473,-0.278297
4,10.638853,-3.733727,1.93665,-4.532739,5.950603,5.687915,-4.268603,-1.976799,-0.609807,-0.248527,...,-3.140253,-3.35584,-0.632473,1.722638,0.796377,-2.290369,-2.403994,-0.271146,0.51498,-2.642455


In [19]:
# Get the most important pieces of information
print("Number of principal components:\t\t", pca.n_components_)
print("Explained variance:\t\t\t", np.round(pca.explained_variance_, 3))
print("Explained variance ratio:\t\t", np.round(
    pca.explained_variance_ratio_, 3))
print("Cumulative explained variance ratio:\t", np.round(
    np.cumsum(pca.explained_variance_ratio_), 3))

Number of principal components:		 70
Explained variance:			 [28.265  9.376  4.982  3.808  3.298  3.164  2.329  2.189  2.149  1.964
  1.941  1.887  1.798  1.76   1.631  1.578  1.549  1.5    1.461  1.412
  1.389  1.327  1.299  1.26   1.24   1.214  1.172  1.167  1.146  1.135
  1.111  1.099  1.066  1.061  1.052  1.033  1.029  1.024  1.02   1.01
  1.004  1.003  1.     0.998  0.996  0.994  0.993  0.988  0.981  0.98
  0.973  0.967  0.949  0.925  0.918  0.879  0.853  0.83   0.812  0.809
  0.783  0.768  0.728  0.713  0.701  0.693  0.674  0.663  0.659  0.62 ]
Explained variance ratio:		 [0.213 0.07  0.037 0.029 0.025 0.024 0.018 0.016 0.016 0.015 0.015 0.014
 0.014 0.013 0.012 0.012 0.012 0.011 0.011 0.011 0.01  0.01  0.01  0.009
 0.009 0.009 0.009 0.009 0.009 0.009 0.008 0.008 0.008 0.008 0.008 0.008
 0.008 0.008 0.008 0.008 0.008 0.008 0.008 0.008 0.007 0.007 0.007 0.007
 0.007 0.007 0.007 0.007 0.007 0.007 0.007 0.007 0.006 0.006 0.006 0.006
 0.006 0.006 0.005 0.005 0.005 0.005 0.005 0.005 0.

In [20]:
### Let's Use PCA & To Perform Random Forests
## Try PCA on blocks of similar features.
## Kernel PCA - non linear transformations.
## CNN / RNN

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
X_pca , y, test_size = 0.2, random_state= 3)

In [33]:
RFR.fit(X_train, y_train)

Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5,PC_6,PC_7,PC_8,PC_9,PC_10,PC_11,PC_12,PC_13,PC_14,PC_15,PC_16,PC_17,PC_18,PC_19,PC_20,PC_21,PC_22,PC_23,PC_24,PC_25,PC_26,PC_27,PC_28,PC_29,PC_30,PC_31,PC_32,PC_33,PC_34,PC_35,PC_36,PC_37,PC_38,PC_39,PC_40,PC_41,PC_42,PC_43,PC_44,PC_45,PC_46,PC_47,PC_48,PC_49,PC_50,PC_51,PC_52,PC_53,PC_54,PC_55,PC_56,PC_57,PC_58,PC_59,PC_60,PC_61,PC_62,PC_63,PC_64,PC_65,PC_66,PC_67,PC_68,PC_69,PC_70
13859,-2.335220,0.685009,0.114445,-0.092741,-0.677838,0.301709,0.677657,-0.997388,2.188818,-1.331021,1.596747,1.930868,-1.696282,-1.513572,-0.277060,-0.464676,0.939656,-1.381589,0.060557,-0.031424,-0.668536,-0.567230,0.982894,-0.149458,0.427058,-0.117717,2.010359,0.033530,-0.701089,-0.125432,1.038550,-0.562997,0.592791,0.633118,-0.164796,0.407541,0.101462,0.051008,-0.054995,-0.292913,-0.110792,-0.068833,0.001465,0.214133,0.042000,-0.019331,-0.163267,0.118601,-0.211848,0.217600,-0.567011,-0.271568,-0.770188,-0.077164,0.081415,0.687994,0.081836,0.375831,0.115849,0.226664,0.230556,1.268187,0.055710,0.276974,0.193845,-0.679945,0.238233,0.334541,-0.133111,-0.197899
12594,-0.203226,-0.244530,-0.010400,-0.871768,-0.548294,0.258493,-0.834177,0.694731,0.419650,0.177717,-0.765338,-0.967930,-0.854762,-0.144047,-0.198620,0.963035,-0.897992,0.181588,0.146427,0.255248,-0.172034,-0.654905,0.149642,-1.474254,1.658035,0.328164,-0.524502,0.371014,0.464121,-0.112701,-0.942600,-0.589481,-0.387541,-0.432125,-0.762295,0.215378,0.282189,-0.060275,0.188163,0.086294,0.164592,-0.333893,0.125140,-0.543120,-0.096803,-0.143782,0.479362,0.426329,0.054364,-0.229740,-0.366200,-0.053686,0.513765,0.691612,-0.062690,0.414008,1.058250,-0.758377,-0.143862,-0.106905,0.430679,-0.440355,-0.102928,0.106971,0.311777,0.576840,-0.167220,-0.103774,-0.787951,0.319070
5721,-4.336454,0.827345,-0.222152,-0.521516,-0.087890,0.616449,-1.358976,0.989334,-0.304566,-0.298887,0.078059,-0.379065,-0.429979,1.200600,-0.031192,0.139840,0.175653,-0.506792,0.007791,-0.432648,0.994298,-1.024503,0.170837,0.015497,0.010768,-1.369039,-0.689412,-0.333281,0.142475,1.218135,0.787944,0.372432,-1.470889,-0.716988,0.395685,0.161553,0.009429,0.424295,-0.501177,-0.582726,-0.088883,0.193119,-0.121457,0.124008,-0.061892,-0.029877,-0.429684,0.370341,-0.198618,0.870651,-0.310403,0.477619,1.037334,0.393353,0.072310,0.404071,1.660129,-1.040414,-0.446636,0.875244,0.132347,0.697683,-0.152958,0.979845,0.658476,0.654405,0.042578,0.165360,-1.192311,-0.132612
13797,-0.682158,-0.453200,-0.604947,-0.427660,-0.854314,0.447341,1.483037,-0.378689,0.659514,0.318415,-0.864406,0.013310,0.237277,-1.005566,0.190932,-0.957552,0.675892,-0.955150,-0.122081,0.675742,-0.540036,-0.227939,0.601952,-0.501973,0.894821,-1.167377,1.592275,-0.626236,-0.169254,0.486291,0.152043,-0.744214,0.400918,0.315237,-0.123761,0.814103,0.261765,-0.114260,0.118798,-0.353864,-0.130260,0.081654,-0.123872,-0.147758,-0.072810,-0.359122,-0.056459,-0.055749,-0.087572,0.101572,-0.362936,-0.083835,-0.309421,0.998742,-0.058402,0.636329,0.898961,0.372729,-0.224160,0.040737,-0.036582,0.041309,-1.274278,-0.514619,0.213266,-0.400159,-0.061703,-0.026494,-0.240602,0.238658
7187,0.883191,-0.628970,-0.138745,-0.395949,-0.677243,-0.077889,0.888217,-1.471141,0.984280,-0.270836,-1.357445,0.152312,-0.107274,0.722425,0.480471,-0.035477,-0.317813,-0.279551,0.543515,1.223268,1.008940,0.101920,-0.805361,0.819000,0.709610,0.044344,0.701638,-1.327349,2.647716,1.191227,0.068937,1.562730,1.154481,0.495875,0.156253,-0.148350,-0.040957,0.193543,0.807422,0.095047,0.064899,-0.390533,-0.220598,-0.361735,0.158634,-0.485644,0.019696,-1.247037,0.086159,-0.751369,0.028243,-0.184335,0.451988,0.332487,-0.453526,-0.236066,0.329388,0.221456,-0.174150,0.249712,0.001412,-0.515614,0.343971,-0.918222,-0.179663,0.516171,0.418448,-0.734821,0.988377,-0.845567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9160,-4.416079,1.102799,0.436600,-0.791648,-0.575724,1.183348,0.768142,-0.203186,0.468879,0.783771,1.528678,1.270099,0.491761,-2.063891,0.512719,2.472426,-1.012928,2.932259,1.281606,0.615624,0.865094,0.459470,-0.023149,3.177853,-1.053403,-1.009497,-0.054285,-0.551881,-2.155714,0.518893,-0.221223,1.547251,0.376715,-0.407531,-0.338794,0.598454,0.278187,0.488633,-0.856310,-0.403254,0.033942,0.122073,-0.363691,0.461777,-0.128597,-0.249251,-0.454100,0.523028,-0.448149,0.028782,-0.135844,0.093811,-0.827509,-0.746869,-0.103725,0.021518,-0.067840,-0.524328,0.211730,0.981039,-0.076735,-0.853488,-0.194710,-0.196350,-0.135176,0.363231,0.124654,0.356271,0.059960,0.328724
9859,-2.143880,1.665801,0.789821,-0.785610,-0.264490,0.315530,-1.434967,1.036741,0.211630,-0.072298,0.021842,-0.632155,-0.391582,1.179187,-0.316772,1.086784,-0.615991,-0.429507,0.335412,0.316083,0.445233,0.977906,-0.196570,1.177182,1.695521,-0.565488,0.819372,1.752352,-0.278349,-0.019043,-0.011587,0.135022,0.070378,1.018500,1.061700,0.737859,-0.363005,-1.427690,-0.026007,0.537002,-0.286080,0.285465,0.143285,0.068058,-0.259054,0.058897,0.247133,-0.537154,-0.357864,-0.092401,1.247357,-0.719840,2.124715,0.678066,0.824284,1.346624,-0.038305,-0.089391,1.016015,-0.235862,0.542302,-0.295606,0.110578,-0.618428,-0.551237,-0.264587,-0.104423,-0.828094,1.074946,-0.050614
11513,-4.353440,0.785000,-0.129447,-0.396874,-0.123801,0.258691,0.418181,-0.448721,-0.086363,0.861847,0.429708,0.359742,0.283324,-0.558545,0.165460,0.618874,0.188215,0.454787,0.647286,-0.795850,-0.399853,-0.733307,1.593840,-0.994778,0.086231,0.662734,-0.115547,-0.068034,0.012924,-0.208671,-0.229994,-0.409795,-0.362885,-0.044451,0.393454,-0.249141,-0.171972,-0.028928,-0.161670,-0.050623,0.094978,-0.116626,0.322804,-0.093403,0.003318,0.054572,0.374344,0.256900,0.299293,0.430790,0.174351,-0.208778,0.225815,-0.167810,-0.335766,-0.876843,-0.038946,-0.060393,-0.166232,-0.246560,0.123113,-0.412905,0.127192,0.239575,-0.649260,0.835131,0.086955,0.029982,-0.924371,-0.061468
1688,9.967544,8.214670,1.054073,-2.097108,-2.994836,-2.888766,0.485652,-1.254194,-0.467541,2.066309,-0.241233,-3.060516,-0.178853,0.968432,0.782858,0.609045,0.725832,-0.490874,0.877850,-0.767645,-0.884070,0.053656,-0.809207,-0.541787,-2.428091,-2.726630,-1.463370,0.249584,-0.929942,-0.961187,-0.423190,-2.915337,-1.011262,0.070861,-0.508757,-0.546097,0.360496,0.701149,0.688232,0.712132,0.050156,0.002385,0.105798,0.204756,0.440941,1.497607,-0.143219,0.439241,-0.164971,0.481825,0.119024,0.451948,-1.420612,-3.437087,1.066155,0.635461,-1.217289,-1.873153,-1.262062,0.028157,-0.452046,0.795572,0.032284,0.134588,0.909998,-1.645018,0.058352,1.425435,-0.092605,-1.080604
