In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# packages to Scrape the data
from bs4 import BeautifulSoup
import requests

# packages for weekday/weekend features extraction
import re
import datetime as dt

# packages for Polarity & Subjectivity features extraction
from textblob import TextBlob

# packages for NER features extraction
from collections import Counter
import nltk
from nltk.tokenize import sent_tokenize
import spacy
from spacy import displacy
import en_core_web_sm

from BasicFeatures import BasicFeaturesCreator as bfc
from KeywordsPopularity import KeyPop

import pickle
import joblib

import string
from nltk.corpus import stopwords
from ONPdoc2vec import ONPd2v

In [2]:
# Lift pandas' display limits so wide / long frames are shown in full
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# 1. Load the url

In [3]:
# Address of the article to process and the Id it is tracked under
test_url = 'http://mashable.com/2014/09/01/cuba-restrictions/'
url_id = 10

In [4]:
# Build the one-row input table in a single constructor call.
# DataFrame.append (used previously) was deprecated in pandas 1.4 and removed in 2.0.
df = pd.DataFrame([{'Id': url_id, 'url': test_url}], columns=["Id", "url"])
df

Unnamed: 0,Id,url
0,10,http://mashable.com/2014/09/01/cuba-restrictions/


# 2. Data Scraping

In [5]:
# Paragraphs carrying any of these CSS classes hold repeating / unnecessary text and are skipped
exclude_class_list = ["top-stories-promo-story__summary"]

# Paragraphs that begin with any of these strings are skipped as well
exclude_starts_with = ["Additional reporting by"]

# Captures the content of the page's <meta ... name="keywords" /> tag in group "keyword1".
# Written as a raw string: the compiled pattern is unchanged, but the regex escapes
# (\s, \/, \-) are no longer invalid Python string escapes.
regex_keyword = r"""<meta content="(?P<keyword1>[^><\/"]*)"\s[a-zA-Z="\-]*\sname="keywords"\s+\/+>"""

# Header info
header_info = pd.DataFrame(columns=["Id"])

In [6]:
# Scrape every url listed in df and keep one record per successfully fetched page.
# Records are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row also left
# every row with index label 0.
scrap_columns = ["Id", "url", "title", "content", "html"]
scraped_records = []
out_index = 0

# In case there are more than 1 url
for index, rows in df.iterrows():
    doc_id = rows["Id"]
    url = rows["url"]

    # A timeout keeps an unresponsive server from hanging the notebook forever
    resp = requests.get(url, timeout=30)

    if resp.status_code != 200:
        print("Issue in status code : " + str(doc_id)+ " : "+ url + " : " + str(resp.status_code))
        continue

    html_doc = resp.text
    soup = BeautifulSoup(html_doc, 'html.parser')
    content = "" # soup.title.string + "\n"

    for p in soup.select("p"):
        text = p.get_text()

        # Skip empty / single-word paragraphs
        if len(text.split()) <= 1:
            continue

        # Skip text that is already part of the collected content (repeated blocks)
        if text in content:
            continue

        has_excluded_class = p.has_attr("class") and any(
            exclude_class in p.get_attribute_list("class")
            for exclude_class in exclude_class_list
        )
        has_excluded_prefix = any(
            text.startswith(starts_string) for starts_string in exclude_starts_with
        )

        if not (has_excluded_class or has_excluded_prefix):
            content = content + text + "\n"

    # A page without a <title> tag would otherwise raise AttributeError on None
    title_string = soup.title.string if soup.title is not None else ""

    scraped_records.append({"Id": doc_id, "url": url, "title": title_string,
                            "html": html_doc, "content": content})
    out_index += 1

df_scrap = pd.DataFrame(scraped_records, columns=scrap_columns)


In [7]:
# Preview the scraped records (Id, url, title, content, html)
df_scrap.head()

Unnamed: 0,Id,url,title,content,html
0,10,http://mashable.com/2014/09/01/cuba-restrictions/,Cuba Cracks Down on What Travelers Can Bring I...,"Over the last five years, travel restrictions ...",<!DOCTYPE html>\n<html data-env='production' l...


# 3. Generate Features

### 0. Extract keywords, which are used to create some of the features

In [8]:
# Captures the content of the page's <meta ... name="keywords" /> tag in group "keyword1".
# Raw string: same pattern as before, without invalid Python string escapes.
regex_keyword = r"""<meta content="(?P<keyword1>[^><\/"]*)"\s[a-zA-Z="\-]*\sname="keywords"\s+\/+>"""

# Explicit copy so the new column is not written into a slice of df_scrap
df_keyword = df_scrap[["Id"]].copy()

# Extract the keywords of EACH document from its own html.
# (Previously every row re-read df_scrap.loc[0, 'html'], i.e. the first document.)
keywords = ""
extracted_keywords = []
for html_doc in df_scrap['html']:
    match = re.search(regex_keyword, html_doc)
    keywords = match.group("keyword1") if match else ""
    extracted_keywords.append(keywords)

# Positional assignment: does not depend on the index labels of df_scrap
df_keyword['keywords'] = extracted_keywords
print(keywords)

cuba, uncategorized, us, world, lifestyle, travel-leisure


### 1. Day of the week or weekend features

In [9]:
# Day-of-week one-hot flags and a weekend flag, derived from the date embedded in each url.
day_columns = ['is__Monday', 'is__Tuesday', 'is__Wednesday', 'is__Thursday', 'is__Friday', 'is__Saturday', 'is__Sunday', 'is_Weekend']

def weekend(row):
    """Return 1 if the row is flagged as Saturday or Sunday, otherwise 0."""
    if (row['is__Saturday'] == 1) or (row['is__Sunday'] == 1):
        return 1
    return 0

df_days = df_scrap[['Id', 'url']].reset_index(drop=True)

# Fetch date from url (the YYYY/MM/DD part)
url_dates = pd.to_datetime(
    df_days['url'].str.extract(r"(\d{4}/\d{2}/\d{2})", expand=False),
    format='%Y/%m/%d',
)
if url_dates.isna().any():
    raise ValueError("No YYYY/MM/DD date found in url(s): "
                     + ", ".join(df_days.loc[url_dates.isna(), 'url']))

# Fetch day of the week and flag it per row.
# (Previously `df_days['is__' + day_of_week] = 1` set the flag for ALL rows, and the
# zero frame was hard-coded to a single row.)
day_of_week = url_dates.dt.day_name()
for day_column in day_columns[:-1]:
    df_days[day_column] = (day_of_week == day_column[len('is__'):]).astype(int)

# Flag whether the day falls on a weekend
df_days['is_Weekend'] = df_days.apply(weekend, axis=1)

df_days = df_days.drop(columns=['url'])

df_days.head()

Unnamed: 0,Id,is__Monday,is__Tuesday,is__Wednesday,is__Thursday,is__Friday,is__Saturday,is__Sunday,is_Weekend
0,10,1,0,0,0,0,0,0,0


### 2. Polarity features

In [10]:
# Work on a copy so the sentiment columns are not added to df_scrap
df_pol=df_scrap.copy()
df_pol.head()

Unnamed: 0,Id,url,title,content,html
0,10,http://mashable.com/2014/09/01/cuba-restrictions/,Cuba Cracks Down on What Travelers Can Bring I...,"Over the last five years, travel restrictions ...",<!DOCTYPE html>\n<html data-env='production' l...


In [11]:
# Polarity and subjectivity (TextBlob) of the title and of the content of every document.
# The columns are built from lists of floats in one step. Previously they were created
# as int 0 and then filled with floats through .loc, which upcasts the column
# (an incompatible-dtype assignment that recent pandas versions deprecate) and writes
# to every row sharing the same index label.
startTimeModule = dt.datetime.now()

title_sentiments = [TextBlob(str(text)).sentiment for text in df_pol['title']]
content_sentiments = [TextBlob(str(text)).sentiment for text in df_pol['content']]

df_pol['polarity_title'] = [sentiment.polarity for sentiment in title_sentiments]
df_pol['subjectivity_title'] = [sentiment.subjectivity for sentiment in title_sentiments]
df_pol['polarity_content'] = [sentiment.polarity for sentiment in content_sentiments]
df_pol['subjectivity_content'] = [sentiment.subjectivity for sentiment in content_sentiments]

print ('Sentiment Polarity & Subjectivity extraction time:',(dt.datetime.now() - startTimeModule))

Sentiment Polarity & Subjectivity extraction time: 0:00:00.082779


In [12]:
# Keep only the Id and the four sentiment columns
df_pol = df_pol.drop(columns=['url', 'title', 'content', 'html'])
df_pol.head()

Unnamed: 0,Id,polarity_title,subjectivity_title,polarity_content,subjectivity_content
0,10,-0.155556,0.288889,0.055053,0.435264


### 3. LDA, NMF and LSI features 

In [13]:
# Number of feature columns generated per topic model output (see the loop over `models`)
NUM_TOPICS=5

In [14]:
# Work on a copy of the scraped data; the topic-model columns are merged into it later
df_lda_nmf_lsi=df_scrap.copy()
df_lda_nmf_lsi.head()

Unnamed: 0,Id,url,title,content,html
0,10,http://mashable.com/2014/09/01/cuba-restrictions/,Cuba Cracks Down on What Travelers Can Bring I...,"Over the last five years, travel restrictions ...",<!DOCTYPE html>\n<html data-env='production' l...


In [15]:
# Titles as unicode strings, with empty entries dropped
titles = [title for title in df_lda_nmf_lsi['title'].values.astype('U').tolist() if title]
titles

['Cuba Cracks Down on What Travelers Can Bring Into the Country']

In [16]:
# Article bodies as unicode strings, with empty entries dropped
content = [body for body in df_lda_nmf_lsi['content'].values.astype('U').tolist() if body]

In [17]:
# Keyword strings as unicode, with empty entries dropped
keywords = [keyword_text for keyword_text in df_keyword['keywords'].values.astype('U').tolist() if keyword_text]
keywords

['cuba, uncategorized, us, world, lifestyle, travel-leisure']

In [18]:
# Load the persisted vectorizers for titles (T), content (C) and keywords (K).
# joblib.load is given the path, so it opens and closes the file itself;
# the open(...) handles passed previously were never closed.
# NOTE(review): joblib/pickle files can execute code on load - only load trusted artifacts.
vectorizerT = joblib.load('../data/output/models/vectorizerT.pkl')
vectorizerC = joblib.load('../data/output/models/vectorizerC.pkl')
vectorizerK = joblib.load('../data/output/models/vectorizerK.pkl')

# Vectorize the texts of the scraped document(s)
title_vectorized = vectorizerT.transform(titles)
content_vectorized = vectorizerC.transform(content)
keyword_vectorized = vectorizerK.transform(keywords)

In [19]:
# load and transform LDA
# joblib.load is given the path, so no file handle is left open.
# NOTE(review): joblib/pickle files can execute code on load - only load trusted artifacts.
lda_modelT = joblib.load('../data/output/models/lda_modelT.pkl')
lda_modelC = joblib.load('../data/output/models/lda_modelC.pkl')
lda_modelK = joblib.load('../data/output/models/lda_modelK.pkl')

lda_title = lda_modelT.transform(title_vectorized)
lda_content = lda_modelC.transform(content_vectorized)
lda_keyword = lda_modelK.transform(keyword_vectorized)

In [20]:
# load and transform NMF
# joblib.load is given the path, so no file handle is left open.
# NOTE(review): joblib/pickle files can execute code on load - only load trusted artifacts.
nmf_modelT = joblib.load('../data/output/models/nmf_modelT.pkl')
nmf_modelC = joblib.load('../data/output/models/nmf_modelC.pkl')
nmf_modelK = joblib.load('../data/output/models/nmf_modelK.pkl')

nmf_title = nmf_modelT.transform(title_vectorized)
nmf_content = nmf_modelC.transform(content_vectorized)
nmf_keyword = nmf_modelK.transform(keyword_vectorized)

In [21]:
# load and transform LSI
# joblib.load is given the path, so no file handle is left open.
# NOTE(review): joblib/pickle files can execute code on load - only load trusted artifacts.
lsi_modelT = joblib.load('../data/output/models/lsi_modelT.pkl')
lsi_modelC = joblib.load('../data/output/models/lsi_modelC.pkl')
lsi_modelK = joblib.load('../data/output/models/lsi_modelK.pkl')

lsi_title = lsi_modelT.transform(title_vectorized)
lsi_content = lsi_modelC.transform(content_vectorized)
lsi_keyword = lsi_modelK.transform(keyword_vectorized)

In [22]:
# (name, transformed matrix) pairs; the first five characters of each name
# become the column prefix of the generated features
models = [
    ('LDA_Title', lda_title),
    ('NMF_Title', nmf_title),
    ('LSI_Title', lsi_title),
    ('LDA_Content', lda_content),
    ('NMF_Content', nmf_content),
    ('LSI_Content', lsi_content),
    ('LDA_Keyword', lda_keyword),
    ('NMF_Keyword', nmf_keyword),
    ('LSI_Keyword', lsi_keyword),
]

In [23]:
# Turn every transformed matrix into NUM_TOPICS named columns and merge them in by Id
for name, model in models:
    prefix = name[:5]  # e.g. 'LDA_Title' -> 'LDA_T'
    cols = [prefix + str(topic) for topic in range(NUM_TOPICS)]
    tmp = pd.DataFrame(model, columns=cols)
    tmp['Id'] = df_scrap['Id']
    df_lda_nmf_lsi = pd.merge(df_lda_nmf_lsi, tmp, on='Id', how='left')

In [24]:
# Keep only the Id and the topic-model feature columns
df_lda_nmf_lsi = df_lda_nmf_lsi.drop(columns=['url', 'title', 'content', 'html'])
df_lda_nmf_lsi.head(2)

Unnamed: 0,Id,LDA_T0,LDA_T1,LDA_T2,LDA_T3,LDA_T4,NMF_T0,NMF_T1,NMF_T2,NMF_T3,NMF_T4,LSI_T0,LSI_T1,LSI_T2,LSI_T3,LSI_T4,LDA_C0,LDA_C1,LDA_C2,LDA_C3,LDA_C4,NMF_C0,NMF_C1,NMF_C2,NMF_C3,NMF_C4,LSI_C0,LSI_C1,LSI_C2,LSI_C3,LSI_C4,LDA_K0,LDA_K1,LDA_K2,LDA_K3,LDA_K4,NMF_K0,NMF_K1,NMF_K2,NMF_K3,NMF_K4,LSI_K0,LSI_K1,LSI_K2,LSI_K3,LSI_K4
0,10,0.041426,0.838571,0.040001,0.040001,0.040001,0.000214,0.002627,0.000171,0.005098,0.001038,0.005416,0.012549,0.003715,0.020846,-0.001443,0.00052,0.667497,0.271204,0.039268,0.021512,0.120748,0.055163,0.440346,0.005522,0.0115,9.671483,-2.640793,-2.7058,-4.723799,-0.407795,0.040067,0.43966,0.440043,0.04,0.04023,0.079872,0.006842,0.003658,0.00055,0.023077,0.641962,-0.069053,0.019139,-0.02275,0.158595


### 4. NER features extraction

In [25]:
# spaCy pipeline used for named-entity recognition
nlp = en_core_web_sm.load()

# Copy of the scraped data with empty placeholder columns that later cells fill in
df_ner = df_scrap.copy()
for placeholder_column in ("sent_list", "refined_content", "NER_list", "NER_most_common"):
    df_ner[placeholder_column] = None

df_ner.head()

Unnamed: 0,Id,url,title,content,html,sent_list,refined_content,NER_list,NER_most_common
0,10,http://mashable.com/2014/09/01/cuba-restrictions/,Cuba Cracks Down on What Travelers Can Bring I...,"Over the last five years, travel restrictions ...",<!DOCTYPE html>\n<html data-env='production' l...,,,,


In [26]:
# Split each document into sentences, drop noise sentences, and store both the
# kept sentence list and the re-joined text.
ignored_sent_count = 0

for index, row in df_ner.iterrows():
    # Turn line breaks into sentence boundaries before tokenizing
    d = str(row.content)
    for old, new in ((".\n", ". "), (".\r", ". "), ("\n", ". "), ("\r", ". ")):
        d = d.replace(old, new)

    sent_list = []
    for each_sent in sent_tokenize(d):
        # A sentence consisting of a single dot carries no text
        if each_sent == ".":
            continue

        # Embedded player JSON is not article text
        if each_sent.startswith("{\"player\":{\"description\":"):
            continue

        # Ignore non-English sentences: more than 50% of the characters
        # have a code point above 255
        non_english_count = sum(1 for c in each_sent if ord(c) > 255)
        if len(each_sent) > 2 and non_english_count > len(each_sent)/2:
            ignored_sent_count += 1
            continue

        sent_list.append(each_sent)

    refined_content = " ".join(sent_list)

    df_ner.at[index, 'sent_list'] = sent_list
    df_ner.at[index, 'refined_content'] = refined_content


In [27]:
# One counter column per entity label, initialised to 0.0
ner_num_cols = ['NER_GPE', 'NER_DATE', 'NER_CARDINAL', 'NER_NORP', 'NER_PERSON', 'NER_TIME', 'NER_ORG', 'NER_WORK_OF_ART',
                'NER_QUANTITY','NER_EVENT', 'NER_ORDINAL', 'NER_MONEY', 'NER_FAC', 'NER_PRODUCT', 'NER_LAW',
                'NER_PERCENT', 'NER_LOC', 'NER_LANGUAGE']

# Size the zero matrix from the data instead of the hard-coded (1, 18), and copy the
# Ids positionally so the result does not depend on df_scrap's index labels
df_ner_num_temp = pd.DataFrame(np.zeros((len(df_scrap), len(ner_num_cols))), columns = ner_num_cols)
df_ner_num_temp['Id'] = df_scrap['Id'].values
df_ner = pd.merge(df_ner, df_ner_num_temp, on='Id')

In [28]:
# Count named entities per label for every document and record the entity texts.
NER_labels = set()
for index, row in df_ner.iterrows():
    sample_doc = nlp(df_ner.at[index, 'refined_content'])

    # Number of entities per label in this document
    label_counts = Counter(ent.label_ for ent in sample_doc.ents)
    NER_labels.update(label_counts)

    for label, label_count in label_counts.items():
        df_ner.at[index, "NER_" + label] = label_count

    # extract individual NER entities
    counter = Counter(ent.text for ent in sample_doc.ents)
    NER_list = list(counter)
    NER_most_common = [entity_text for entity_text, entity_count in counter.most_common(10)]

    df_ner.at[index, "NER_list"] = ",".join(NER_list)
    df_ner.at[index, "NER_most_common"] = ",".join(NER_most_common)

# set NaNs in df to 0
# Assign the filled columns back: fillna(inplace=True) on a selected column is a
# chained operation that may act on a temporary copy and leave df_ner unchanged.
ner_label_cols = ["NER_" + label for label in NER_labels]
if ner_label_cols:
    df_ner[ner_label_cols] = df_ner[ner_label_cols].fillna(0)


In [29]:
# Numeric NER counts only: drop the text and helper columns
non_numeric_ner_columns = ['url', 'title', 'content', 'html', 'sent_list',
                           'refined_content', 'NER_list', 'NER_most_common']
df_ner_num = df_ner.drop(columns=non_numeric_ner_columns)
df_ner_num.head()

Unnamed: 0,Id,NER_GPE,NER_DATE,NER_CARDINAL,NER_NORP,NER_PERSON,NER_TIME,NER_ORG,NER_WORK_OF_ART,NER_QUANTITY,NER_EVENT,NER_ORDINAL,NER_MONEY,NER_FAC,NER_PRODUCT,NER_LAW,NER_PERCENT,NER_LOC,NER_LANGUAGE
0,10,16.0,11.0,13.0,13.0,5.0,0.0,4.0,2.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0


### 5. Few more NER features extraction

In [30]:
# Text columns that get embedded next: the entity lists plus the page keywords
df_ner2_temp = df_ner.loc[:, ['Id', 'NER_list', 'NER_most_common']].assign(
    keywords=df_keyword['keywords']
)
df_ner2_temp.head()

Unnamed: 0,Id,NER_list,NER_most_common,keywords
0,10,"the last five years,the United States,Cuba,as ...","Cuba,Cuban,Miami,two,Cubans,Monday,recent days...","cuba, uncategorized, us, world, lifestyle, tra..."


In [31]:
# For each text column, load its persisted doc2vec model and add the inferred
# vector (`size` columns) and its PCA projection (`pca_count` columns).
column_list= ['keywords', 'NER_list', 'NER_most_common']

size = 100
pca_count = 10
df_ner_keyword_list_common = df[["Id"]]

for column in column_list:

    df_ner2 = df_ner2_temp.loc[:, ['Id', column]]

    # Load the model.
    # Forward slashes (as in the joblib.load cells) work on Windows and POSIX; the previous
    # backslash path only resolved on Windows. The `with` block closes the file handle.
    # NOTE(review): pickle.load can execute code - only load trusted model files.
    with open("../data/output/models/d2v_" + column + ".model", "rb") as model_file:
        d2v_model = pickle.load(model_file)

    all_columns_list = [ column + "_" + str(i) for i in range(size)]
    pca_columns_list = [ column + "_pca_" + str(i) for i in range(pca_count)]

    # Explicit copy so the feature columns are not written into a slice of df_ner2
    df_ner2_all = df_ner2[["Id"]].copy()
    for pca_column in pca_columns_list:
        df_ner2_all[ pca_column] = 0.0

    for all_column in all_columns_list:
        df_ner2_all[ all_column] = 0.0

    for index, row in df_ner2_all.iterrows():
        document_text = df_ner2.at[ index, column]

        pca_values = d2v_model.infer_vector_pca( document_text)
        for i, pca_column in enumerate(pca_columns_list):
            df_ner2_all.at[ index, pca_column] = pca_values[i]

        all_values = d2v_model.infer_vector( document_text)
        for i, all_column in enumerate(all_columns_list):
            df_ner2_all.at[ index, all_column] = all_values[i]

    df_ner_keyword_list_common = pd.merge(df_ner_keyword_list_common, df_ner2_all, on='Id')
    

In [32]:
# Preview the doc2vec and PCA feature columns built for keywords, NER_list and NER_most_common
df_ner_keyword_list_common.head()

Unnamed: 0,Id,keywords_pca_0,keywords_pca_1,keywords_pca_2,keywords_pca_3,keywords_pca_4,keywords_pca_5,keywords_pca_6,keywords_pca_7,keywords_pca_8,keywords_pca_9,keywords_0,keywords_1,keywords_2,keywords_3,keywords_4,keywords_5,keywords_6,keywords_7,keywords_8,keywords_9,keywords_10,keywords_11,keywords_12,keywords_13,keywords_14,keywords_15,keywords_16,keywords_17,keywords_18,keywords_19,keywords_20,keywords_21,keywords_22,keywords_23,keywords_24,keywords_25,keywords_26,keywords_27,keywords_28,keywords_29,keywords_30,keywords_31,keywords_32,keywords_33,keywords_34,keywords_35,keywords_36,keywords_37,keywords_38,keywords_39,keywords_40,keywords_41,keywords_42,keywords_43,keywords_44,keywords_45,keywords_46,keywords_47,keywords_48,keywords_49,keywords_50,keywords_51,keywords_52,keywords_53,keywords_54,keywords_55,keywords_56,keywords_57,keywords_58,keywords_59,keywords_60,keywords_61,keywords_62,keywords_63,keywords_64,keywords_65,keywords_66,keywords_67,keywords_68,keywords_69,keywords_70,keywords_71,keywords_72,keywords_73,keywords_74,keywords_75,keywords_76,keywords_77,keywords_78,keywords_79,keywords_80,keywords_81,keywords_82,keywords_83,keywords_84,keywords_85,keywords_86,keywords_87,keywords_88,keywords_89,keywords_90,keywords_91,keywords_92,keywords_93,keywords_94,keywords_95,keywords_96,keywords_97,keywords_98,keywords_99,NER_list_pca_0,NER_list_pca_1,NER_list_pca_2,NER_list_pca_3,NER_list_pca_4,NER_list_pca_5,NER_list_pca_6,NER_list_pca_7,NER_list_pca_8,NER_list_pca_9,NER_list_0,NER_list_1,NER_list_2,NER_list_3,NER_list_4,NER_list_5,NER_list_6,NER_list_7,NER_list_8,NER_list_9,NER_list_10,NER_list_11,NER_list_12,NER_list_13,NER_list_14,NER_list_15,NER_list_16,NER_list_17,NER_list_18,NER_list_19,NER_list_20,NER_list_21,NER_list_22,NER_list_23,NER_list_24,NER_list_25,NER_list_26,NER_list_27,NER_list_28,NER_list_29,NER_list_30,NER_list_31,NER_list_32,NER_list_33,NER_list_34,NER_list_35,NER_list_36,NER_list_37,NER_list_38,NER_list_39,NER_list_40,NER_list_41,NE
R_list_42,NER_list_43,NER_list_44,NER_list_45,NER_list_46,NER_list_47,NER_list_48,NER_list_49,NER_list_50,NER_list_51,NER_list_52,NER_list_53,NER_list_54,NER_list_55,NER_list_56,NER_list_57,NER_list_58,NER_list_59,NER_list_60,NER_list_61,NER_list_62,NER_list_63,NER_list_64,NER_list_65,NER_list_66,NER_list_67,NER_list_68,NER_list_69,NER_list_70,NER_list_71,NER_list_72,NER_list_73,NER_list_74,NER_list_75,NER_list_76,NER_list_77,NER_list_78,NER_list_79,NER_list_80,NER_list_81,NER_list_82,NER_list_83,NER_list_84,NER_list_85,NER_list_86,NER_list_87,NER_list_88,NER_list_89,NER_list_90,NER_list_91,NER_list_92,NER_list_93,NER_list_94,NER_list_95,NER_list_96,NER_list_97,NER_list_98,NER_list_99,NER_most_common_pca_0,NER_most_common_pca_1,NER_most_common_pca_2,NER_most_common_pca_3,NER_most_common_pca_4,NER_most_common_pca_5,NER_most_common_pca_6,NER_most_common_pca_7,NER_most_common_pca_8,NER_most_common_pca_9,NER_most_common_0,NER_most_common_1,NER_most_common_2,NER_most_common_3,NER_most_common_4,NER_most_common_5,NER_most_common_6,NER_most_common_7,NER_most_common_8,NER_most_common_9,NER_most_common_10,NER_most_common_11,NER_most_common_12,NER_most_common_13,NER_most_common_14,NER_most_common_15,NER_most_common_16,NER_most_common_17,NER_most_common_18,NER_most_common_19,NER_most_common_20,NER_most_common_21,NER_most_common_22,NER_most_common_23,NER_most_common_24,NER_most_common_25,NER_most_common_26,NER_most_common_27,NER_most_common_28,NER_most_common_29,NER_most_common_30,NER_most_common_31,NER_most_common_32,NER_most_common_33,NER_most_common_34,NER_most_common_35,NER_most_common_36,NER_most_common_37,NER_most_common_38,NER_most_common_39,NER_most_common_40,NER_most_common_41,NER_most_common_42,NER_most_common_43,NER_most_common_44,NER_most_common_45,NER_most_common_46,NER_most_common_47,NER_most_common_48,NER_most_common_49,NER_most_common_50,NER_most_common_51,NER_most_common_52,NER_most_common_53,NER_most_common_54,NER_most_common_55,NER_most_common_56,NER_most_comm
on_57,NER_most_common_58,NER_most_common_59,NER_most_common_60,NER_most_common_61,NER_most_common_62,NER_most_common_63,NER_most_common_64,NER_most_common_65,NER_most_common_66,NER_most_common_67,NER_most_common_68,NER_most_common_69,NER_most_common_70,NER_most_common_71,NER_most_common_72,NER_most_common_73,NER_most_common_74,NER_most_common_75,NER_most_common_76,NER_most_common_77,NER_most_common_78,NER_most_common_79,NER_most_common_80,NER_most_common_81,NER_most_common_82,NER_most_common_83,NER_most_common_84,NER_most_common_85,NER_most_common_86,NER_most_common_87,NER_most_common_88,NER_most_common_89,NER_most_common_90,NER_most_common_91,NER_most_common_92,NER_most_common_93,NER_most_common_94,NER_most_common_95,NER_most_common_96,NER_most_common_97,NER_most_common_98,NER_most_common_99
0,10,-4.390699,-0.481457,1.103631,0.448338,1.034572,0.034248,-0.737114,0.102756,0.517266,0.117024,-0.002244,-0.00348,-0.003433,-0.001274,-0.000388,-0.001013,0.002754,0.001351,0.00111,0.001075,-0.006459,5.9e-05,0.00739,0.004669,0.003466,-0.000862,0.004827,0.001696,-0.002811,-0.004023,-0.002459,0.002962,-0.002221,-0.006448,-4.2e-05,-0.005483,3.4e-05,-8.2e-05,-0.002784,0.002032,0.003502,-0.008894,-0.001117,-0.000232,0.000938,0.003496,-0.000253,0.001662,0.005689,0.002415,-0.004387,-0.004293,-0.008464,0.006051,-0.00233,-0.001028,-0.001035,-0.000671,-0.002104,0.005886,0.004236,0.004396,-0.001156,-0.001534,-0.001548,-0.001516,0.000268,-0.002986,0.003198,0.002155,0.000108,-0.004032,-0.005231,-0.00136,0.000146,0.006338,6.4e-05,-0.008803,0.000939,0.003408,0.001083,-0.000109,-0.002974,0.005612,-0.002336,-0.001044,0.003492,0.000634,-0.005463,-0.004194,0.001799,-0.00069,-0.004199,-0.002323,-0.003424,-0.000249,0.0059,-5.8e-05,-0.002938,3.1e-05,-0.00151,0.0025,0.007084,-0.003406,0.00183,0.001158,0.002496,-0.004693,0.00528,0.001992,38.270031,1.228388,1.873711,0.016183,-0.119524,-0.905342,0.717487,0.60638,0.890784,0.527589,0.088612,0.045442,-0.157501,-0.019259,-0.054413,0.030879,0.102351,-0.084786,0.057594,-0.001581,0.018039,0.054913,-0.040126,-0.044942,-0.050893,-0.052698,0.012489,0.03952,0.066454,-0.027454,-0.007305,0.041969,0.021993,-0.002804,-0.012246,-0.002525,-0.007359,0.025612,-0.017015,0.025265,-0.053178,0.028309,0.006301,0.052204,-0.042238,0.034265,-0.064297,-0.011057,0.125023,0.048556,-0.100968,0.029567,-0.063681,0.072369,0.053247,0.037009,0.000129,-0.014661,-0.050709,-0.013191,0.026592,-0.009894,-0.037257,-0.040796,-0.077459,0.037693,0.129846,-0.020339,0.024291,0.045863,-0.006545,-0.035789,0.021275,0.009701,-0.061859,0.032424,0.077737,-0.056691,0.036868,0.025067,-0.001433,0.024106,0.014204,0.029163,-0.027725,-0.090843,-0.045196,-0.026453,0.059695,0.028214,0.002366,0.026813,0.047856,0.07314,-0.026957,-0.057208,0.101262,0.095265,-0.00982,0.010591,-0.07452,-0.017581,-0.02786
,-0.099695,0.050692,-0.014567,0.041803,0.061844,0.044704,-0.036647,2.550587,4.912198,0.1637,-1.037501,-0.319436,0.570994,-0.461211,0.548656,-0.618925,-1.358165,0.00687,-0.004149,-0.002243,-0.004518,-0.001426,0.004714,-0.003337,-0.000414,0.002059,0.00536,-0.000879,0.001392,-0.002117,0.001045,-0.004812,0.003285,0.000816,-0.000666,0.00118,0.001878,0.000721,0.00012,0.003683,0.00039,0.000408,-0.000856,0.001424,0.002587,-0.003412,0.000183,0.002333,0.001236,0.003074,0.004901,0.00039,0.003028,-0.000753,-0.002524,0.00246,0.001177,-0.002465,-0.003603,-0.005411,-4e-06,0.002653,0.001665,-0.003092,0.002233,-0.000374,0.003099,0.001594,-0.002531,-0.003736,0.003242,8e-05,0.001193,0.003696,0.003928,0.003449,0.000743,0.00314,0.000606,-0.001108,-0.004531,4.3e-05,0.004016,-0.003574,0.002612,-0.00361,0.001772,0.004902,-0.002416,-0.004257,-0.00253,-0.002346,-0.001042,-0.001051,-0.003427,-0.002191,0.006911,-0.004177,0.003408,0.003396,-0.001161,-0.003369,-0.004518,0.003576,-0.000995,-0.003357,-0.00359,-0.002298,0.000649,-0.003059,-0.005989,0.002784,-0.00146,-0.000584,0.004929,0.005671,-0.003219


In [33]:
# Keep only the PCA columns, then re-attach the Id as the last column
is_pca_column = df_ner_keyword_list_common.columns.str.contains("pca")
df_ner_keyword_list_common_pcas = (
    df_ner_keyword_list_common
    .loc[:, is_pca_column]
    .assign(Id=df_ner_keyword_list_common['Id'])
)
df_ner_keyword_list_common_pcas

Unnamed: 0,keywords_pca_0,keywords_pca_1,keywords_pca_2,keywords_pca_3,keywords_pca_4,keywords_pca_5,keywords_pca_6,keywords_pca_7,keywords_pca_8,keywords_pca_9,NER_list_pca_0,NER_list_pca_1,NER_list_pca_2,NER_list_pca_3,NER_list_pca_4,NER_list_pca_5,NER_list_pca_6,NER_list_pca_7,NER_list_pca_8,NER_list_pca_9,NER_most_common_pca_0,NER_most_common_pca_1,NER_most_common_pca_2,NER_most_common_pca_3,NER_most_common_pca_4,NER_most_common_pca_5,NER_most_common_pca_6,NER_most_common_pca_7,NER_most_common_pca_8,NER_most_common_pca_9,Id
0,-4.390699,-0.481457,1.103631,0.448338,1.034572,0.034248,-0.737114,0.102756,0.517266,0.117024,38.270031,1.228388,1.873711,0.016183,-0.119524,-0.905342,0.717487,0.60638,0.890784,0.527589,2.550587,4.912198,0.1637,-1.037501,-0.319436,0.570994,-0.461211,0.548656,-0.618925,-1.358165,10


### 5. Clustering the text features

In [34]:
# One-hot cluster membership columns (cluster_0 .. cluster_9), all zero to start with
clus_columns = ['cluster_' + str(cluster_no) for cluster_no in range(10)]
df_clus = pd.DataFrame(np.zeros((1, 10), int), columns = clus_columns)
df_clus['Id'] = df_scrap['Id']
df_clus.head()

Unnamed: 0,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,Id
0,0,0,0,0,0,0,0,0,0,0,10


In [35]:
# load TfIdf Vectorizer and KMeans clustering Model
# joblib.load is given the path, so no file handle is left open.
# NOTE(review): joblib/pickle files can execute code on load - only load trusted artifacts.
vectorizerKmeans = joblib.load('../data/output/models/vectorizer_Tfidf.pkl')
modelKmeans = joblib.load('../data/output/models/Clustering.pkl')

In [36]:
# Predict the cluster of the first scraped document from its content
first_document_content = df_scrap.loc[0,'content']
Y = vectorizerKmeans.transform([first_document_content])
prediction = modelKmeans.predict(Y)
print(prediction[0])

0


In [37]:
# Switch on the column of the predicted cluster
predicted_cluster_column = "cluster_{}".format(prediction[0])
df_clus[predicted_cluster_column] = 1
df_clus.head()

Unnamed: 0,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,Id
0,1,0,0,0,0,0,0,0,0,0,10


### 6. Extract the basic features that are provided in the project's input dataset

In [38]:
# Column name -> dtype of the basic features; the key order is the column order
basic_feature_dtypes = {
    "Id": int,
    "n_tokens_title": int,
    "n_tokens_content": int,
    "n_unique_tokens": float,
    "n_non_stop_words": float,
    "n_non_stop_unique_tokens": float,
    "num_hrefs": int,
    "num_self_hrefs": int,
    "num_imgs": int,
    "num_videos": int,
    "average_token_length": float,
    "num_keywords": int,
    "data_channel_is_lifestyle": int,
    "data_channel_is_entertainment": int,
    "data_channel_is_bus": int,
    "data_channel_is_socmed": int,
    "data_channel_is_tech": int,
    "data_channel_is_world": int,
}

# Empty, typed frame, followed by the Ids of the scraped documents
df_basic = pd.DataFrame(columns=list(basic_feature_dtypes)).astype(basic_feature_dtypes)
df_basic['Id'] = df_scrap['Id']

In [39]:
# Basic features of the first scraped document, computed from its raw html.
# The Id is read from the same df_scrap row as the html. Previously it came from
# `doc_id`, a variable left over from the scraping loop, which holds the LAST url
# visited there (including one that failed to download).
basic_doc_id = df_scrap['Id'].iloc[0]
result = bfc.get_basic_features(df_scrap['html'].iloc[0])
df_basic.loc[0] = [basic_doc_id, result.n_tokens_title, result.n_tokens_content, result.n_unique_tokens, \
                           result.n_non_stop_words, result.n_non_stop_unique_tokens, result.num_hrefs, \
                           result.num_self_hrefs, result.num_imgs, result.num_videos, result.average_token_length, \
                           result.num_keywords, result.data_channel_is_lifestyle, \
                           result.data_channel_is_entertainment, result.data_channel_is_bus, \
                           result.data_channel_is_socmed, result.data_channel_is_tech, result.data_channel_is_world \
                    ]
df_basic

Unnamed: 0,Id,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world
0,10,11.0,915.0,0.455738,0.519126,0.741007,3.0,2.0,1.0,0.0,4.271038,6.0,0.0,0.0,0.0,0.0,0.0,1.0


### 7. Extract features with keywords popularity

In [40]:
# KeyPop instance; its predict_shares() is called on the keywords in the next cell
keyPop = KeyPop()

In [41]:
# Keyword-popularity features of the first document.
# The row is built in one step from values read explicitly from df_keyword. Previously
# it was written through `index` and `row`, loop variables left over from an unrelated
# earlier cell, and was preceded by an astype(...) call whose result was discarded.
kw_columns = ["Id", "kw_avg_avg", "kw_min_avg", "kw_max_avg", \
              "kw_avg_avg_no_clip", "kw_min_avg_no_clip", "kw_max_avg_no_clip"]

predicted_shares = keyPop.predict_shares(df_keyword['keywords'].iloc[0])

df_kw = pd.DataFrame([{
    "Id": df_keyword['Id'].iloc[0],
    "kw_avg_avg": predicted_shares.avg_avg,
    "kw_min_avg": predicted_shares.min_avg,
    "kw_max_avg": predicted_shares.max_avg,
    "kw_avg_avg_no_clip": predicted_shares.avg_avg_no_clip,
    "kw_min_avg_no_clip": predicted_shares.min_avg_no_clip,
    "kw_max_avg_no_clip": predicted_shares.max_avg_no_clip,
}], columns=kw_columns)

df_kw

Unnamed: 0,Id,kw_avg_avg,kw_min_avg,kw_max_avg,kw_avg_avg_no_clip,kw_min_avg_no_clip,kw_max_avg_no_clip
0,10,2307,2185,2495,3083,2495,3410


### 8. All content Features

In [42]:
cached_stopwords=stopwords.words('english')
# Set copy of the stop-word list: prep() tests membership once per token, and a
# set lookup avoids scanning the whole list each time
_stopword_lookup = frozenset(cached_stopwords)

punc=string.punctuation
# Translation table that deletes every punctuation character
table=str.maketrans('','',string.punctuation)

def prep(text):
    """Tokenize `text` and return the cleaned tokens as one comma-separated string.

    Each token is lower-cased and stripped of punctuation; tokens shorter than
    three characters and English stop words are then dropped, in that order.
    """
    # word tokenization, lower-casing and punctuation removal
    cleaned_tokens = (t.lower().translate(table) for t in nltk.word_tokenize(text))

    # keep tokens of at least 3 chars that are not stop words
    kept_tokens = [t for t in cleaned_tokens if len(t) > 2 and t not in _stopword_lookup]

    return ",".join(kept_tokens)

In [43]:
# Vector sizes used for the content embedding, and the name of the cleaned-text column
size, pca_count = 100, 10
column = 'refined_content_1'

# Cleaned, comma-separated tokens of the refined content
df_ner[column] = df_ner['refined_content'].apply(prep)

In [44]:
#df_ner.head()

In [45]:
# Embed the cleaned content with the persisted doc2vec model: the inferred vector
# (`size` columns) and its PCA projection (`pca_count` columns).
# Forward slashes (as in the joblib.load cells) work on Windows and POSIX; the previous
# backslash path only resolved on Windows. The `with` block closes the file handle.
# NOTE(review): pickle.load can execute code - only load trusted model files.
with open("../data/output/models/d2v_content.model", "rb") as model_file:
    d2v_model = pickle.load(model_file)

all_columns_list = [ "content_" + str(i) for i in range(size)]
pca_columns_list = [ "content_pca_" + str(i) for i in range(pca_count)]

# Explicit copy so the feature columns are not written into a slice of df_ner
df_content = df_ner[["Id"]].copy()
for pca_column in pca_columns_list:
    df_content[ pca_column] = 0.0

for all_column in all_columns_list:
    df_content[ all_column] = 0.0

for index, row in df_content.iterrows():
    document_text = df_ner.at[ index, column]

    pca_values = d2v_model.infer_vector_pca( document_text)
    for i, pca_column in enumerate(pca_columns_list):
        df_content.at[ index, pca_column] = pca_values[i]

    all_values = d2v_model.infer_vector( document_text)
    for i, all_column in enumerate(all_columns_list):
        df_content.at[ index, all_column] = all_values[i]

    # Progress marker every 100 documents
    if index % 100 == 0:
        print( str(index) + ", ", end='')

df_content.head()

0, 

Unnamed: 0,Id,content_pca_0,content_pca_1,content_pca_2,content_pca_3,content_pca_4,content_pca_5,content_pca_6,content_pca_7,content_pca_8,content_pca_9,content_0,content_1,content_2,content_3,content_4,content_5,content_6,content_7,content_8,content_9,content_10,content_11,content_12,content_13,content_14,content_15,content_16,content_17,content_18,content_19,content_20,content_21,content_22,content_23,content_24,content_25,content_26,content_27,content_28,content_29,content_30,content_31,content_32,content_33,content_34,content_35,content_36,content_37,content_38,content_39,content_40,content_41,content_42,content_43,content_44,content_45,content_46,content_47,content_48,content_49,content_50,content_51,content_52,content_53,content_54,content_55,content_56,content_57,content_58,content_59,content_60,content_61,content_62,content_63,content_64,content_65,content_66,content_67,content_68,content_69,content_70,content_71,content_72,content_73,content_74,content_75,content_76,content_77,content_78,content_79,content_80,content_81,content_82,content_83,content_84,content_85,content_86,content_87,content_88,content_89,content_90,content_91,content_92,content_93,content_94,content_95,content_96,content_97,content_98,content_99
0,10,15.235851,6.005808,-7.438265,-16.322971,17.376694,-1.587944,-2.763834,-2.713079,-5.892924,-0.120658,0.126504,-0.262181,-0.497229,-0.103619,-0.584295,-0.153096,0.06271,0.142093,0.188903,-0.369799,-0.248739,-0.410971,0.436424,0.50902,-0.781772,-0.356774,-0.042309,0.390412,-0.015537,0.126832,0.331055,-0.265436,0.353812,-0.08864,0.106417,-0.514666,-0.139841,0.256202,-0.123017,-0.066617,-0.420626,-0.275621,-0.484492,-0.401166,0.306389,0.044395,0.469876,-0.201489,0.005371,0.332954,0.220656,0.277647,-0.603126,0.438686,-0.151344,-0.678495,-0.140634,-0.38528,-0.258056,0.600185,0.618451,-0.103624,-0.145315,-0.611945,-0.771133,-0.493118,0.13549,0.095751,-0.778572,-0.493542,-0.005836,0.001721,-0.17549,0.530303,0.479038,0.783231,0.602023,-0.251755,0.976732,0.032368,0.026122,-0.175549,-0.003999,-0.656204,0.601399,-0.070719,0.656877,0.476816,0.500586,0.218917,-0.389606,0.844601,-0.16952,0.58863,0.062518,-0.067242,-0.231214,0.173306,-1.279615,0.509098,-0.238956,-0.817717,0.078247,-0.06767,0.011189,-0.721657,-0.011463,0.021397,-0.556588,-0.267797


# 4. Combine all the features for the test url

In [46]:
# Merge all the features to calculate the average shares predicted.
# Every feature frame is joined onto the running result on the shared 'Id'
# key, in the same left-to-right order as the columns should appear.
df_final = df_basic
for df_features in (
    df_days,
    df_pol,
    df_lda_nmf_lsi,
    df_ner_keyword_list_common_pcas,
    df_ner_num,
    df_content,
    df_clus,
    df_kw,
):
    df_final = pd.merge(df_final, df_features, on='Id')
df_final.head()

Unnamed: 0,Id,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,is__Monday,is__Tuesday,is__Wednesday,is__Thursday,is__Friday,is__Saturday,is__Sunday,is_Weekend,polarity_title,subjectivity_title,polarity_content,subjectivity_content,LDA_T0,LDA_T1,LDA_T2,LDA_T3,LDA_T4,NMF_T0,NMF_T1,NMF_T2,NMF_T3,NMF_T4,LSI_T0,LSI_T1,LSI_T2,LSI_T3,LSI_T4,LDA_C0,LDA_C1,LDA_C2,LDA_C3,LDA_C4,NMF_C0,NMF_C1,NMF_C2,NMF_C3,NMF_C4,LSI_C0,LSI_C1,LSI_C2,LSI_C3,LSI_C4,LDA_K0,LDA_K1,LDA_K2,LDA_K3,LDA_K4,NMF_K0,NMF_K1,NMF_K2,NMF_K3,NMF_K4,LSI_K0,LSI_K1,LSI_K2,LSI_K3,LSI_K4,keywords_pca_0,keywords_pca_1,keywords_pca_2,keywords_pca_3,keywords_pca_4,keywords_pca_5,keywords_pca_6,keywords_pca_7,keywords_pca_8,keywords_pca_9,NER_list_pca_0,NER_list_pca_1,NER_list_pca_2,NER_list_pca_3,NER_list_pca_4,NER_list_pca_5,NER_list_pca_6,NER_list_pca_7,NER_list_pca_8,NER_list_pca_9,NER_most_common_pca_0,NER_most_common_pca_1,NER_most_common_pca_2,NER_most_common_pca_3,NER_most_common_pca_4,NER_most_common_pca_5,NER_most_common_pca_6,NER_most_common_pca_7,NER_most_common_pca_8,NER_most_common_pca_9,NER_GPE,NER_DATE,NER_CARDINAL,NER_NORP,NER_PERSON,NER_TIME,NER_ORG,NER_WORK_OF_ART,NER_QUANTITY,NER_EVENT,NER_ORDINAL,NER_MONEY,NER_FAC,NER_PRODUCT,NER_LAW,NER_PERCENT,NER_LOC,NER_LANGUAGE,content_pca_0,content_pca_1,content_pca_2,content_pca_3,content_pca_4,content_pca_5,content_pca_6,content_pca_7,content_pca_8,content_pca_9,content_0,content_1,content_2,content_3,content_4,content_5,content_6,content_7,content_8,content_9,content_10,content_11,content_12,content_13,content_14,content_15,content_16,content_17,content_18,content_19,content_20,content_21,content_22,content_23,content_24,content_25,content_26,content_27,content_28,content_29,content_30,content_31,c
ontent_32,content_33,content_34,content_35,content_36,content_37,content_38,content_39,content_40,content_41,content_42,content_43,content_44,content_45,content_46,content_47,content_48,content_49,content_50,content_51,content_52,content_53,content_54,content_55,content_56,content_57,content_58,content_59,content_60,content_61,content_62,content_63,content_64,content_65,content_66,content_67,content_68,content_69,content_70,content_71,content_72,content_73,content_74,content_75,content_76,content_77,content_78,content_79,content_80,content_81,content_82,content_83,content_84,content_85,content_86,content_87,content_88,content_89,content_90,content_91,content_92,content_93,content_94,content_95,content_96,content_97,content_98,content_99,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,kw_avg_avg,kw_min_avg,kw_max_avg,kw_avg_avg_no_clip,kw_min_avg_no_clip,kw_max_avg_no_clip
0,10,11.0,915.0,0.455738,0.519126,0.741007,3.0,2.0,1.0,0.0,4.271038,6.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0,0,0,0,0,0,-0.155556,0.288889,0.055053,0.435264,0.041426,0.838571,0.040001,0.040001,0.040001,0.000214,0.002627,0.000171,0.005098,0.001038,0.005416,0.012549,0.003715,0.020846,-0.001443,0.00052,0.667497,0.271204,0.039268,0.021512,0.120748,0.055163,0.440346,0.005522,0.0115,9.671483,-2.640793,-2.7058,-4.723799,-0.407795,0.040067,0.43966,0.440043,0.04,0.04023,0.079872,0.006842,0.003658,0.00055,0.023077,0.641962,-0.069053,0.019139,-0.02275,0.158595,-4.390699,-0.481457,1.103631,0.448338,1.034572,0.034248,-0.737114,0.102756,0.517266,0.117024,38.270031,1.228388,1.873711,0.016183,-0.119524,-0.905342,0.717487,0.60638,0.890784,0.527589,2.550587,4.912198,0.1637,-1.037501,-0.319436,0.570994,-0.461211,0.548656,-0.618925,-1.358165,16.0,11.0,13.0,13.0,5.0,0.0,4.0,2.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,15.235851,6.005808,-7.438265,-16.322971,17.376694,-1.587944,-2.763834,-2.713079,-5.892924,-0.120658,0.126504,-0.262181,-0.497229,-0.103619,-0.584295,-0.153096,0.06271,0.142093,0.188903,-0.369799,-0.248739,-0.410971,0.436424,0.50902,-0.781772,-0.356774,-0.042309,0.390412,-0.015537,0.126832,0.331055,-0.265436,0.353812,-0.08864,0.106417,-0.514666,-0.139841,0.256202,-0.123017,-0.066617,-0.420626,-0.275621,-0.484492,-0.401166,0.306389,0.044395,0.469876,-0.201489,0.005371,0.332954,0.220656,0.277647,-0.603126,0.438686,-0.151344,-0.678495,-0.140634,-0.38528,-0.258056,0.600185,0.618451,-0.103624,-0.145315,-0.611945,-0.771133,-0.493118,0.13549,0.095751,-0.778572,-0.493542,-0.005836,0.001721,-0.17549,0.530303,0.479038,0.783231,0.602023,-0.251755,0.976732,0.032368,0.026122,-0.175549,-0.003999,-0.656204,0.601399,-0.070719,0.656877,0.476816,0.500586,0.218917,-0.389606,0.844601,-0.16952,0.58863,0.062518,-0.067242,-0.231214,0.173306,-1.279615,0.509098,-0.238956,-0.817717,0.078247,-0.06767,0.011189,-0.721657,-0.011463,0.021397,-0.556588,-0.267797,1,0,0,0,0,0,0,0,0,0,2307,2185,2495,3083,2495,3410


In [47]:
# Show the (rows, columns) dimensions of the merged feature frame.
df_final.shape

(1, 249)

In [48]:
# Selected features through ensemble feature significance scoring.
# The ranking file holds one column name per line. It is read inside a
# context manager so the handle is closed, and each line is stripped so a
# trailing newline or CRLF line endings cannot produce '' / '...\r' entries
# that would later fail as column lookups.
with open('../data/output/Feature_ranking_Selected.txt') as f_feature_ranking:
    ranked_columns = [line.strip() for line in f_feature_ranking]

# 'Id' and 'shares' are listed in the file but are not model inputs; drop them
# without raising when either one is absent.
non_feature_columns = {'Id', 'shares'}
selected_features = [
    column_name
    for column_name in ranked_columns
    if column_name and column_name not in non_feature_columns
]
print(len(selected_features))
selected_features

106


['n_tokens_title',
 'n_tokens_content',
 'n_unique_tokens',
 'n_non_stop_words',
 'n_non_stop_unique_tokens',
 'num_hrefs',
 'num_self_hrefs',
 'num_imgs',
 'num_videos',
 'average_token_length',
 'num_keywords',
 'data_channel_is_lifestyle',
 'data_channel_is_bus',
 'data_channel_is_socmed',
 'data_channel_is_world',
 'is__Monday',
 'is__Tuesday',
 'is__Wednesday',
 'is__Thursday',
 'is__Friday',
 'is__Sunday',
 'is_Weekend',
 'kw_avg_avg',
 'kw_min_avg',
 'kw_max_avg',
 'cluster_1',
 'cluster_2',
 'cluster_3',
 'cluster_4',
 'cluster_5',
 'cluster_7',
 'cluster_8',
 'subjectivity_title',
 'subjectivity_content',
 'keywords_pca_0',
 'keywords_pca_1',
 'keywords_pca_2',
 'keywords_pca_3',
 'keywords_pca_4',
 'keywords_pca_5',
 'keywords_pca_6',
 'keywords_pca_7',
 'keywords_pca_8',
 'keywords_pca_9',
 'content_pca_0',
 'content_pca_1',
 'content_pca_2',
 'content_pca_3',
 'content_pca_4',
 'content_pca_5',
 'content_pca_6',
 'content_pca_7',
 'content_pca_8',
 'content_pca_9',
 'NER_li

In [49]:
# Keep only the selected model features. An explicit copy is taken because
# df_selected is written to later in the notebook (the scaled columns are
# assigned back into it); writing into a bare selection of df_final raises
# SettingWithCopyWarning, which the global warnings filter would hide.
df_selected = df_final[selected_features].copy()

In [50]:
# Display the selected-feature frame (values here are still unscaled).
df_selected

Unnamed: 0,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_bus,data_channel_is_socmed,data_channel_is_world,is__Monday,is__Tuesday,is__Wednesday,is__Thursday,is__Friday,is__Sunday,is_Weekend,kw_avg_avg,kw_min_avg,kw_max_avg,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_7,cluster_8,subjectivity_title,subjectivity_content,keywords_pca_0,keywords_pca_1,keywords_pca_2,keywords_pca_3,keywords_pca_4,keywords_pca_5,keywords_pca_6,keywords_pca_7,keywords_pca_8,keywords_pca_9,content_pca_0,content_pca_1,content_pca_2,content_pca_3,content_pca_4,content_pca_5,content_pca_6,content_pca_7,content_pca_8,content_pca_9,NER_list_pca_0,NER_list_pca_1,NER_list_pca_2,NER_list_pca_3,NER_list_pca_4,NER_list_pca_5,NER_list_pca_6,NER_list_pca_7,NER_list_pca_8,NER_list_pca_9,NER_GPE,NER_DATE,NER_CARDINAL,NER_NORP,NER_PERSON,NER_TIME,NER_ORG,NER_WORK_OF_ART,NER_QUANTITY,NER_EVENT,NER_ORDINAL,NER_MONEY,NER_FAC,NER_PRODUCT,NER_LAW,NER_PERCENT,NER_LOC,NER_LANGUAGE,LDA_T4,NMF_T2,LSI_T0,LSI_T1,LSI_T4,LDA_C0,LDA_C1,LDA_C2,NMF_C3,LSI_C0,LSI_C1,LSI_C2,LSI_C3,LSI_C4,LDA_K0,LDA_K2,LDA_K4,NMF_K0,NMF_K1,NMF_K2,NMF_K3,NMF_K4,LSI_K0,LSI_K1
0,11.0,915.0,0.455738,0.519126,0.741007,3.0,2.0,1.0,0.0,4.271038,6.0,0.0,0.0,0.0,1.0,1,0,0,0,0,0,0,2307,2185,2495,0,0,0,0,0,0,0,0.288889,0.435264,-4.390699,-0.481457,1.103631,0.448338,1.034572,0.034248,-0.737114,0.102756,0.517266,0.117024,15.235851,6.005808,-7.438265,-16.322971,17.376694,-1.587944,-2.763834,-2.713079,-5.892924,-0.120658,38.270031,1.228388,1.873711,0.016183,-0.119524,-0.905342,0.717487,0.60638,0.890784,0.527589,16.0,11.0,13.0,13.0,5.0,0.0,4.0,2.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,0.040001,0.000171,0.005416,0.012549,-0.001443,0.00052,0.667497,0.271204,0.005522,9.671483,-2.640793,-2.7058,-4.723799,-0.407795,0.040067,0.440043,0.04023,0.079872,0.006842,0.003658,0.00055,0.023077,0.641962,-0.069053


# 5. Scaling the features

In [51]:
# Load the list of columns to scale and the fitted scaler.
# joblib.load is given the path directly so joblib opens and closes the file
# itself; the previous open(...) handles were never closed.
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# artifacts produced by this project's own training run.
scale_cols = joblib.load('../data/output/models/scale_cols.txt')
feature_scaler = joblib.load('../data/output/models/feature_scaler.pkl')

In [52]:
# Display the column names loaded from scale_cols.txt; these are the columns
# passed through feature_scaler.transform in the next step.
scale_cols

['n_tokens_title',
 'n_tokens_content',
 'num_hrefs',
 'num_self_hrefs',
 'num_imgs',
 'num_videos',
 'num_keywords',
 'kw_avg_avg',
 'kw_min_avg',
 'kw_max_avg',
 'NER_GPE',
 'NER_DATE',
 'NER_CARDINAL',
 'NER_NORP',
 'NER_PERSON',
 'NER_TIME',
 'NER_ORG',
 'NER_WORK_OF_ART',
 'NER_QUANTITY',
 'NER_EVENT',
 'NER_ORDINAL',
 'NER_MONEY',
 'NER_FAC',
 'NER_PRODUCT',
 'NER_LAW',
 'NER_PERCENT',
 'NER_LOC',
 'NER_LANGUAGE']

In [53]:
# Rebuild df_selected from the unscaled df_final before scaling so this cell is
# idempotent: re-running it no longer applies the scaler to already-scaled
# values. The explicit copy also keeps the column assignment from writing into
# a selection of df_final (SettingWithCopyWarning).
df_selected = df_final[selected_features].copy()
df_selected[scale_cols] = feature_scaler.transform(df_selected[scale_cols])

In [54]:
# Display df_selected again, now with the scale_cols columns replaced by the
# output of feature_scaler.transform.
df_selected

Unnamed: 0,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_bus,data_channel_is_socmed,data_channel_is_world,is__Monday,is__Tuesday,is__Wednesday,is__Thursday,is__Friday,is__Sunday,is_Weekend,kw_avg_avg,kw_min_avg,kw_max_avg,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_7,cluster_8,subjectivity_title,subjectivity_content,keywords_pca_0,keywords_pca_1,keywords_pca_2,keywords_pca_3,keywords_pca_4,keywords_pca_5,keywords_pca_6,keywords_pca_7,keywords_pca_8,keywords_pca_9,content_pca_0,content_pca_1,content_pca_2,content_pca_3,content_pca_4,content_pca_5,content_pca_6,content_pca_7,content_pca_8,content_pca_9,NER_list_pca_0,NER_list_pca_1,NER_list_pca_2,NER_list_pca_3,NER_list_pca_4,NER_list_pca_5,NER_list_pca_6,NER_list_pca_7,NER_list_pca_8,NER_list_pca_9,NER_GPE,NER_DATE,NER_CARDINAL,NER_NORP,NER_PERSON,NER_TIME,NER_ORG,NER_WORK_OF_ART,NER_QUANTITY,NER_EVENT,NER_ORDINAL,NER_MONEY,NER_FAC,NER_PRODUCT,NER_LAW,NER_PERCENT,NER_LOC,NER_LANGUAGE,LDA_T4,NMF_T2,LSI_T0,LSI_T1,LSI_T4,LDA_C0,LDA_C1,LDA_C2,NMF_C3,LSI_C0,LSI_C1,LSI_C2,LSI_C3,LSI_C4,LDA_K0,LDA_K2,LDA_K4,NMF_K0,NMF_K1,NMF_K2,NMF_K3,NMF_K4,LSI_K0,LSI_K1
0,-0.010755,0.493599,0.455738,0.519126,0.741007,-0.565694,-0.364473,0.089655,-0.437768,4.271038,-0.515274,0.0,0.0,0.0,1.0,1,0,0,0,0,0,0,0.188787,1.109849,-0.368028,0,0,0,0,0,0,0,0.288889,0.435264,-4.390699,-0.481457,1.103631,0.448338,1.034572,0.034248,-0.737114,0.102756,0.517266,0.117024,15.235851,6.005808,-7.438265,-16.322971,17.376694,-1.587944,-2.763834,-2.713079,-5.892924,-0.120658,38.270031,1.228388,1.873711,0.016183,-0.119524,-0.905342,0.717487,0.60638,0.890784,0.527589,1.034733,0.37603,1.370871,2.878193,-0.418088,-0.565614,-0.583772,0.449966,1.180395,-0.29375,-0.576602,1.208661,-0.405032,-0.454842,-0.194825,1.264534,-0.388596,-0.117931,0.040001,0.000171,0.005416,0.012549,-0.001443,0.00052,0.667497,0.271204,0.005522,9.671483,-2.640793,-2.7058,-4.723799,-0.407795,0.040067,0.440043,0.04023,0.079872,0.006842,0.003658,0.00055,0.023077,0.641962,-0.069053


# 6. Predict the number of shares

In [55]:
# Load the persisted model names, then each model as a (name, estimator) pair.
# Paths are handed straight to joblib.load so no file handle is left open
# (the previous open(...) objects were never closed).
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# artifacts produced by this project's own training run.
models_dir = '../data/output/models/'
models_name_list = joblib.load(models_dir + 'models_name_list.txt')
models = []
for name in models_name_list:
    models.append((name, joblib.load(models_dir + name + '.pkl')))



In [56]:
# Score the single prepared row with every loaded model.
prediction_list = []
for name, model in models:
    # predict() returns an array; extract its only element explicitly.
    # int(<1-D array>) relies on implicit array-to-scalar conversion, which is
    # deprecated in recent NumPy. .item() raises ValueError if the result does
    # not hold exactly one value (i.e. df_selected has more than one row).
    predicted = np.asarray(model.predict(df_selected.values))
    prediction = int(predicted.item())
    prediction_list.append(prediction)
    print("prediction by {} is {}: ".format(name, prediction))

prediction by EN is 2599: 
prediction by GBM is 2223: 
prediction by XGB is 2273: 


In [57]:
# Ensemble of models: the final estimate is the mean of the individual model
# predictions, truncated to an integer.
ensemble_prediction = int(np.mean(prediction_list))
print("Ensemble of models prediction ", ensemble_prediction)

Ensemble of models prediction  2365
