#### Recreate Basic Features

The following features are already given in OnlineNewsPopularity data file. 
This notebook tries to recreate them as much as possible

* n_tokens_title
* n_tokens_content
* n_unique_tokens
* n_non_stop_words
* n_non_stop_unique_tokens
* num_hrefs
* num_self_hrefs
* num_imgs
* num_videos
* average_token_length
* num_keywords
* data_channel_is_lifestyle
* data_channel_is_entertainment
* data_channel_is_bus
* data_channel_is_socmed
* data_channel_is_tech
* data_channel_is_world

In [1]:
import pandas as pd
import numpy as np
import re
import requests

In [2]:
# For NLP actions
from bs4 import BeautifulSoup
from bs4 import element

import spacy
nlp = spacy.load('en_core_web_sm')

from collections import Counter

In [3]:
df_in = pd.read_excel('../data/output/SS_Extracted_content.xlsx')[["Id"]]
df_in = df_in[ df_in.Id < 20]
df_in.shape

(19, 1)

In [4]:
doc_id = df_in.Id[13]

with open( "../data/output/html/" + str(doc_id) + ".html", "r", encoding="utf-8", errors='ignore') as html_file:
    html_doc = html_file.read()

print( len(html_doc))
print( html_doc[:100])
print( ".....")
print( html_doc[-100:])


80949
<!DOCTYPE html>
<html data-env='production' lang='en' xml:lang='en'>
<head>
<script>
  window.__o = 
.....
 document.getElementsByTagName('body')[0]).appendChild(s);
    })();
  }
</script>

</body>
</html>



In [5]:
df_out = pd.DataFrame(columns=["Id", "n_tokens_title", "n_tokens_content", "n_unique_tokens", \
                               "n_non_stop_words", "n_non_stop_unique_tokens", "num_hrefs", \
                               "num_self_hrefs", "num_imgs", "num_videos", "average_token_length", \
                               "num_keywords", "data_channel_is_lifestyle", \
                               "data_channel_is_entertainment", "data_channel_is_bus", \
                               "data_channel_is_socmed", "data_channel_is_tech", "data_channel_is_world"])
df_out = df_out.astype( {"Id":int, "n_tokens_title":int, "n_tokens_content":int, "n_unique_tokens":int, \
                         "n_non_stop_words":int, "n_non_stop_unique_tokens":int, "num_hrefs":int, \
                         "num_self_hrefs":int, "num_imgs":int, "num_videos":int, "average_token_length":int, \
                         "num_keywords":int, "data_channel_is_lifestyle":int, "data_channel_is_entertainment":int, \
                         "data_channel_is_bus":int, "data_channel_is_socmed":int, "data_channel_is_tech":int, \
                         "data_channel_is_world":int})

df_out.dtypes

Id                               int32
n_tokens_title                   int32
n_tokens_content                 int32
n_unique_tokens                  int32
n_non_stop_words                 int32
n_non_stop_unique_tokens         int32
num_hrefs                        int32
num_self_hrefs                   int32
num_imgs                         int32
num_videos                       int32
average_token_length             int32
num_keywords                     int32
data_channel_is_lifestyle        int32
data_channel_is_entertainment    int32
data_channel_is_bus              int32
data_channel_is_socmed           int32
data_channel_is_tech             int32
data_channel_is_world            int32
dtype: object

In [6]:
soup = BeautifulSoup(html_doc, 'html.parser')

In [7]:
# Calculate title features
title = str( soup.title.string)
nlp_title = nlp(title)

n_tokens_title = 0
for token in nlp_title:
    n_tokens_title += 1
    
print( title)
print( "n_tokens_title : " , n_tokens_title)

Google Glass is Getting a Second Look from Businesses
n_tokens_title :  9


### Extract content and other features

In [8]:
# the contents containing repeating / unnecessary info - these classes are excluded
exclude_class_list = [ "top-stories-promo-story__summary"]

exclude_starts_with = ["Additional reporting by"]
regex_keyword = """<meta content="(?P<keyword1>[^><\/\"]*)"\s[a-zA-Z="\-]*\sname="keywords".?.?>"""

content = "" # soup.title.string + "\n"

num_hrefs = 0
num_self_hrefs = 0
num_imgs = 0
num_videos = 0

for p in soup.select( "p"):
    for child in p.children:
        if isinstance( child, element.Tag):
            type_of_tag = child.name
            
            if not child.name is None:
                if child.name == "a":
                    num_hrefs += 1
                    
                    href_url = child.get('href')
                    if (not href_url is None) and ("mashable.com" in href_url.lower()):
                        num_self_hrefs += 1
                        
                if child.name == "img" or child.name == "picture":
                    num_imgs += 1
                    
                if child.name == "iframe":
                    if "youtube.com" in str( child.get("src")):
                        num_videos += 1
                    
    text = p.get_text()

    if len( text.split()) > 1:
        if text not in content:
            is_in_exclude_list = False

            for exclude_class in exclude_class_list:
                if p.has_attr("class") and \
                    exclude_class in p.get_attribute_list( "class"):

                    is_in_exclude_list = True
                    break

            for starts_string in exclude_starts_with: 
                if text.startswith(starts_string):
                    is_in_exclude_list = True
                    break

            if not is_in_exclude_list:
                content = content + text + "\n"

print( "num_hrefs : ", num_hrefs)
print( "num_self_hrefs : ", num_self_hrefs)
print( "num_imgs : ", num_imgs)
print( "num_videos : ", num_videos)

num_hrefs :  12
num_self_hrefs :  5
num_imgs :  1
num_videos :  1


In [9]:
nlp_content = nlp(content)

tokens = [token.text for token in nlp_content]
                #if not token.is_stop and not token.is_punct]

n_tokens_content = len(tokens)

print( "n_tokens_content : ", n_tokens_content)

average_token_length = 0
for token in tokens:
    average_token_length += len(token)

average_token_length = average_token_length / n_tokens_content

print( "average_token_length : ", average_token_length)

n_tokens_content :  1899
average_token_length :  4.20115850447604


In [10]:
count_unique_tokens = len( Counter(tokens))
n_unique_tokens = len( Counter(tokens)) / n_tokens_content
print( "n_unique_tokens :", n_unique_tokens)

n_unique_tokens : 0.3644023170089521


In [11]:
tokens = [token.text for token in nlp_content if not token.is_stop and not token.is_punct]

n_non_stop_words = len(tokens) / n_tokens_content

print( "n_non_stop_words : ", n_non_stop_words)

n_non_stop_words :  0.4333859926276988


In [12]:
n_non_stop_unique_tokens = len( Counter(tokens)) / count_unique_tokens
print( "n_non_stop_unique_tokens :", n_non_stop_unique_tokens)

n_non_stop_unique_tokens : 0.7326589595375722


In [13]:
# Keywords
regex_keyword = """<meta content="(?P<keyword1>[^><\/\"]*)"\s[a-zA-Z="\-]*\sname="keywords".?.?>"""

keywords = ""

for meta in soup.find_all( 'meta'):
    meta_text = str(meta)
    match = re.search( regex_keyword, meta_text)
    if match:
        if not match.group("keyword1") == None:
            keywords = match.group("keyword1")
            break
            
# if keywords is not found, take non-stop words from title
if keywords == "":
    tokens = [token.text for token in nlp_title if not token.is_stop and not token.is_punct]
    keywords = ", ".join( tokens)

print( keywords)

num_keywords = len( keywords.split(", "))
print( "num_keywords : ", num_keywords)

google, funding, startups, enterprise, uncategorized, business, gadgets, google-glass
num_keywords :  8


In [15]:
# Data channel
data_channel = ""
for p in soup.select( "hgroup"):
    if not p.get("data-channel") == None:
        data_channel = p.get("data-channel")
        
print(data_channel)

data_channel = data_channel.lower()

data_channel_is_lifestyle = 1 if data_channel == "culture" or data_channel == "" else 0
data_channel_is_entertainment = 1 if data_channel == "entertainment" else 0
data_channel_is_bus = 1 if data_channel == "business" else 0
data_channel_is_socmed = 1 if data_channel == "social-good" else 0
data_channel_is_tech = 1 if data_channel == "tech" else 0
data_channel_is_world = 1 if data_channel == "world" or data_channel == "u.s." else 0

business


In [16]:
print( "n_tokens_title",  n_tokens_title)
print( "n_tokens_content", n_tokens_content)
print( "n_unique_tokens", n_unique_tokens)
print( "n_non_stop_words", n_non_stop_words)
print( "n_non_stop_unique_tokens", n_non_stop_unique_tokens)
print( "num_hrefs", num_hrefs)
print( "num_self_hrefs", num_self_hrefs)
print( "num_imgs", num_imgs)
print( "num_videos", num_videos)
print( "average_token_length", average_token_length)
print( "num_keywords", num_keywords)
print( "data_channel_is_lifestyle", data_channel_is_lifestyle)
print( "data_channel_is_entertainment", data_channel_is_entertainment)
print( "data_channel_is_bus", data_channel_is_bus)
print( "data_channel_is_socmed", data_channel_is_socmed)
print( "data_channel_is_tech", data_channel_is_tech)
print( "data_channel_is_world", data_channel_is_world)

n_tokens_title 9
n_tokens_content 1899
n_unique_tokens 0.3644023170089521
n_non_stop_words 0.4333859926276988
n_non_stop_unique_tokens 0.7326589595375722
num_hrefs 12
num_self_hrefs 5
num_imgs 1
num_videos 1
average_token_length 4.20115850447604
num_keywords 8
data_channel_is_lifestyle 0
data_channel_is_entertainment 0
data_channel_is_bus 1
data_channel_is_socmed 0
data_channel_is_tech 0
data_channel_is_world 0


In [None]:
# Get the data_channel for all the files

df_in = pd.read_excel('../data/output/SS_Extracted_content.xlsx')[["Id"]]

df_out = pd.DataFrame(columns=["Id", "data_channel"])

for index, row in df_in.iterrows():
    doc_id = row.Id

    with open( "../data/output/html/" + str(doc_id) + ".html", "r", encoding="utf-8", errors='ignore') as html_file:
        html_doc = html_file.read()
    
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    data_channel = ""
    for p in soup.select( "hgroup"):
        if not p.get("data-channel") == None:
            data_channel = p.get("data-channel")
    
    df_out.loc[index] = [doc_id, data_channel]
    
    if index % 100 == 0:
        print( index, ", ", end="")
    
df_out.to_excel('../data/output/0_data_channels.xlsx', index=False)
print()

### Validate the basic features creator class

In [1]:
from BasicFeatures import BasicFeaturesCreator as bfc

In [3]:
html_content = ""
with open( "../data/output/html/" + str(5935) + ".html", "r", encoding="utf-8", errors='ignore') as html_file:
    html_content = html_file.read()

len(html_content)

100033

In [4]:
result = bfc.get_basic_features( html_content)
print( "n_tokens_title",  result.n_tokens_title)
print( "n_tokens_content", result.n_tokens_content)
print( "n_unique_tokens", result.n_unique_tokens)
print( "n_non_stop_words", result.n_non_stop_words)
print( "n_non_stop_unique_tokens", result.n_non_stop_unique_tokens)
print( "num_hrefs", result.num_hrefs)
print( "num_self_hrefs", result.num_self_hrefs)
print( "num_imgs", result.num_imgs)
print( "num_videos", result.num_videos)
print( "average_token_length", result.average_token_length)
print( "num_keywords", result.num_keywords)
print( "data_channel_is_lifestyle", result.data_channel_is_lifestyle)
print( "data_channel_is_entertainment", result.data_channel_is_entertainment)
print( "data_channel_is_bus", result.data_channel_is_bus)
print( "data_channel_is_socmed", result.data_channel_is_socmed)
print( "data_channel_is_tech", result.data_channel_is_tech)
print( "data_channel_is_world", result.data_channel_is_world)

n_tokens_title 9
n_tokens_content 1820
n_unique_tokens 0.4098901098901099
n_non_stop_words 0.4604395604395604
n_non_stop_unique_tokens 0.7734584450402144
num_hrefs 23
num_self_hrefs 2
num_imgs 5
num_videos 0
average_token_length 4.246703296703297
num_keywords 6
data_channel_is_lifestyle 1
data_channel_is_entertainment 0
data_channel_is_bus 0
data_channel_is_socmed 0
data_channel_is_tech 0
data_channel_is_world 0


#### Recreate Basic features of all the entries

In [3]:
import pandas as pd
from os import path

from BasicFeatures import BasicFeaturesCreator as bfc

In [5]:
df_in = pd.read_excel('../data/input/OnlineNewsPopularity.xlsx')[["Id", " shares"]]

df_out = None
out_filename = '../data/output/0_recreated_basic_features.xlsx'

if path.exists( out_filename):
    df_out = pd.read_excel( out_filename)
else:
    df_out = pd.DataFrame(columns=["Id", "n_tokens_title", "n_tokens_content", "n_unique_tokens", \
                                   "n_non_stop_words", "n_non_stop_unique_tokens", "num_hrefs", \
                                   "num_self_hrefs", "num_imgs", "num_videos", "average_token_length", \
                                   "num_keywords", "data_channel_is_lifestyle", \
                                   "data_channel_is_entertainment", "data_channel_is_bus", \
                                   "data_channel_is_socmed", "data_channel_is_tech", "data_channel_is_world", \
                                   "shares"])

df_out = df_out.astype( {"Id":int, "n_tokens_title":int, "n_tokens_content":int, "n_unique_tokens":float, \
                         "n_non_stop_words":float, "n_non_stop_unique_tokens":float, "num_hrefs":int, \
                         "num_self_hrefs":int, "num_imgs":int, "num_videos":int, "average_token_length":float, \
                         "num_keywords":int, "data_channel_is_lifestyle":int, "data_channel_is_entertainment":int, \
                         "data_channel_is_bus":int, "data_channel_is_socmed":int, "data_channel_is_tech":int, \
                         "data_channel_is_world":int, "shares":int})

for index, row in df_in.iterrows():
    doc_id = row.Id
    shares = row[" shares"]
    
    if index % 100 == 0:
        print( index, ", ", end="")
        
    if doc_id in df_out.Id:
        #already processed
        continue

    html_content = ""
    with open( "../data/output/html/" + str(doc_id) + ".html", "r", encoding="utf-8", errors='ignore') as html_file:
        html_content = html_file.read()
    
    result = bfc.get_basic_features( html_content)
    
    df_out.loc[index] = [doc_id, result.n_tokens_title, result.n_tokens_content, result.n_unique_tokens, \
                            result.n_non_stop_words, result.n_non_stop_unique_tokens, result.num_hrefs, \
                            result.num_self_hrefs, result.num_imgs, result.num_videos, result.average_token_length, \
                            result.num_keywords, result.data_channel_is_lifestyle, \
                            result.data_channel_is_entertainment, result.data_channel_is_bus, \
                            result.data_channel_is_socmed, result.data_channel_is_tech, result.data_channel_is_world, \
                            shares
                        ]
    
    if index % 100 == 0:
        df_out.to_excel('../data/output/0_recreated_basic_features.xlsx', index=False)
    
df_out.to_excel('../data/output/0_recreated_basic_features.xlsx', index=False)
print()

0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 2200 , 2300 , 2400 , 2500 , 2600 , 2700 , 2800 , 2900 , 3000 , 3100 , 3200 , 3300 , 3400 , 3500 , 3600 , 3700 , 3800 , 3900 , 4000 , 4100 , 4200 , 4300 , 4400 , 4500 , 4600 , 4700 , 4800 , 4900 , 5000 , 5100 , 5200 , 5300 , 5400 , 5500 , 5600 , 5700 , 5800 , 5900 , 6000 , 6100 , 6200 , 6300 , 6400 , 6500 , 6600 , 6700 , 6800 , 6900 , 7000 , 7100 , 7200 , 7300 , 7400 , 7500 , 7600 , 7700 , 
