In [1]:
import sys

import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
import logging
import empath

import re
import common
import util
# Re-execute the local helper modules so that edits made to them after the
# kernel started are picked up by the `from common/util import ...` lines below.
importlib.reload(common)
importlib.reload(util)

from common import create_engine
from common import display_all
from common import figsize
from common import save_df, load_df
from common import save_session, load_session

from util import show_importances
from util import split_X_y_all, split_X_y, split_data
from util import empty_features, column_feature, str_contains

from pbar import Pbar

from pandas.plotting import register_matplotlib_converters

# One-time notebook setup: register pandas' matplotlib converters and configure
# root logging at INFO with a "timestamp : level : message" line format.
register_matplotlib_converters() # converters e.g. for datetime in plots
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
from collections import Counter

import textstat

import spacy
# Re-execute the spaCy package in this kernel, then load the "en_core_web_sm"
# model. `nlp` is the shared pipeline used by the spaCy-based cells below.
importlib.reload(spacy)
nlp = spacy.load("en_core_web_sm")

In [3]:
def tokenize(text):
    """Turn raw text into a list of word strings.

    The text is run through the notebook-level spaCy pipeline ``nlp`` with the
    parser, tagger and NER components disabled; the resulting doc is then
    filtered by ``words_from_doc``.
    """
    skipped_components = ['parser', 'tagger', 'ner']
    return words_from_doc(nlp(text, disable=skipped_components))

def words_from_doc(doc):
    """Return the string form of every token that is neither a stop word nor punctuation.

    ``doc`` is any iterable of token-like objects exposing ``is_stop`` and
    ``is_punct``; surviving tokens are converted with ``str`` and returned in
    their original order.
    """
    return [
        str(token)
        for token in doc
        if not (token.is_stop or token.is_punct)
    ]

def doc_tokens_to_file(data, file):
    """Write one line of space-separated words per doc to ``file`` (UTF-8).

    Each item of ``data`` is filtered with ``words_from_doc``. Iteration is
    wrapped in ``Pbar`` (imported from the ``pbar`` module). The target file
    is opened in write mode, so existing content is replaced.
    """
    with open(file, 'w', encoding='utf-8') as out:
        for doc in Pbar(data):
            line = ' '.join(words_from_doc(doc))
            out.write(line + '\n')

In [4]:
# Load the prepared dataset through the project helper `load_df` (imported from
# `common`) and print its row count as a quick sanity check.
# NOTE(review): the file name suggests a pickle; only load files from a trusted source.
df = load_df('final_data.pickle')
print(len(df))

15458


---

In [5]:
# Run the spaCy pipeline once over every body and every title and keep the
# results in lists, so the feature builders below can reuse them.
spacy_body = list(nlp.pipe(df.body))
spacy_title = list(nlp.pipe(df.title))

In [6]:
# Wrap every body and title in a TextBlob; sentiment_features() reads these lists.
tblob_body = [TextBlob(i) for i in df.body]
tblob_title = [TextBlob(i) for i in df.title]

---

In [7]:
def sentiment_features():
    """Collect TextBlob sentiment scores for article bodies and titles.

    Reads the notebook-level ``df``, ``tblob_body`` and ``tblob_title``.

    Returns:
        DataFrame indexed like ``df`` with the columns
        ``sentiment_body_subjectivity``, ``sentiment_body_polarity``,
        ``sentiment_title_subjectivity`` and ``sentiment_title_polarity``,
        in that order.
    """
    features = pd.DataFrame(index=df.index)

    for part, blobs in (('body', tblob_body), ('title', tblob_title)):
        # Read `.sentiment` once per blob and reuse it for both scores.
        scores = [blob.sentiment for blob in blobs]
        features[f'sentiment_{part}_subjectivity'] = [s.subjectivity for s in scores]
        features[f'sentiment_{part}_polarity'] = [s.polarity for s in scores]

    return features

# Build the sentiment features and pass them to the project helper `save_df`
# (imported from `common`) under the name 'features_sentiment.pickle'.
save_df(sentiment_features(), 'features_sentiment.pickle')

In [15]:
def readability_features():
    """Compute textstat readability scores for article bodies and titles.

    Reads the notebook-level ``df`` (columns ``body`` and ``title``) and
    applies five ``textstat`` metrics to every row of each column.

    Returns:
        DataFrame indexed like ``df`` with columns named
        ``readability_<body|title>_<ari|fcgl|frei|gfi|cli>``: all body
        columns first, then all title columns.
    """
    # (column suffix, scoring function), in output order.
    metrics = (
        ('ari', textstat.automated_readability_index),
        ('fcgl', textstat.flesch_kincaid_grade),
        ('frei', textstat.flesch_reading_ease),
        ('gfi', textstat.gunning_fog),
        ('cli', textstat.coleman_liau_index),
    )

    features = pd.DataFrame(index=df.index)
    for field in ('body', 'title'):
        texts = df[field]
        for suffix, metric in metrics:
            features[f'readability_{field}_{suffix}'] = texts.apply(metric)

    return features

# Build the readability features and pass them to the project helper `save_df`
# (imported from `common`) under the name 'features_readability.pickle'.
save_df(readability_features(), 'features_readability.pickle')

In [9]:
def metadata_features():
    """Derive publication-day and source indicator features.

    Reads the notebook-level ``df`` (columns ``published_at`` and
    ``source_name``).

    Returns:
        DataFrame with ``metadata_published_at_day`` (pandas weekday plus one,
        so Monday is 1 and Sunday is 7) followed by one indicator column per
        distinct source, named ``source_<source_name>``.
    """
    day_frame = pd.DataFrame(index=df.index)
    day_frame['metadata_published_at_day'] = df.published_at.dt.weekday + 1

    # One indicator column per source, relabelled with a `source_` prefix.
    source_frame = pd.get_dummies(df.source_name)
    source_frame.columns = [f'source_{name}' for name in source_frame.columns]

    return pd.concat([day_frame, source_frame], axis=1, join='inner')

# Build the metadata features and pass them to the project helper `save_df`
# (imported from `common`) under the name 'features_metadata.pickle'.
save_df(metadata_features(), 'features_metadata.pickle')

2020-05-06 10:32:52,605 : INFO : Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-05-06 10:32:52,606 : INFO : NumExpr defaulting to 8 threads.


In [10]:
def empath_features():
    """Score every article body against the Empath lexicon.

    Each doc in the notebook-level ``spacy_body`` is reduced to a
    space-joined string of token lemmas and analysed with
    ``Empath.analyze(..., normalize=True)``. A falsy analysis result is
    treated as an empty dict, so that row ends up as all zeros.

    Returns:
        DataFrame indexed like ``df`` with one ``empath_<category>`` column
        per category returned by the analyzer; missing values are filled
        with 0.
    """
    lexicon = empath.Empath()

    rows = []
    for doc in spacy_body:
        lemma_text = ' '.join(token.lemma_ for token in doc)
        scores = lexicon.analyze(lemma_text, normalize=True) or {}
        rows.append({f'empath_{category}': score for category, score in scores.items()})

    return pd.DataFrame(rows, index=df.index).fillna(0)

# Build the Empath features and pass them to the project helper `save_df`
# (imported from `common`) under the name 'features_empath.pickle'.
save_df(empath_features(), 'features_empath.pickle')

In [11]:
def content_features():
    """Build surface-level content statistics for every article.

    Reads the notebook-level ``df``, ``spacy_body``, ``spacy_title`` and
    ``nlp``.

    Returns:
        DataFrame indexed like ``df`` containing, in order:

        * word counts for title and body (tokens that are neither punctuation
          nor stop words), average length of those body words, body sentence
          count, number of body words longer than 5 characters, and body
          stop-word count;
        * raw character lengths of body and title;
        * occurrences of ``?``, ``!`` and ``...`` in the body;
        * ``content_media_count``: ``image_count`` + ``video_count`` + number
          of ``body_urls`` entries matching the image-extension pattern;
        * one ``content_pos_<tag>`` column per label of the spaCy tagger,
          counting non-punctuation tokens with that tag (0 where absent).
    """
    def content_tokens(doc):
        # Tokens that are neither punctuation nor stop words.
        return [tok for tok in doc if tok.is_punct is False and tok.is_stop is False]

    features = pd.DataFrame(index=df.index)

    # --- token-based statistics ---------------------------------------
    features['content_title_word_count'] = [len(content_tokens(doc)) for doc in spacy_title]
    features['content_body_word_count'] = [len(content_tokens(doc)) for doc in spacy_body]
    # np.mean of an empty list yields NaN, so a body with no content words gets NaN here.
    features['content_avg_word_len'] = [
        np.mean([len(tok.text) for tok in content_tokens(doc)])
        for doc in spacy_body
    ]
    features['content_sentence_count'] = [sum(1 for _ in doc.sents) for doc in spacy_body]
    features['content_word_over_5ch'] = [
        sum(1 for tok in content_tokens(doc) if len(tok.text) > 5)
        for doc in spacy_body
    ]
    features['content_stop_words_count'] = [
        sum(1 for tok in doc if tok.is_stop)
        for doc in spacy_body
    ]

    # --- raw-text statistics ------------------------------------------
    features['content_body_len'] = df.body.apply(len)
    features['content_title_len'] = df.title.apply(len)
    for mark in ('?', '!', '...'):
        features[f'content_{mark}_count'] = df.body.apply(
            lambda text, mark=mark: text.count(mark)
        )

    # --- media: declared images/videos plus image-looking URLs in the body
    image_url = re.compile(r'\.jpg|\.jpeg|\.png|\.bmp')
    linked_images = df.body_urls.apply(
        lambda urls: sum(1 for url in urls if image_url.search(url))
    )
    features['content_media_count'] = df['image_count'] + df['video_count'] + linked_images

    # --- part-of-speech tag counts, restricted to the tagger's label set
    tag_counts = [
        Counter(tok.tag_ for tok in doc if tok.is_punct is False)
        for doc in spacy_body
    ]
    tag_frame = pd.DataFrame(
        tag_counts,
        columns=nlp.get_pipe('tagger').labels,
        index=df.index,
    ).fillna(0)
    tag_frame.columns = [f'content_pos_{tag}' for tag in tag_frame.columns]

    return pd.concat([features, tag_frame], axis=1)

# Build the content features and pass them to the project helper `save_df`
# (imported from `common`) under the name 'features_content.pickle'.
save_df(content_features(), 'features_content.pickle')

In [12]:
def named_entities_features():
    """Count named entities per label in article bodies and titles.

    Reads the notebook-level ``df``, ``spacy_body``, ``spacy_title`` and
    ``nlp``. Only labels reported by the pipeline's ``ner`` component become
    columns; a label that does not occur in a document counts as 0.

    Returns:
        DataFrame indexed like ``df`` with ``named_entity_body_<label>``
        columns followed by ``named_entity_title_<label>`` columns.
    """
    def entity_counts(docs, prefix):
        # One row per doc, one column per NER label, renamed with `prefix`.
        per_doc = [Counter(ent.label_ for ent in doc.ents) for doc in docs]
        frame = pd.DataFrame(
            per_doc,
            index=df.index,
            columns=nlp.get_pipe("ner").labels,
        ).fillna(0)
        frame.columns = [f'{prefix}{label}' for label in frame.columns]
        return frame

    body_frame = entity_counts(spacy_body, 'named_entity_body_')
    title_frame = entity_counts(spacy_title, 'named_entity_title_')

    return pd.concat([body_frame, title_frame], axis=1)

# Build the named-entity features and pass them to the project helper `save_df`
# (imported from `common`) under the name 'features_named_entities.pickle'.
save_df(named_entities_features(), 'features_named_entities.pickle')

In [13]:
def popularity_features(day):
    """Select the ``fb_*`` popularity columns of ``df`` for one ``day`` value.

    Args:
        day: value interpolated into the column names (this notebook calls
            the function with 0, 1 and 2).

    Returns:
        DataFrame indexed like ``df`` with the columns
        ``fb_ad_<day>_reaction_count``, ``fb_ad_<day>_comment_count``,
        ``fb_ad_<day>_share_count`` and ``fb_popularity_ad_<day>``.

    Raises:
        KeyError: if ``df`` lacks any of those columns.
    """
    wanted = (
        f'fb_ad_{day}_reaction_count',
        f'fb_ad_{day}_comment_count',
        f'fb_ad_{day}_share_count',
        f'fb_popularity_ad_{day}',
    )

    features = pd.DataFrame(index=df.index)
    for column in wanted:
        features[column] = df[column]

    return features

# Export one popularity feature set per day value (0, 1 and 2), each passed to
# the project helper `save_df` (imported from `common`) under its own file name.
save_df(popularity_features(0), 'features_popularity_0.pickle')
save_df(popularity_features(1), 'features_popularity_1.pickle')
save_df(popularity_features(2), 'features_popularity_2.pickle')