In [1]:
import sys

import importlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sqlalchemy
import logging
import empath

import common
import util
importlib.reload(common)
importlib.reload(util)

from common import create_engine
from common import display_all
from common import figsize
from common import save_df, load_df
from common import save_session, load_session

from util import show_importances
from util import split_X_y_all, split_X_y, split_data
from util import empty_features, column_feature, str_contains

from pbar import Pbar

from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters() # converters e.g. for datetime in plots
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [135]:
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
from collections import Counter

import textstat

import spacy
importlib.reload(spacy)
nlp = spacy.load("en_core_web_sm")

ImportError: cannot import name 'prefer_gpu' from 'thinc.neural.util' (C:\Users\kamko\Anaconda3\lib\site-packages\thinc\neural\util.py)

In [29]:
df = load_df('final_data.pickle')
print(len(df))

20279


In [44]:
d2v_features = pd.DataFrame(index=df.index)
tfidf_features = pd.DataFrame(index=df.index)
popularity_features = pd.DataFrame(index=df.index)
named_entities_features

---

In [30]:
spacy_body = list(nlp.pipe(df.body))
spacy_title = list(nlp.pipe(df.title))

In [37]:
tblob_body = [TextBlob(i) for i in df.body]
tblob_title = [TextBlob(i) for i in df.title]

---

In [51]:
def sentiment_features():
    res = pd.DataFrame(index=df.index)

    body_sentiment = [i.sentiment for i in tblob_body]
    res['sentiment_body_subjectivity'] = [i.subjectivity for i in body_sentiment]
    res['sentiment_body_polarity'] = [i.polarity for i in body_sentiment]
    
    
    title_sentiment = [i.sentiment for i in tblob_title]
    res['sentiment_title_subjectivity'] = [i.subjectivity for i in title_sentiment]
    res['sentiment_title_polarity'] = [i.polarity for i in title_sentiment]
    
    return res

save_df(sentiment_features(), 'features_sentiment.pickle')

In [53]:
def readability_features():
    res = pd.DataFrame(index=df.index)
    
    res['readability_ari'] = df.body.apply(textstat.automated_readability_index)
    res['readability_fcgl'] = df.body.apply(textstat.flesch_kincaid_grade)
    res['readability_frei'] = df.body.apply(textstat.flesch_reading_ease)
    res['readability_gfi'] = df.body.apply(textstat.gunning_fog)
    res['readability_cli'] = df.body.apply(textstat.coleman_liau_index)
    
    return res

save_df(readability_features(), 'features_readability.pickle')

In [54]:
def metadata_features():
    res = pd.DataFrame(index=df.index)
    
    res['metadata_published_at_day'] = df.published_at.dt.weekday + 1
    res['metadata_source_name'] = df.source_name
    
    return res

save_df(metadata_features(), 'features_metadata.pickle')

2020-05-04 19:13:56,791 : INFO : Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-05-04 19:13:56,792 : INFO : NumExpr defaulting to 8 threads.


In [75]:
def empath_features():
    analyzer = empath.Empath()
    
    
    res = []
    for doc in spacy_body:
        lemmatized_doc = ' '.join([token.lemma_ for token in doc])
        em_vals = {f'empath_{k}': v for k,v in analyzer.analyze(lemmatized_doc).items()}
        res.append(em_vals)
    
    return pd.DataFrame(res, index=df.index)

save_df(empath_features(), 'features_empath.pickle')

In [126]:
def content_features():
    res = pd.DataFrame(index=df.index)
        
    res['content_title_word_count'] = [len([t for t in doc 
                                         if t.is_punct is False and t.is_stop is False])
                                    for doc in spacy_title]
    
    
    res['content_body_word_count'] = [len([t for t in doc 
                                         if t.is_punct is False and t.is_stop is False])
                                    for doc in spacy_body]
       
    res['content_avg_word_len'] = [np.mean([len(t.text) for t in doc 
                                         if t.is_punct is False and t.is_stop is False])
                                    for doc in spacy_body]
    
    res['content_sentence_count'] = [len(list(doc.sents)) for doc in spacy_body]
    
    res['content_word_over_5ch'] = [len([t for t in doc 
                                         if t.is_punct is False and t.is_stop is False and len(t.text) > 5])
                                    for doc in spacy_body]
    
    res['content_stop_words_count'] = [len([t for t in doc if t.is_stop]) for doc in spacy_body]
    
    res['content_body_len'] = df.body.apply(lambda x: len(x))
    res['content_title_len'] = df.title.apply(lambda x: len(x))
    res['content_?_count'] = df.body.apply(lambda x: x.count('?'))
    res['content_!_count'] = df.body.apply(lambda x: x.count('!'))
    res['content_..._count'] = df.body.apply(lambda x: x.count('...'))
    
    res['content_media_count'] = df['image_count'] + df['video_count']
    
    pos_tags = [Counter([t.tag_ for t in doc if t.is_punct is False]) for doc in spacy_body]
    pos_tags_df = pd.DataFrame(pos_tags, columns=nlp.get_pipe('tagger').labels, index=df.index).fillna(0)
    pos_tags_df.columns = [f'content_pos_{c}' for c in pos_tags_df.columns]
    
    res = pd.concat([res, pos_tags_df], axis=1)
    
    return res

save_df(content_features(), 'features_content.pickle')

In [None]:
def named_entities_features():
    
    ners = [Counter([t.label_ for t in doc.ents]) for doc in spacy_body]
    res = pd.DataFrame(ners, index=df.index. columns=nlp.get_pipe("ner").labels).fillna(0)
    res.columns = [f'named_entity_{c}' for c in res.columns]
    
    return res


In [131]:
for i in spacy_body[0].ents:
    print(i.text, i.label_)
    
nlp.get_pipe("ner").labels

thousands of years DATE
first ORDINAL
thousands of years DATE
around 1000 CARDINAL
the past few decades DATE
2 CARDINAL
one CARDINAL
one CARDINAL
first ORDINAL
daily DATE
daily DATE
days DATE
4 CARDINAL
first ORDINAL
over 1000 CARDINAL
only one CARDINAL
two CARDINAL
75% PERCENT
second ORDINAL
another fall season DATE
first ORDINAL


AttributeError: 'spacy.pipeline.EntityRecognizer' object has no attribute 'labels'