In [81]:
import pandas as pd
import textblob
from textblob import TextBlob
import dateutil
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords ##Note you'll need to download NLTK and corpuses
from spacy.en import English ##Note you'll need to install Spacy and download its dependencies
# Module-level spaCy English parser instance; tokenizeText() calls it on the cleaned text.
parser = English()
import string
import re
import gensim

In [22]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    """Normalize raw text before it is tokenized and vectorized.

    Steps, in order:
      1. Trim both ends and turn newlines / carriage returns into spaces.
      2. Replace Twitter-style @mentions with the placeholder ``@MENTION``.
      3. Decode the HTML entities ``&amp;`` -> ``and``, ``&gt;`` -> ``>``,
         ``&lt;`` -> ``<``.
      4. Drop every character that is not an ASCII letter or a space.
      5. Lowercase the result.

    Entity decoding has to happen before step 4: once ``&`` and ``;`` have
    been stripped the entities can no longer be recognised and ``&amp;`` would
    leak into the output as the word ``amp``.

    Step 4 also removes the ``@`` of the placeholder, so a mention ends up as
    the plain word ``mention`` in the returned text.

    Args:
        text (str): raw text to clean.

    Returns:
        str: lowercased text made only of ``a-z`` and spaces.
    """
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")

    # replace twitter @mentions
    text = re.sub(r"@[a-z0-9_]{1,15}", "@MENTION", text, flags=re.IGNORECASE)

    # replace HTML symbols (must precede the punctuation strip below)
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # keep ASCII letters and spaces only
    text = re.sub('[^a-zA-Z ]', '', text)

    # lowercase
    return text.lower()

# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):
    """Tokenize ``sample`` with spaCy and return its filtered lemmas.

    The text is normalized with ``cleanText`` and parsed with the module-level
    ``parser``.  Every token is mapped to its lemma (lowercased and stripped);
    a token whose lemma is the literal ``"-PRON-"`` keeps its lowercased
    surface form instead.  Lemmas that appear in ``STOPLIST`` or ``SYMBOLS``,
    and blank / newline-only leftovers, are discarded.

    Args:
        sample (str): raw document text.

    Returns:
        list[str]: the remaining lemmas, in document order.
    """
    # leftovers that carry no content
    blanks = ("", " ", "\n", "\n\n")

    kept = []
    for tok in parser(cleanText(sample)):
        # lemmatize; "-PRON-" lemmas fall back to the lowercased token text
        lemma = tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_

        # One pass over the tokens replaces the separate stoplist / symbol
        # filters and the repeated list.remove() scans (quadratic on long texts).
        if lemma in STOPLIST or lemma in SYMBOLS or lemma in blanks:
            continue
        kept.append(lemma)

    return kept

# Custom stoplist: NLTK's English stopwords, a few contraction fragments,
# and scikit-learn's ENGLISH_STOP_WORDS, merged into a single set.
STOPLIST = (
    set(stopwords.words('english'))
    | {"n't", "'s", "'m", "ca"}
    | set(ENGLISH_STOP_WORDS)
)
# Symbol tokens to discard: every ASCII punctuation character individually,
# followed by a handful of multi-character / typographic leftovers.
SYMBOLS = list(string.punctuation) + ["-----", "---", "...", "“", "”", "'ve"]

In [119]:
# Load the pickled articles DataFrame from the working directory.
# NOTE(review): unpickling can execute arbitrary code; only load this file if it comes from a trusted source.
ps_df = pd.read_pickle('ps_df_2.pkl')

In [None]:
## Convert to date time
def conv_date(x):
    """Parse a textual date into a ``datetime.datetime``.

    Args:
        x (str): date/time string in a format dateutil's parser accepts.

    Returns:
        datetime.datetime: the parsed value.

    Raises:
        ValueError: if dateutil cannot interpret ``x`` as a date.
    """
    # Import the submodule explicitly: a bare ``import dateutil`` does not
    # guarantee that ``dateutil.parser`` has been loaded, so the attribute
    # lookup would otherwise depend on another library having imported it.
    from dateutil import parser as date_parser
    return date_parser.parse(x)

# Replace the date_published column with its parsed datetime values
ps_df['date_published'] = ps_df['date_published'].apply(conv_date)

In [None]:
## Add sentiment analysis
# TextBlob polarity per article.  apply() returns the scores on ps_df's own
# index, so the column assignment below lines up row-for-row whatever index
# the pickled frame carries (no assumption of labels 0..n-1).
sentiment_score = ps_df['articles'].apply(lambda article: TextBlob(article).sentiment.polarity)

## Normalize sentiment score
# Min-max scale to [0, 1].  (x - min) / (max - min) matches the previous
# abs(min)-shift whenever the lowest polarity is <= 0, and stays correct when
# every article is positive (where shifting by abs(min) does not start at 0).
score_min = sentiment_score.min()
score_range = sentiment_score.max() - score_min
if score_range == 0:
    # all articles share one polarity: nothing to scale, map every score to 0
    sentiment_score = sentiment_score * 0.0
else:
    sentiment_score = (sentiment_score - score_min) / score_range

ps_df['sent_score'] = sentiment_score

In [None]:
## Add subjectivity analysis
# TextBlob subjectivity per article.  apply() keeps the scores on ps_df's own
# index, so the assignment aligns row-for-row even if the frame's index is
# not 0..n-1 (label lookups over range(n) would fail or misalign there).
subjectivity_score = ps_df['articles'].apply(lambda article: TextBlob(article).sentiment.subjectivity)
ps_df['subj_score'] = subjectivity_score

In [None]:
## Add length of article
# Word count = number of whitespace-separated tokens.  Iterating the column
# directly (instead of label lookups over range(n)) works for any index, and
# assigning a plain list is positional, so rows cannot be misaligned.
length = [len(article.split()) for article in ps_df['articles']]
ps_df['length'] = length

## Limit to articles with at least 500 words
# reset_index(drop=True) renumbers the surviving rows 0..n-1
ps_df = ps_df[ps_df['length'] >= 500].reset_index(drop=True)

In [None]:
## Add summary to each article
# One gensim summary per article, collected in row order
summaries = [
    gensim.summarization.summarize(ps_df.loc[row, 'articles'])
    for row in range(ps_df.shape[0])
]
ps_df['summary'] = pd.Series(summaries)

In [None]:
## Countvectorizer
# Count vectorizer that tokenizes with the custom spaCy-based tokenizeText
countvectorizer = CountVectorizer(
    tokenizer=tokenizeText,
    strip_accents='unicode',
    ngram_range=(1, 4),
    min_df=0.005,
    max_df=0.995,
)

In [None]:
# Fit the vectorizer on the article texts and build the document-term count matrix
count_vector = countvectorizer.fit_transform(ps_df['articles'])

In [None]:
# Dense term-count table, one column per vocabulary term.
# vocabulary_ maps term -> column index of count_vector; iterating the dict
# yields terms in insertion order, not column order, which would label the
# columns with the wrong terms.  Sorting the terms by their index restores the
# matrix's column order.
count_df = pd.DataFrame(
    count_vector.toarray(),
    columns=sorted(countvectorizer.vocabulary_, key=countvectorizer.vocabulary_.get),
)

In [None]:
# Put the article columns and the term-count columns side by side (rows are matched on index)
processed_df = pd.concat([ps_df, count_df], axis=1)