# News Article and Title NLP and LDA

# 1. Natural Language Processing
Adapted NLP portions from https://github.com/GarrettHoffman/digital_media_shares_optimization

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [None]:
# Load the article data set from File.csv; the feature-engineering cell below
# reads its 'Title' and 'ArticleText' columns.
inputData = pd.read_csv('File.csv')

In [None]:
# Show the (rows, columns) dimensions of the loaded data.
inputData.shape

In [None]:
# Row count of the loaded data set (same number as inputData.shape[0]).
len(inputData)

In [None]:
# Preview the first rows of the loaded data.
inputData.head()

# feature engineering

In [None]:
from __future__ import division
import numpy as np
import pymongo
import nltk
from textblob import TextBlob
import string
from nltk.corpus import stopwords
from textstat.textstat import textstat

In [None]:
stop = stopwords.words('english')
# Frozen-set copy of the stop-word list so each membership test is O(1)
# instead of a scan over the whole list.
_STOP_WORDS = frozenset(stop)


def _split_words(text):
    """Return the whitespace-separated words of `text` after deleting every
    character found in string.punctuation."""
    return "".join(c for c in text if c not in string.punctuation).split()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is 0."""
    return numerator / denominator if denominator else 0


def _mean_or_zero(values):
    """Return np.mean(values), or 0 for an empty list (np.mean would return
    nan and emit a RuntimeWarning)."""
    return np.mean(values) if values else 0


def engineer_NLP_features(inputData):

    """
    Generate NLP features (related to language and sentiment)
    for one article, to be used in predicting its number of reads.

    Arguments:
    inputData: a single record (for example one DataFrame row) that
               supports inputData['Title'] and inputData['ArticleText'].
               Both values are converted with str() before processing.

    Output:
    A one-row pandas DataFrame whose columns are "headline", "content"
    and the engineered features listed in the `res` dictionary below.

    Notes:
    Every ratio or average whose denominator would be empty evaluates
    to 0, the same value used for the polarity statistics when an
    article has no positive or no negative words.
    """

    # get article headline and article content
    headline = str(inputData['Title'])
    content = str(inputData['ArticleText'])

    # ---------------- headline features ----------------

    title_words = _split_words(headline)
    title_blob = TextBlob(headline)

    # number of words in title (punctuation kept, whitespace split)
    n_tokens_title = len(headline.split())

    # subjectivity / polarity / absolute polarity of the whole title
    title_subjectivity = title_blob.subjectivity
    title_sentiment_polarity = title_blob.polarity
    title_sentiment_abs_polarity = abs(title_sentiment_polarity)

    # average word length, punctuation excluded
    average_token_length_title = _mean_or_zero([len(w) for w in title_words])

    # ---------------- content features ----------------

    # Strip punctuation once and reuse the result for every feature below.
    content_words = _split_words(content)
    lowered_words = [w.lower() for w in content_words]

    # The stop-word test is made on the lower-cased word: the NLTK list is
    # lower-case, so testing the raw word would count "The" or "And" at the
    # start of a sentence as non-stop words.
    non_stop_words = [w for w in lowered_words if w not in _STOP_WORDS]

    # number of words (punctuation kept, whitespace split); this is the
    # denominator of the three rates that follow
    n_tokens_content = len(content.split())

    # rate of unique words
    r_unique_tokens = _safe_ratio(len(set(lowered_words)), n_tokens_content)

    # rate of non-stop words
    r_non_stop_words = _safe_ratio(len(non_stop_words), n_tokens_content)

    # rate of unique non-stop words
    r_non_stop_unique_tokens = _safe_ratio(len(set(non_stop_words)),
                                           n_tokens_content)

    # average word length, punctuation excluded
    average_token_length_content = _mean_or_zero(
        [len(w) for w in content_words])

    # subjectivity / polarity / absolute polarity of the whole article
    content_blob = TextBlob(content)
    global_subjectivity = content_blob.subjectivity
    global_sentiment_polarity = content_blob.polarity
    global_sentiment_abs_polarity = abs(global_sentiment_polarity)

    # Polarity of each word. TextBlob is evaluated once per distinct word and
    # the result is looked up for every occurrence, so the list still has one
    # entry per word occurrence, in text order.
    polarity_by_word = {w: TextBlob(w).polarity for w in set(content_words)}
    polarities = [polarity_by_word[w] for w in content_words]
    positive = [p for p in polarities if p > 0]
    negative = [p for p in polarities if p < 0]
    n_non_neutral = len(positive) + len(negative)

    # global positive / negative word rate (among all words)
    global_rate_positive_words = _safe_ratio(len(positive), len(polarities))
    global_rate_negative_words = _safe_ratio(len(negative), len(polarities))

    # positive / negative word rate (among non-neutral words)
    rate_positive_words = _safe_ratio(len(positive), n_non_neutral)
    rate_negative_words = _safe_ratio(len(negative), n_non_neutral)

    # average, minimum and maximum polarity of positive words
    avg_positive_polarity = _mean_or_zero(positive)
    min_positive_polarity = min(positive) if positive else 0
    max_positive_polarity = max(positive) if positive else 0

    # average, minimum and maximum polarity of negative words
    avg_negative_polarity = _mean_or_zero(negative)
    min_negative_polarity = min(negative) if negative else 0
    max_negative_polarity = max(negative) if negative else 0

    # abs maximum polarity: max positive polarity plus the absolute value of
    # the most negative polarity
    max_abs_polarity = max_positive_polarity + abs(min_negative_polarity)

    # Flesch Reading Ease
    global_reading_ease = textstat.flesch_reading_ease(content)

    # Flesch Kincaid Grade Level
    global_grade_level = textstat.flesch_kincaid_grade(content)

    res = {
        "headline": inputData["Title"],
        "content": inputData["ArticleText"],
        "n_tokens_title": n_tokens_title,
        "title_subjectivity": title_subjectivity,
        "title_sentiment_polarity": title_sentiment_polarity,
        "title_sentiment_abs_polarity": title_sentiment_abs_polarity,
        "average_token_length_title": average_token_length_title,
        "n_tokens_content": n_tokens_content,
        "r_unique_tokens": r_unique_tokens,
        "r_non_stop_words": r_non_stop_words,
        "r_non_stop_unique_tokens": r_non_stop_unique_tokens,
        "average_token_length_content": average_token_length_content,
        "global_subjectivity": global_subjectivity,
        "global_sentiment_polarity": global_sentiment_polarity,
        "global_sentiment_abs_polarity": global_sentiment_abs_polarity,
        "global_rate_positive_words": global_rate_positive_words,
        "global_rate_negative_words": global_rate_negative_words,
        "rate_positive_words": rate_positive_words,
        "rate_negative_words": rate_negative_words,
        "avg_positive_polarity": avg_positive_polarity,
        "min_positive_polarity": min_positive_polarity,
        "max_positive_polarity": max_positive_polarity,
        "avg_negative_polarity": avg_negative_polarity,
        "min_negative_polarity": min_negative_polarity,
        "max_negative_polarity": max_negative_polarity,
        "max_abs_polarity": max_abs_polarity,
        "global_reading_ease": global_reading_ease,
        "global_grade_level": global_grade_level,
    }

    # orient='index' yields one row per feature; transposing gives a single
    # row with one column per feature.
    return pd.DataFrame.from_dict(res, orient='index').transpose()

In [None]:
# Build one single-row feature frame per article. Only the first 142 rows of
# inputData are processed.
feature_frames = [engineer_NLP_features(row)
                  for _, row in inputData.head(142).iterrows()]

# Stack the rows with one pd.concat call. This does not depend on the first
# index label being 0 and does not rely on DataFrame.append, which pandas 2.0
# removed. As with repeated append, each row keeps the index label of its
# one-row frame. With no input rows the result is an empty DataFrame, so the
# export cell that follows still works.
res = pd.concat(feature_frames) if feature_frames else pd.DataFrame()

print(res)

In [None]:
# Write the engineered feature table to res_dataset.csv.
res.to_csv('res_dataset.csv')

# 2. LDA model features

In [None]:

import pandas as pd
import re
import time
import nltk
from numpy import *

In [None]:
# Download the NLTK corpora used below: 'stopwords' backs stopwords.words()
# and 'wordnet' backs WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaModel

In [None]:
import pyLDAvis
import pyLDAvis.gensim

In [None]:
# Load File.csv (the same file read in section 1) into a separate frame for
# the LDA section; the cleaning cell reads its 'ArticleText' column.
df = pd.read_csv('File.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded data.
df.shape

# Clean the reviews
Lemmatization usually refers to reducing words properly, using a vocabulary and morphological analysis, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.

In [None]:
%%time
wordnet_lemmatizer = WordNetLemmatizer()
stopset = list(set(stopwords.words('english')))
clean_reviews_text = []
for review in df['ArticleText']:  # Loop through the tokens (the words or symbols) in each review. 
    try:    
        cleaned_review = re.sub("[^a-zA-Z]"," ", review)  # Remove numbers and punctuation.
        cleaned_review = cleaned_review.lower()  # Convert the text to lower case.
        cleaned_review = ' '.join([word for word in cleaned_review.split() if word not in stopset])  # Keep only words that are not stopwords.
        cleaned_review = ' '.join([wordnet_lemmatizer.lemmatize(word, pos='n') for word in cleaned_review.split()])  # Keep each noun's lemma.
        cleaned_review = ' '.join([wordnet_lemmatizer.lemmatize(word, pos='v') for word in cleaned_review.split()])  # Keep each verb's lemma.
        cleaned_review = re.sub(r"(http\S+)"," ", cleaned_review)  # Remove http links.
        cleaned_review = ' '.join(cleaned_review.split())  # Remove white space.
    except TypeError:
        pass
    clean_reviews_text.append(cleaned_review)

In [None]:
# Store the cleaned texts as a new column (one entry per row of df) and
# preview the result.
df['cleanText'] = clean_reviews_text
df.head()

In [None]:
# Show the first article before and after cleaning, separated by a line
# containing a single space.
print('ORIGINAL: ' + df['ArticleText'].iloc[0])
print(' ')
print('CLEANED: ' + df['cleanText'].iloc[0])

In [None]:
# Save the cleaned-text column to cleanText.csv.
df['cleanText'].to_csv('cleanText.csv')

# Perform semantic analysis using LDA.
Preprocess the reviews by creating a dictionary of words used and a bag-of-words corpus. Note that each of the steps below takes several minutes.

In [None]:
%%time
tokens_by_doc = [review.split() for review in clean_reviews_text]
dictionary = corpora.Dictionary(tokens_by_doc)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokens_by_doc]

In [None]:
%%time
# Fit a 10-topic LDA model on the bag-of-words corpus; id2word supplies the
# id-to-word mapping, and random_state is fixed at 201.
lda_model = LdaModel(bow_corpus, num_topics=10, id2word=dictionary, random_state=201)

In [None]:
%%time
topics_list = []
for doc_topics in lda_model.get_document_topics(bow_corpus):
    topics_list.append(sorted(doc_topics, key=lambda doc: -doc[1])[0][0])

In [None]:
# Inspect the topic distribution the model reports for the first document.
lda_model.get_document_topics(bow_corpus[0])

In [None]:
# Attach each document's highest-probability topic id as a new column and
# preview the result.
df['Top Topic'] = topics_list
df.head()

In [None]:
# List the top 20 words of topic id 5 as reported by the model.
lda_model.show_topic(topicid=5, topn=20)

# Visualize your LDA results
Preparing the visualization will take several minutes.

In [None]:
%%time
# Build the pyLDAvis data from the fitted model, the bag-of-words corpus and
# the dictionary; the display cell below renders it.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)

In [None]:
# NOTE(review): this only binds a global named `sort`; nothing visible in this
# notebook reads it. It looks like a leftover keyword argument -- confirm, and
# remove if unused.
sort=False

In [None]:
%%time
# Render the prepared visualization in the notebook.
pyLDAvis.display(LDAvis_prepared)

In [None]:
import pandas as pd

# One dict per document built from the (topic id, value) pairs the model
# returns for that document's bag of words.
mixture = [dict(lda_model[doc_bow]) for doc_bow in bow_corpus]

# One row per document, one column per topic id, written to CSV.
mixture_table = pd.DataFrame(mixture)
mixture_table.to_csv("topic_mixture86.csv")

In [None]:
# Collect (topic id, word, weight) rows holding the top five words of every
# topic, in topic order.
top_words_per_topic = [
    (topic_id,) + word_weight
    for topic_id in range(lda_model.num_topics)
    for word_weight in lda_model.show_topic(topic_id, topn = 5)
]

# Write the rows to CSV under the column names Topic / Word / P.
top_words_frame = pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P'])
top_words_frame.to_csv("top_words86.csv")