In [91]:
# initial imports
import pandas as pd
import numpy as np
import os
import re

# nltk sentiment analysis
import nltk
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# textblob sentiment analysis
#!pip install textblob
from textblob import TextBlob

# LDA
from sklearn.decomposition import LatentDirichletAllocation

# word processing using gensim packages
#!pip install gensim
import gensim
from gensim import corpora, models
#!python -m gensim.scripts.make_wiki
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords


data_path = "~/Desktop/NewsGenerator/data/"

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Isabelle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Isabelle/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


First, we load all data into one dataframe

In [3]:
df1 = pd.read_csv(data_path+"articles1.csv").drop("Unnamed: 0", axis=1)
df2 = pd.read_csv(data_path+"articles2.csv").drop("Unnamed: 0", axis=1)
df3 = pd.read_csv(data_path+"articles3.csv").drop("Unnamed: 0", axis=1)

In [4]:
df = pd.concat([df1, df2, df3])

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142570 entries, 0 to 42570
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           142570 non-null  int64  
 1   title        142568 non-null  object 
 2   publication  142570 non-null  object 
 3   author       126694 non-null  object 
 4   date         139929 non-null  object 
 5   year         139929 non-null  float64
 6   month        139929 non-null  float64
 7   url          85559 non-null   object 
 8   content      142570 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 10.9+ MB


### Types of Publications

we first need to figure out what types of articles are there. So, we can print the unique names of publications and how to define the project better. A couple rudimentary analysis is done to get basic statistics like counts of publications, and how I've divided up the publications.

In [6]:
pubs = np.unique(df.publication)
pubs

array(['Atlantic', 'Breitbart', 'Business Insider', 'Buzzfeed News',
       'CNN', 'Fox News', 'Guardian', 'NPR', 'National Review',
       'New York Post', 'New York Times', 'Reuters',
       'Talking Points Memo', 'Vox', 'Washington Post'], dtype=object)

In [7]:
total_count = 142570
for x in pubs:
    print(x)
    n = df[df.publication == x].shape[0]
    print("\tNumber of Articles:",n)
    print("\tFraction of the dataset:",round(n/total_count*100,2))

Atlantic
	Number of Articles: 7179
	Fraction of the dataset: 5.04
Breitbart
	Number of Articles: 23781
	Fraction of the dataset: 16.68
Business Insider
	Number of Articles: 6757
	Fraction of the dataset: 4.74
Buzzfeed News
	Number of Articles: 4854
	Fraction of the dataset: 3.4
CNN
	Number of Articles: 11488
	Fraction of the dataset: 8.06
Fox News
	Number of Articles: 4354
	Fraction of the dataset: 3.05
Guardian
	Number of Articles: 8681
	Fraction of the dataset: 6.09
NPR
	Number of Articles: 11992
	Fraction of the dataset: 8.41
National Review
	Number of Articles: 6203
	Fraction of the dataset: 4.35
New York Post
	Number of Articles: 17493
	Fraction of the dataset: 12.27
New York Times
	Number of Articles: 7803
	Fraction of the dataset: 5.47
Reuters
	Number of Articles: 10710
	Fraction of the dataset: 7.51
Talking Points Memo
	Number of Articles: 5214
	Fraction of the dataset: 3.66
Vox
	Number of Articles: 4947
	Fraction of the dataset: 3.47
Washington Post
	Number of Articles: 11114


In [8]:
df[df.publication == "Breitbart"].shape[0]

23781

In [9]:
df[df.publication == "Fox News"].shape[0]

4354

### Sentiment Analysis

In [10]:
sid = SentimentIntensityAnalyzer()

We can first take a look at NYT article.

In [11]:
df.iloc[1].title

'Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times'

In [12]:
nyt = df.iloc[1]
nyt

id                                                         17284
title          Rift Between Officers and Residents as Killing...
publication                                       New York Times
author                             Benjamin Mueller and Al Baker
date                                                  2017-06-19
year                                                        2017
month                                                          6
url                                                          NaN
content        After the bullet shells get counted, the blood...
Name: 1, dtype: object

In [13]:
sid.polarity_scores(nyt.content)

{'neg': 0.157, 'neu': 0.784, 'pos': 0.059, 'compound': -1.0}

Then, we take a look at a random Breitbart article.

In [14]:
df.iloc[10001].title

'Watch: Spicer Asked How It Feels ’To Work for a Fascist?’ In Apple Store - Breitbart'

In [15]:
bb = df.iloc[10001]
bb

id                                                         28737
title          Watch: Spicer Asked How It Feels ’To Work for ...
publication                                            Breitbart
author                                              Ian Hanchett
date                                                  2017-03-12
year                                                        2017
month                                                          3
url                                                          NaN
content        Asking @PressSec questions in Apple Store sinc...
Name: 10001, dtype: object

In [16]:
sid.polarity_scores(bb.content)

{'neg': 0.156, 'neu': 0.755, 'pos': 0.089, 'compound': -0.9282}

Despite such a different content and political polemics, the sentiment actually doesn't seem to have changed all that much.  Mostly, the articles both point to neutral.

What if we wanted to use a different sentiment analysis tool?

Let's try textblob.

In [17]:
blob_nyt = TextBlob(nyt.content)
blob_bb = TextBlob(bb.content)

In [18]:
blob_nyt.sentiment

Sentiment(polarity=-0.0017503893246467515, subjectivity=0.3917658789688492)

In [20]:
blob_bb.sentiment

Sentiment(polarity=0.028571428571428574, subjectivity=0.3380952380952381)

Seems like a random breitbart article is more polar than a random nyt article. How about we try a couple more?

Breitbart:

In [21]:
bb1 = df.iloc[10003]
bb2 = df.iloc[10005]
bb3 = df.iloc[10007]

In [22]:
TextBlob(bb1.content).sentiment

Sentiment(polarity=0.06732348111658457, subjectivity=0.45755336617405584)

In [23]:
TextBlob(bb2.content).sentiment

Sentiment(polarity=0.09898932506887052, subjectivity=0.35658057851239666)

In [24]:
TextBlob(bb3.content).sentiment

Sentiment(polarity=-0.0142857142857143, subjectivity=0.7)

In [25]:
bb3.title

'Noonan: America Is Not Obsessed With the ’Comey Drama’ Like DC - Breitbart'

In [26]:
nyt1 = df.iloc[3]
nyt2 = df.iloc[5]
nyt3 = df.iloc[7]

TextBlob(nyt1.content).sentiment

Sentiment(polarity=0.1297952794444023, subjectivity=0.43564610178645263)

In [27]:
TextBlob(nyt2.content).sentiment

Sentiment(polarity=-0.0678030303030303, subjectivity=0.5378787878787878)

In [28]:
TextBlob(nyt3.content).sentiment

Sentiment(polarity=0.062423583212209866, subjectivity=0.45431057616679965)

In [29]:
nyt1.title

'Among Deaths in 2016, a Heavy Toll in Pop Music - The New York Times'

Seems like there isn't too much difference in textblob's sentiment analysis either, just from random samples of a few.

### Topic Clustering with LDA

First we divide up the large dataset by the names of publication.

In [34]:
pubs

array(['Atlantic', 'Breitbart', 'Business Insider', 'Buzzfeed News',
       'CNN', 'Fox News', 'Guardian', 'NPR', 'National Review',
       'New York Post', 'New York Times', 'Reuters',
       'Talking Points Memo', 'Vox', 'Washington Post'], dtype=object)

In [43]:
pubs_df = {}

for x in pubs:
    pubs_df[x] = df[df.publication == x].content


In [46]:
bb = pubs_df["Breitbart"].reindex()

In [95]:
bb = bb.values

In [114]:
stop_words = stopwords.words('english')

def corpus_process(corpus):
    return nltk.tokenize.sent_tokenize(corpus)

def sentence_process(processed_corpus):
    tokenized_sentences = []
    for sentence in processed_corpus:
        tokenized_sentences.append(simple_preprocess(sentence, deacc=True))
    return tokenized_sentences

def process_entire_publication(pub):
    processed_pubs = []
    
    for article in pub:
        processed_corpus = corpus_process(article)
        tokenized_corpus = sentence_process(processed_corpus)
        processed_pubs.append(tokenized_corpus)
    
    return processed_pubs

In [133]:
processed_bb = process_entire_publication(bb)    

In [154]:
def compose_corpus(processed_pub):
    corpus = []

    for articles in processed_pub:
        a = []
        for s in articles:
            for t in s:
                if t not in stop_words:
                    a.append(t)
        corpus.append(a)
    return corpus

In [155]:
texts = compose_corpus(processed_bb)

In [156]:
id2word = corpora.Dictionary(texts)

In [157]:
corpus = [id2word.doc2bow(text) for text in texts]

In [160]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,\
                                           id2word=id2word,\
                                           num_topics=20, \
                                           random_state=0,\
                                           update_every=1,\
                                           chunksize=100,\
                                           passes=10,\
                                           alpha='auto',\
                                           per_word_topics=True)

In [161]:
lda_model.print_topics()

[(0,
  '0.048*"muslim" + 0.042*"islamic" + 0.027*"muslims" + 0.025*"attack" + 0.023*"attacks" + 0.023*"terrorist" + 0.022*"islam" + 0.018*"israel" + 0.017*"terror" + 0.014*"state"'),
 (1,
  '0.120*"women" + 0.038*"men" + 0.032*"children" + 0.031*"woman" + 0.027*"sexual" + 0.026*"child" + 0.023*"sex" + 0.022*"planned" + 0.022*"abortion" + 0.019*"cent"'),
 (2,
  '0.039*"said" + 0.028*"people" + 0.017*"going" + 0.016*"think" + 0.016*"like" + 0.015*"would" + 0.014*"get" + 0.013*"know" + 0.011*"want" + 0.011*"say"'),
 (3,
  '0.054*"mr" + 0.044*"migrants" + 0.033*"party" + 0.031*"european" + 0.031*"eu" + 0.028*"europe" + 0.025*"london" + 0.023*"britain" + 0.021*"british" + 0.018*"german"'),
 (4,
  '0.092*"school" + 0.046*"education" + 0.045*"students" + 0.034*"common" + 0.033*"core" + 0.032*"virus" + 0.032*"schools" + 0.030*"cases" + 0.027*"records" + 0.021*"district"'),
 (5,
  '0.154*"china" + 0.067*"wikileaks" + 0.061*"chinese" + 0.027*"lucas" + 0.025*"vietnam" + 0.024*"revelations" + 0.01