# In this notebook I create topics for the high-comment articles and the low-comment articles. My most successful models used LDA on count vectors (bag-of-words), but I also show the results of LDA on TF-IDF-weighted input.

In [1]:
import pandas as pd
import sklearn
import gensim
import numpy as np
import nltk
import os
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import Normalizer
from sklearn import metrics

Populating the interactive namespace from numpy and matplotlib


In [2]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 and parse_dates=True reproduces its defaults
# (first column as the index, index parsed as dates where possible).
df = pd.read_csv("CleanedApril2018.csv", index_col=0, parse_dates=True)

  """Entry point for launching an IPython kernel.


# We're going to start by looking at low-comment articles

In [3]:
# Low-comment subset: every row whose Outcome is anything other than 1.
df_nocom = df.loc[df["Outcome"] != 1]

In [4]:
from nltk.corpus import stopwords
nltk.download('punkt')
# stopwords.words() reads the separate 'stopwords' corpus; without this
# download the next line raises LookupError on a fresh environment.
nltk.download('stopwords')
# Base English stop-word list, kept as a set for fast membership tests.
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/JonathonBowyer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
# Extend the stop-word set with extra terms that should never become
# topic words. Set union is idempotent, so re-running the cell is safe.
stopword = stopword | {
    "trump", "donald", "us", "politics", "j", "united", "states",
    "new", "york", "inc", "ny", "city", "manhattan", "government", "nyc",
}

In [33]:
def chapter_reader_nocom():
    """Yield one lazy token stream per row of ``df_nocom['keywords']``.

    Each entry is tokenized with ``gensim.utils.tokenize`` (lower-cased,
    de-accented, decode errors ignored) and tokens found in the global
    ``stopword`` set are skipped. Every yielded item is a generator, so it
    can be consumed only once.
    """
    for keywords in df_nocom['keywords']:
        tokens = gensim.utils.tokenize(
            keywords, lowercase=True, deacc=True, errors="ignore"
        )
        yield (token for token in tokens if token not in stopword)

In [34]:
# First pass over the low-comment keywords: build the token -> id vocabulary.
dictionary = gensim.corpora.Dictionary(chapter_reader_nocom())
# Prune the vocabulary: no_below=1 keeps even single-document tokens,
# no_above=0.8 drops tokens present in more than 80% of documents, and
# keep_n caps the vocabulary at 100000 entries.
dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=100000)
# Second pass: turn every document into its bag-of-words (id, count) pairs.
corpus_nocom = [dictionary.doc2bow(tokens) for tokens in chapter_reader_nocom()]

In [40]:
# Fit a 3-topic LDA model on the raw term counts of the low-comment
# articles; random_state pins the seed used for this run.
lda_nocom = gensim.models.LdaModel(
    corpus=corpus_nocom,
    id2word=dictionary,
    num_topics=3,
    random_state=23,
)
lda_nocom.print_topics()

[(0,
  '0.015*"tv" + 0.015*"program" + 0.008*"television" + 0.006*"art" + 0.005*"estate" + 0.005*"puzzles" + 0.005*"crossword" + 0.005*"real" + 0.005*"housing" + 0.005*"photography"'),
 (1,
  '0.007*"housing" + 0.007*"international" + 0.007*"real" + 0.007*"estate" + 0.006*"relations" + 0.006*"residential" + 0.006*"state" + 0.005*"elections" + 0.005*"play" + 0.005*"news"'),
 (2,
  '0.012*"theater" + 0.006*"play" + 0.005*"women" + 0.005*"girls" + 0.004*"crimes" + 0.004*"book" + 0.004*"program" + 0.004*"estate" + 0.004*"tv" + 0.004*"education"')]

# Now we're going to look at high-comment articles

In [44]:
# High-comment subset: rows whose Outcome equals 1.
df_com = df.loc[df["Outcome"] == 1]

In [45]:
from nltk.corpus import stopwords
nltk.download('punkt')
# stopwords.words() reads the separate 'stopwords' corpus; without this
# download the next line raises LookupError on a fresh environment.
nltk.download('stopwords')
# Rebind `stopword` to the plain English list (as a set); any extra terms
# added to it earlier are discarded by this assignment.
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/JonathonBowyer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [46]:
# Add the extra terms that should never become topic words to the
# stop-word set. Set union is idempotent, so re-running the cell is safe.
stopword = stopword | {
    "trump", "donald", "us", "politics", "j", "united", "states",
    "new", "york", "inc", "ny", "city", "manhattan", "government", "nyc",
}

In [47]:
def chapter_reader_com():
    """Yield one lazy token stream per row of ``df_com['keywords']``.

    Each entry is tokenized with ``gensim.utils.tokenize`` (lower-cased,
    de-accented, decode errors ignored) and tokens found in the global
    ``stopword`` set are skipped. Every yielded item is a generator, so it
    can be consumed only once.
    """
    for keywords in df_com['keywords']:
        tokens = gensim.utils.tokenize(
            keywords, lowercase=True, deacc=True, errors="ignore"
        )
        yield (token for token in tokens if token not in stopword)

In [48]:
# First pass over the high-comment keywords: build the token -> id
# vocabulary. This rebinds the global `dictionary` name.
dictionary = gensim.corpora.Dictionary(chapter_reader_com())
# Prune the vocabulary: no_below=1 keeps even single-document tokens,
# no_above=0.8 drops tokens present in more than 80% of documents, and
# keep_n caps the vocabulary at 100000 entries.
dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=100000)
# Second pass: turn every document into its bag-of-words (id, count) pairs.
corpus_com = [dictionary.doc2bow(tokens) for tokens in chapter_reader_com()]

In [51]:
# Fit a 3-topic LDA model on the raw term counts of the high-comment
# articles; random_state pins the seed used for this run.
lda_com = gensim.models.LdaModel(
    corpus=corpus_com,
    id2word=dictionary,
    num_topics=3,
    random_state=5,
)
lda_com.print_topics()

[(0,
  '0.014*"elections" + 0.012*"party" + 0.007*"republican" + 0.006*"state" + 0.006*"ties" + 0.006*"democratic" + 0.006*"national" + 0.006*"house" + 0.005*"international" + 0.005*"associates"'),
 (1,
  '0.007*"international" + 0.006*"elections" + 0.005*"house" + 0.005*"jr" + 0.004*"news" + 0.004*"control" + 0.004*"representatives" + 0.004*"federal" + 0.004*"department" + 0.004*"syria"'),
 (2,
  '0.014*"international" + 0.011*"relations" + 0.007*"defense" + 0.007*"elections" + 0.007*"interference" + 0.007*"department" + 0.007*"associates" + 0.006*"russian" + 0.006*"military" + 0.006*"ties"')]

# Now we'll do the same, except using TF-IDF weights instead of raw counts. We'll start with the low-comment articles.

In [9]:
def chapter_reader_nocom():
    """Stream the low-comment keyword documents as filtered token generators.

    For every value in ``df_nocom['keywords']`` this yields a generator over
    the tokens produced by ``gensim.utils.tokenize`` (lowercase=True,
    deacc=True, errors="ignore"), leaving out any token that is in the
    global ``stopword`` set.
    """
    for keyword_text in df_nocom['keywords']:
        raw_tokens = gensim.utils.tokenize(
            keyword_text, lowercase=True, deacc=True, errors="ignore"
        )
        yield (word for word in raw_tokens if word not in stopword)

In [10]:
# Rebuild the vocabulary and bag-of-words corpus for the low-comment
# articles; these feed the TF-IDF model.
dictionary = gensim.corpora.Dictionary(chapter_reader_nocom())
# Same pruning as before: drop tokens in more than 80% of documents and
# cap the vocabulary at 100000 entries (no_below=1 removes nothing).
dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=100000)
# The reader is called a second time because its generators are single-use.
corpus_nocom = [dictionary.doc2bow(tokens) for tokens in chapter_reader_nocom()]

In [11]:
# Fit TF-IDF weights on the low-comment bag-of-words corpus, normalizing
# each document vector.
tfidf = gensim.models.TfidfModel(corpus=corpus_nocom, normalize=True)
# Indexing the model with a corpus applies the weighting to every document.
corpus_nocom_tfidf = tfidf[corpus_nocom]

In [37]:
# Fit a 5-topic LDA model on the TF-IDF-weighted low-comment corpus.
# This rebinds `lda_nocom`, so any model previously stored under that
# name is no longer reachable through it.
lda_nocom = gensim.models.LdaModel(
    corpus=corpus_nocom_tfidf,
    id2word=dictionary,
    num_topics=5,
    random_state=0,
)
lda_nocom.print_topics()

[(0,
  '0.006*"television" + 0.006*"tv" + 0.006*"program" + 0.003*"billions" + 0.002*"city" + 0.002*"australia" + 0.002*"trust" + 0.002*"traffic" + 0.002*"upper" + 0.002*"inc"'),
 (1,
  '0.005*"estate" + 0.005*"real" + 0.005*"housing" + 0.005*"residential" + 0.004*"nyc" + 0.004*"program" + 0.004*"tv" + 0.003*"travel" + 0.003*"vacations" + 0.003*"television"'),
 (2,
  '0.004*"theater" + 0.004*"books" + 0.004*"literature" + 0.004*"puzzles" + 0.004*"crossword" + 0.003*"rights" + 0.003*"play" + 0.002*"inc" + 0.002*"jr" + 0.002*"civil"'),
 (3,
  '0.003*"education" + 0.003*"life" + 0.003*"city" + 0.003*"real" + 0.003*"estate" + 0.003*"housing" + 0.003*"program" + 0.003*"tv" + 0.003*"family" + 0.002*"families"'),
 (4,
  '0.007*"puzzles" + 0.006*"crossword" + 0.003*"international" + 0.003*"trade" + 0.003*"china" + 0.002*"world" + 0.002*"market" + 0.002*"france" + 0.002*"theater" + 0.002*"vacations"')]

# TFIDF for high-comment articles

In [40]:
def chapter_reader():
    """Stream the high-comment keyword documents as filtered token generators.

    For every value in ``df_com['keywords']`` this yields a generator over
    the tokens produced by ``gensim.utils.tokenize`` (lowercase=True,
    deacc=True, errors="ignore"), leaving out any token that is in the
    global ``stopword`` set.
    """
    for keyword_text in df_com['keywords']:
        raw_tokens = gensim.utils.tokenize(
            keyword_text, lowercase=True, deacc=True, errors="ignore"
        )
        yield (word for word in raw_tokens if word not in stopword)

In [41]:
# Build the vocabulary and bag-of-words corpus for the high-comment
# articles; these feed the TF-IDF model.
dictionary = gensim.corpora.Dictionary(chapter_reader())
# Same pruning as before: drop tokens in more than 80% of documents and
# cap the vocabulary at 100000 entries (no_below=1 removes nothing).
dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=100000)
# The reader is called a second time because its generators are single-use.
corpus = [dictionary.doc2bow(tokens) for tokens in chapter_reader()]

In [42]:
# Fit TF-IDF weights on the high-comment bag-of-words corpus, normalizing
# each document vector.
tfidf = gensim.models.TfidfModel(corpus=corpus, normalize=True)
# Indexing the model with a corpus applies the weighting to every document.
corpus_tfidf = tfidf[corpus]

In [49]:
# Fit a 5-topic LDA model on the TF-IDF-weighted high-comment corpus;
# random_state pins the seed used for this run.
lda = gensim.models.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=5,
    random_state=4,
)
lda.print_topics()

[(0,
  '0.004*"party" + 0.004*"republican" + 0.003*"defense" + 0.003*"military" + 0.003*"forces" + 0.003*"iii" + 0.003*"elections" + 0.003*"intelligence" + 0.003*"robert" + 0.003*"rights"'),
 (1,
  '0.006*"puzzles" + 0.006*"crossword" + 0.004*"elections" + 0.003*"interference" + 0.003*"russian" + 0.003*"associates" + 0.003*"ties" + 0.003*"syria" + 0.003*"inc" + 0.003*"cohen"'),
 (2,
  '0.006*"immigration" + 0.003*"ryan" + 0.003*"representatives" + 0.003*"international" + 0.003*"house" + 0.003*"paul" + 0.003*"illegal" + 0.003*"party" + 0.003*"emigration" + 0.003*"jr"'),
 (3,
  '0.005*"elections" + 0.005*"crossword" + 0.005*"puzzles" + 0.004*"international" + 0.004*"trade" + 0.003*"party" + 0.003*"book" + 0.003*"house" + 0.003*"inc" + 0.003*"market"'),
 (4,
  '0.004*"international" + 0.003*"federal" + 0.003*"relations" + 0.003*"news" + 0.003*"syria" + 0.003*"defense" + 0.003*"media" + 0.003*"elections" + 0.003*"trade" + 0.003*"e"')]