# LDA Topic Modeling on Named Entities

In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [2]:
df = pd.read_csv('Cleaned_Data.csv')

In [60]:
df_sub = df[['content','publication','sentiments']]

In [61]:
df_sub = df_sub.sample(frac= .1).reset_index()

In [67]:
print(len(df_sub))
print(len(df))

13510
135097


### Computing NLP annotations not possible for data of this size, so just extracting entities for a random sample

In [68]:
df_sub.entities = df_sub.content.progress_apply(lambda x: nlp(x).ents)

100%|██████████████████████████████████| 13510/13510 [1:51:29<00:00,  2.45it/s]


In [159]:
df_sub['entities'] = df_sub.entities

In [160]:
df_sub.head()

Unnamed: 0,index,content,publication,sentiments,entities
0,97145,The Japanese air bag manufacturer Takata has r...,NPR,-0.9791,"((Japanese), (Takata), ($, 1, billion), (the, ..."
1,72518,Can’t imagine pitying the bank account of an N...,New York Post,0.9792,"((NFL), (’s), (the, Super, Bowl), (a, few, yea..."
2,82968,"With summer nearing, my thoughts turn to the C...",New York Post,0.9849,"((summer), (the, Clam, Bar), (Montauk, Highway..."
3,110360,"Last week, President Donald J. Trump chose th...",Reuters,-0.9975,"((Last, week), (Donald, J., Trump), (U., S.), ..."
4,99964,The Marine Corps has released a recruiting ad ...,NPR,-0.7496,"((The, Marine, Corps), (first), (Marine), (Cor..."


In [161]:
df_sub['entities'] = df_sub['entities'].progress_apply(lambda x: list(x))

100%|█████████████████████████████████| 13510/13510 [00:00<00:00, 22343.38it/s]


In [162]:
df_sub['entities'] = df_sub['entities'].progress_apply(lambda x: [str(word) for word in x])

100%|███████████████████████████████████| 13510/13510 [00:42<00:00, 315.23it/s]


In [163]:
for x in range(len(df_sub['entities'])):
    good = []
    for word in df_sub['entities'][x]:
        word = str(word)
        word = word.lower()
        if word not in ['one','two','three','four','five','six','seven','eight','nine','ten']:
            if word.isdigit() == False:
                word = word.strip()
                if len(str(word)) > 3:
                    good += [word]

    if x%1000 == 0:
        print(x)

    df_sub['entities'][x] = good

0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000


In [170]:
def hasNumbers(inputString):
...     return any(char.isdigit() for char in inputString)

In [173]:
for x in range(len(df_sub['entities'])):
    good = []
    for word in df_sub['entities'][x]:
        if hasNumbers(word) == False:
            good += [word]
    if x%1000 == 0:
        print(x)
    df_sub['entities'][x] = good

0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000


In [174]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [175]:
def dummy(doc):
    return doc

In [176]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [179]:
def pyLDA(pub, num_topics=20):
    tf_vectorizer = CountVectorizer(max_df = 0.8, min_df = 2, tokenizer=dummy,preprocessor=dummy)
    dtm_tf = tf_vectorizer.fit_transform(pub)
    lda_tf = LatentDirichletAllocation(num_topics, batch_size = 200, evaluate_every=-1, learning_method='online', learning_offset = 10.0)
    lda_tf.fit(dtm_tf)
    return pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='mmds')

## Recommend lowering relevance metric to 0.2 - 0.5

In [180]:
pyLDA(df_sub['entities'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
