# Clown Legend Cascade of 2016: Topics

With thanks to [Usman Malik][].

[Usman Malik]: https://stackabuse.com/python-for-nlp-topic-modeling/

In [1]:
%matplotlib inline

In [None]:
# =-=-=-=-=-=-=-=-=-=-=
# IMPORTS & FUNCTIONS
# =-=-=-=-=-=-=-=-=-=-= 

import pandas, re
from nltk.tokenize import WhitespaceTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def string_test(s):
    if s is None:
        return ''
    else:
        return str(s)

In [2]:
# =-=-=-=-=-=-=-=-=-=-=
# DATA
# =-=-=-=-=-=-=-=-=-=-=

df = pandas.read_csv('./clowns_3.csv')

all_texts = df.Text.tolist()
news = df[df["Origin"] == "News Report"].Origin.tolist()
social = df[df["Origin"] == "Social Media"].Origin.tolist()

# Check for string (in case of nan)
all_strings = [ string_test(text) for text in texts ]
news_strings = [string_test(i) for i in news]
social_strings = [string_test(i) for i in social]

# Eliminate carriage returns
legends = []
for string in strings:
    string = string.replace(u'\xa0', u' ')
    legends.append(string)

# # TOKENIZE -- not needed for Sci-Kit Learn
# tokenizer = WhitespaceTokenizer()
# tokenized = []
# for legend in legends:   
#     raw = re.sub(r"[^\w\d'\s]+",'', legend).lower()
#     tokens = tokenizer.tokenize(raw)
#     tokenized.append(tokens)

In [15]:
count_vect = CountVectorizer(min_df=2, stop_words='english')
doc_term_matrix = count_vect.fit_transform(legends)

In [16]:
doc_term_matrix

<182x4032 sparse matrix of type '<class 'numpy.int64'>'
	with 31101 stored elements in Compressed Sparse Row format>

In [22]:
LDA = LatentDirichletAllocation(n_components=10, random_state=42)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [24]:
for i,topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['wasn', 'going', 'let', 'said', 'don', 'time', 'didn', 'like', 'clown', 'just']


Top 10 words for topic #1:
['people', 'students', 'threats', 'high', 'county', 'police', 'clowns', 'said', 'school', 'clown']


Top 10 words for topic #2:
['people', 'threat', 'schools', 'reports', 'threats', 'clowns', 'clown', 'said', 'school', 'police']


Top 10 words for topic #3:
['carolina', 'reported', 'told', 'greenville', 'children', 'woods', 'clowns', 'said', 'police', 'clown']


Top 10 words for topic #4:
['urging', 'rn', 'yo', 'firearm', 'precautions', 'minds', 'responsibility', 'lookout', 'suburb', 'learning']


Top 10 words for topic #5:
['children', 'year', 'sightings', 'says', 'reports', 'halloween', 'people', 'creepy', 'clowns', 'clown']


Top 10 words for topic #6:
['media', 'threats', 'like', 'police', 'sightings', 'creepy', 'said', 'people', 'clowns', 'clown']


Top 10 words for topic #7:
['people', 'permalinkembedsavereportgive', 'just', 'point', 'clown', 'p

In [25]:
topic_values = LDA.transform(doc_term_matrix)
topic_values.shape

(182, 10)

In [26]:
df['Topic'] = topic_values.argmax(axis=1)

In [27]:
df.head(10)

Unnamed: 0,Title,Date,Author,Origin,URL,Text,Topic
0,Clown Attack on Woman Forces Cincinnati Suburb...,September 30 2016,,News Report,http://insider.foxnews.com/2016/09/30/clown-at...,An Ohio school district closed schools today a...,2
1,Another Clown Was Spotted In The Woods And Pol...,September 6 2016,Michelle Broder Van Dyke,News Report,https://www.buzzfeed.com/mbvd/stop-clowning-ar...,The latest clown to be spotted was chased back...,3
2,Everyone in poor moiuntain please stay inside....,September 13 2016,Melissa Dooley,Facebook,https://www.facebook.com/melissa.dooley.397/po...,I don't know if this is real or fake. I didn't...,7
3,He's the hero this country deserves,October 13 2016,The LAD Bible,Facebook,https://www.facebook.com/LADbible/videos/29391...,"Batman, ""As for you clowns, if you want to sca...",0
4,After-dark clown sightings trouble California ...,October 13 2014,"The Associated Press , WBIR",News Report,http://www.wbir.com/news/after-dark-clown-sigh...,"BAKERSFIELD, California (AP) — People dressed ...",8
5,The tears of a (real) clown: All the insane cl...,October 30 2016,Ellen McCarthy,News Report,https://www.washingtonpost.com/lifestyle/the-t...,"Listen, punks. You think you can slap on some ...",6
6,Target halts clown-mask sales amid creepy-clow...,October 17 2016,Amy B Wang,News Report,https://www.washingtonpost.com/news/arts-and-e...,Send out the clowns.\n\nTarget will stop selli...,6
7,Creepy clowns spook a country already freaked ...,September 29 2016,Petula Dvorak,News Report,https://www.washingtonpost.com/local/creepy-cl...,Turns out that people with a lot of face makeu...,6
8,No laughing matter: Clown sightings have polic...,September 24 2016,WPCO Staff,News Report,http://www.wcpo.com/news/state/state-kentucky/...,Your worst nightmare may be coming true in Ken...,8
9,Clown sightings creep into southern Kentucky,September 19 2016,WKYT Staff,News Report,http://www.wkyt.com/content/news/Clown-sightin...,"LONDON, Ky. (WKYT) - After several sighting in...",5


## NMF

It is possible to call the texts straight from the dataframe, with `df['Text'].values.astype('U')` replacing `legends` as in:

```
doc_term_matrix = tfidf_vect.fit_transform(df['Text'].values.astype('U'))
```

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(min_df=2, stop_words='english')
doc_term_matrix = tfidf_vect.fit_transform(legends)

In [30]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=10)
nmf.fit(doc_term_matrix )

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=10, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [47]:
print(f'Top 10 words per topic:\n')
for i,topic in enumerate(nmf.components_):
    print(f'#{i}:', end=" ")
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])

Top 10 words per topic:

#0: ['sightings', 'king', 'scary', 'like', 'hysteria', 'says', 'creepy', 'people', 'clowns', 'clown']
#1: ['clown', 'clowns', 'just', 'point', 'permalinkembedsaveparentreportgive', 'permalinkembedsavereportgive', 'points', 'ago', 'months', 'goldreply']
#2: ['apartments', 'said', 'complex', 'children', 'manor', 'clowns', 'apartment', 'fleetwood', 'woods', 'greenville']
#3: ['district', 'said', 'clown', 'students', 'high', 'police', 'threat', 'schools', 'threats', 'school']
#4: ['hunt', 'twitter', 'clowns', 'college', 'campus', 'clown', 'nelson', 'students', 'state', 'penn']
#5: ['winston', 'salem', 'red', 'north', 'children', 'woods', 'greensboro', 'police', 'carolina', 'clown']
#6: ['think', 'jamie', 'ky', 'month', 'deputies', 'clown', 'picture', 'didn', 'hill', 'london']
#7: ['know', 'didn', 'dude', 'don', 'fucking', 'went', 'house', 'just', 'said', 'henry']
#8: ['boy', 'incident', 'columbus', 'reports', 'tuesday', 'wearing', 'mask', 'clown', 'said', 'police']