In [1]:
import nltk
import pickle
import regex as re
import numpy as np
import pandas as pd

from string import punctuation, digits
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import RegexpParser, Tree
from nltk.util import ngrams
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

normalizer = WordNetLemmatizer()

def get_part_of_speech(word):
  probable_part_of_speech = nltk.corpus.wordnet.synsets(word)
  pos_counts = Counter()
  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

def preprocess_text(text):
  cleaned = re.sub(r'\W+', ' ', text).lower()
  cleaned = re.sub('/', ' ', cleaned)
  cleaned = re.sub('-', ' ', cleaned)
  cleaned = re.sub('_', ' ', cleaned)
  cleaned = re.sub('\]\[', ' ', cleaned)
  cleaned = re.sub('…', '', cleaned)
  cleaned = re.sub('[%s]' % re.escape(punctuation), '', cleaned)
  cleaned = re.sub('[%s]' % re.escape(digits), '', cleaned)
  tokenized = nltk.word_tokenize(cleaned)
  normalized = " ".join([normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized if token not in stop_words])
  return normalized

def text_to_bow(some_text):
    bow_dictionary = {}
    for text in some_text:
        if text in bow_dictionary:
            bow_dictionary[text] += 1
        else:
            bow_dictionary[text] = 1
    return bow_dictionary


stop_words = set(stopwords.words("english"))

### 'Q' is treated as a stopword, since it is used in the text as a signature, and would only obscure our topics later on. The frequency of its appearance doesn't tell us much. 

In [2]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [3]:
os.getcwd()

'/Users/kylereaves/Documents/GitHub/parsing_madness'

In [4]:
replacement_dict = {' w ': 'with',
 '&': 'and',
 'AUTH': 'authorization',
 'BRENNAN': 'John Brennan',
 'COVID19': 'covid',
 'C19': 'covid',
 'D': 'Democrats',
 'D.': 'Democrat.',
 'GOOG': 'Google',
 "Gov't": 'government',
 'HUSSEIN': 'Barack Obama',
 'Hussein': 'Barack Obama',
 'ID': 'identification',
 'NK': 'North Korea',
 'KERRY': 'John Kerry',
 "M's": 'marshalls',
 "M’s": 'marshalls',
 'MI': 'military intelligence',
 "_Hussein's": "Obama's",
 'MERKEL': 'Angela Merkel',
 'MS-13': 'ms thirteen',
 'MSM': 'mainstream media',
 'No Such Agency': 'NSA',
 'Russia>D': 'Russia Democrats',
 'SA': 'Saudi Arabia',
 'U S Gov t': 'United States government',
 'U.S.': 'United States',
 'US': 'United States',
 'US-owned': 'United States owned',
 '[D]': 'Democratic',
 'comms': 'communications',
 "d's": 'democrats',
 'SC': 'Supreme Court',
 'NG': 'National Gaurd',
 'ds': 'democrats',
 'gov.': 'government',
 'financial T': 'financial transactions',
 'r v d': 'republicans vs democrats',
 'R': 'Republican',
 "r's": 'republicans',
 're': 'regarding',
 'rs': 'republicans',
 'v': 'versus',
 'v2': 'version two',
 '[N]othing': 'Nothing',
 '[C]an': 'Can',
 '[S]top': 'Stop',
 '[W]hat': 'What',
 '[I]s': 'Is',
 '[C]oming': 'Coming',
 'w/': 'with'}

In [5]:
%%time 
with open('drop_contents.pkl', 'rb') as f:
    drop_df = pickle.load(f)

CPU times: user 4.96 s, sys: 280 ms, total: 5.24 s
Wall time: 5.29 s


In [6]:
exploded_df = pd.DataFrame(drop_df.drop_contents.explode())
exploded_df.reset_index(inplace=True)
exploded_df.rename(columns={'index': 'drop_number'}, inplace=True)
exploded_df['drop_contents'] = exploded_df['drop_contents'].astype('str')
exploded_df.rename(columns={'drop_number': 'number', 'drop_contents': 'contents'}, inplace=True)

In [9]:
exploded_df.drop(exploded_df[exploded_df.contents == 'Q'].index, inplace=True)
exploded_df.drop(exploded_df[exploded_df.contents.str.contains('https')].index, inplace=True)
exploded_df.drop(exploded_df[exploded_df.contents.str.contains('www')].index, inplace=True)
exploded_df.drop(exploded_df[exploded_df.contents.str.contains('http')].index, inplace=True)
exploded_df.reset_index(drop=True, inplace=True)

In [53]:
exploded_df.groupby('number')['contents']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x133232e80>

In [11]:
drop_list = [item for item in drop_df['drop_contents'].explode().tolist() if item != 'Q']

In [12]:
def parse():
    inner = []
    for item in drop_list:
        if item is np.nan:
            continue
        else:
            inner.append(item.split())
    return inner

In [13]:
def swap():
    inner = []
    for nested_list in [line.split() for line in no_https]:
        for item in nested_list:
            if item in replacement_dict.keys():
                item = replacement_dict.get(item)
            inner.append(item)
    return ' '.join(inner)

In [14]:
uncleaned = [i for i in drop_list if i is not float and i is not np.nan]

In [61]:
def split():
    inner = []
    for item in uncleaned[-5].split():
        if item in replacement_dict:
            item = replacement_dict.get(item)
        inner.append(item)
    joined = ' '.join(inner)
    return joined

In [16]:
no_https = [line for line in uncleaned if not line.startswith('http') and not line.startswith('www')]

In [17]:
len(nltk.sent_tokenize(swap()))

22046

In [18]:
len(uncleaned)

33680

In [50]:
processed = [preprocess_text(line) for line in nltk.sent_tokenize(swap())]

In [51]:
processed_2x = [line for line in processed if line and line != 'q']

In [23]:
counter = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True)
matrix = counter.fit_transform(processed_2x)
features = counter.get_feature_names()
freq_df = pd.DataFrame(matrix.T.todense(), index=features)

In [25]:
NUM_TOPICS = 10

vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True)
data_vectorized = vectorizer.fit_transform(processed_2x)

lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)

nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)

lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
        for i in topic.argsort()[:-top_n - 1:-1]])



In [26]:
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('control', 574.2427645468977), ('power', 338.3684717373525), ('public', 276.24734111985975), ('world', 245.77370262129702), ('narrative', 236.39307390151), ('twitter', 217.80173874125762), ('saudi', 210.21334393334828), ('arabia', 205.10459361085688), ('define', 185.30617924349767), ('event', 152.24506262945732)]
Topic 1:
[('people', 635.9224299427209), ('past', 180.94338568892243), ('mean', 168.60423777856002), ('com', 151.55897279639476), ('push', 147.83491362103854), ('session', 147.64692073452576), ('ask', 142.63970040811083), ('divide', 133.218608371683), ('conspiracy', 119.6574247738767), ('win', 117.83390152710842)]
Topic 2:
[('state', 715.8154774077325), ('unite', 536.309805725481), ('stand', 181.51423289532394), ('year', 176.54376214560924), ('release', 148.740531215453), ('money', 147.06078579498012), ('central', 146.3867203340498), ('america', 139.16353853583408), ('target', 131.6970108170917), ('want', 128.82972037973786)]
Topic 3:
[('bank', 248.049659

In [27]:
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

LSI Model:
Topic 0:
[('bank', 0.86868684663398), ('central', 0.4471905846357985), ('national', 0.11816108094003672), ('state', 0.11093215248203561), ('republic', 0.07944782910382073), ('reserve', 0.06106599838767813), ('west', 0.045553307792222844), ('authority', 0.04313205364036145), ('new', 0.021701156945097267), ('unite', 0.021500736180669663)]
Topic 1:
[('republican', 0.7935404325704407), ('state', 0.3055639521538906), ('democrat', 0.30269972259363603), ('unite', 0.290468073608149), ('house', 0.19648584096281904), ('john', 0.08522188196539798), ('freedom', 0.07689499079446668), ('senate', 0.06956600932532613), ('bob', 0.04869925341119732), ('yes', 0.048458284199507054)]
Topic 2:
[('freedom', 0.6980463406919409), ('yes', 0.4424769897868514), ('conf', 0.348712219288241), ('ex', 0.32636006892017316), ('stand', 0.2607049094650494), ('change', 0.047624143773137506), ('leadership', 0.046349298798787845), ('red', 0.03497956933099308), ('mod', 0.03095162008265343), ('good', 0.0206653020495

In [28]:
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)

NMF Model:
Topic 0:
[('bank', 10.251877714221937), ('central', 5.27791108955629), ('national', 1.385554530223205), ('state', 1.1044476329442738), ('republic', 0.9371318026843228), ('reserve', 0.7165100772050804), ('west', 0.5376965740686608), ('authority', 0.5084865237929521), ('new', 0.2484751305093545), ('canada', 0.23866521883975023)]
Topic 1:
[('republican', 7.083265851483288), ('democrat', 2.6888772068958913), ('house', 1.6207386792197804), ('john', 0.6808518221203896), ('unite', 0.661383424313702), ('senate', 0.5257028925852318), ('state', 0.4764339838000655), ('bob', 0.4383083943884079), ('ryan', 0.3608308805130569), ('jr', 0.3429675365667554)]
Topic 2:
[('freedom', 5.567682963736523), ('yes', 3.529803802856973), ('conf', 2.7816764041255517), ('ex', 2.6036069731767126), ('stand', 2.0792841171055594), ('change', 0.3789708415762283), ('leadership', 0.369671486160153), ('red', 0.28264082863107687), ('mod', 0.2465748246635706), ('good', 0.16367635622706608)]
Topic 3:
[('ceo', 5.8097

In [39]:
tfidf_vec = TfidfVectorizer(min_df=5, max_df=0.9)

tfidf_matrix = tfidf_vec.fit_transform(processed_2x)
tfidf_features = tfidf_vec.get_feature_names()

tfidf_df = pd.DataFrame(tfidf_matrix.T.todense(), index=tfidf_features)

In [40]:
tfidf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21629,21630,21631,21632,21633,21634,21635,21636,21637,21638
ab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abedin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ability,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
able,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
young,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
youtube,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
len(tfidf_features)

2901

In [57]:
uncleaned[2].split()

['[N]othing', '[C]an', '[S]top', '[W]hat', '[I]s', '[C]oming']

In [70]:
uncleaned[-5].split()

['extradition',
 'already',
 'in',
 'motion',
 'effective',
 'yesterday',
 'with',
 'several',
 'countries',
 'in',
 'case',
 'of',
 'cross',
 'border',
 'run.',
 'Passport',
 'approved',
 'to',
 'be',
 'flagged',
 'effective',
 '10/30',
 '@',
 '12:01am.',
 'Expect',
 'massive',
 'riots',
 'organized',
 'in',
 'defiance',
 'and',
 'others',
 'fleeing',
 'the',
 'US',
 'to',
 'occur.',
 'US',
 'M’s',
 'will',
 'conduct',
 'the',
 'operation',
 'while']

In [75]:
split()

'extradition already in motion effective yesterday with several countries in case of cross border run. Passport approved to be flagged effective 10/30 @ 12:01am. Expect massive riots organized in defiance and others fleeing the United States to occur. United States marshalls will conduct the operation while'

In [33]:
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(tfidf_matrix.T)

df = pd.DataFrame(columns=['x', 'y', 'document'])
df['x'] = documents_2d[:,0]
df['y'] = documents_2d[:,1]
df['document'] = tfidf_features

source = ColumnDataSource(ColumnDataSource.from_df((df)))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                    text_font_size='8pt', text_color="#555555",
                    source=source, text_align='center')

plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)