# Parsing Madness:
## Analyzing linguistic and thematic patterns in QAnon 'drops'
## An exercise in web scraping, dataframe creation and management, and natural language processing
##### beautiful soup -> pandas dataframe
##### request -> soup -> imgs and text -> pandas dataframe
##### text -> cleaned -> tokenized -> remove stop words -> features dictionary

In [1]:
import matplotlib.pyplot as plt
import matplotlib.pyplot as mpim
import requests
import itertools
import re
import string 
import nltk
import os
import shutil
import urllib.request
import pandas as pd
import numpy as np
import datetime as dt
import pickle
import networkx as nx

from string import digits
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from bs4 import BeautifulSoup
from itertools import islice, zip_longest 
from skimage import io
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import RegexpParser, Tree
from nltk.util import ngrams
from urllib.request import Request, urlopen
from IPython.display import Image
from collections import Counter

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# ASCII punctuation extended with typographic quotes, the en dash and the ellipsis.
punct = ''.join([string.punctuation, '’', '“', '”', '‘', '–', '…'])

# Shared WordNet lemmatizer instance (used by preprocess_text).
normalizer = WordNetLemmatizer()

def get_part_of_speech(word):
    """Return the WordNet POS tag most frequent among the synsets of *word*.

    Only the tags 'n', 'v', 'a' and 'r' are tallied; synsets carrying any
    other tag are ignored. The tally is seeded with zero for each tag in
    that order, so a word without synsets still yields a tag.
    """
    tally = Counter({"n": 0, "v": 0, "a": 0, "r": 0})
    for synset in nltk.corpus.wordnet.synsets(word):
        tag = synset.pos()
        if tag in tally:
            tally[tag] += 1
    return tally.most_common(1)[0][0]

def preprocess_text(text):
    """Normalise *text* into a space-separated string of lemmas.

    Runs of non-word characters are replaced by a single space and the
    result is lower-cased, then tokenized with NLTK. Tokens found in
    ``stop_words`` are dropped; every other token is lemmatized with the
    part of speech returned by ``get_part_of_speech``.
    """
    lowered = re.sub(r'\W+', ' ', text).lower()
    lemmas = []
    for token in nltk.word_tokenize(lowered):
        if token in stop_words:
            continue
        lemmas.append(normalizer.lemmatize(token, get_part_of_speech(token)))
    return " ".join(lemmas)

def text_to_bow(some_text):
    """Build a bag-of-words dictionary from an iterable of tokens.

    Returns a plain ``dict`` mapping each distinct item of *some_text* to
    the number of times it occurs, with keys in first-seen order.
    """
    counts = {}
    for token in some_text:
        counts[token] = counts.get(token, 0) + 1
    return counts

In [2]:
os.getcwd()

'/Users/kylereaves/Documents/GitHub/parsing_madness/.ipynb_checkpoints'

In [3]:
os.chdir('/Users/kylereaves/Documents/GitHub/parsing_madness/')

In [4]:
os.getcwd()

'/Users/kylereaves/Documents/GitHub/parsing_madness'

In [5]:
# NOTE: unpickling can execute arbitrary code; only load pickles you produced yourself.
with open('drops.pickle', 'rb') as pickle_file:
    drops_pickled = pickle.load(pickle_file)

# Both searches run over the string form of the whole unpickled object.
drops_as_text = str(drops_pickled)

# qposts-hosted image URLs ending in jpg/jpeg/png.
images_from_pickle = re.compile(r'https://qposts\S+(?:jpg|jpeg|png)').findall(drops_as_text)
# Every other http(s) link, i.e. those not followed by an image extension.
links_from_pickle = re.compile(r'\bhttps?://(?!\S+(?:jpe?g|png))\S+').findall(drops_as_text)

In [8]:
# Reduce every pickled drop to one lower-cased string made of its prose lines.
# Lines that start with 'https' or '>' (which also covers '>>'), and the bare
# 'Q' signature line, are skipped entirely.
# Assumes each drop is an iterable of str lines -- TODO confirm against the pickle.

# Embedded links. The previous patterns (r'https?\w+', r'www\w+') stopped at the
# first ':' or '.', so they only removed the scheme and left '://...' behind.
_EMBEDDED_URL_RE = re.compile(r'https?://\S+|www\.\S+')
# Each digit, together with a directly following 'th', becomes a space.
_DIGIT_RE = re.compile(r'\d(th)?')

pickled_only_text = []
for drop in drops_pickled:
    pieces = []
    for line in drop:
        if line == 'Q' or line.startswith(('https', '>')):
            continue
        line = _EMBEDDED_URL_RE.sub(' ', line)
        line = _DIGIT_RE.sub(' ', line)
        # split()/join collapses newlines and runs of whitespace to single spaces.
        line = ' '.join(line.lower().split())
        if line:
            pieces.append(line)
    # Join with a space: plain concatenation fused the last word of one line
    # with the first word of the next.
    pickled_only_text.append(' '.join(pieces))

In [6]:
q_df = pd.read_csv('q_drops.txt')

In [7]:
q_df

Unnamed: 0,date,hour,number,q_drop
0,2020-12-08,22:05:50,4953.0,['https://www.youtube.com/watch?v=O1l-nR1Apj4'...
1,2020-11-13,05:20:55,4952.0,"['Durham.', 'Q']"
2,2020-11-13,03:20:17,4951.0,"['Shall we play a game?', '[N]othing [C]an [S]..."
3,2020-11-13,02:32:39,4950.0,"['Nothing can stop what is coming.', 'Nothing!..."
4,2020-11-03,06:27:36,4949.0,['https://www.youtube.com/watch?v=9tjdswqGGVg&...
...,...,...,...,...
4948,2017-10-29,16:47:18,5.0,"['Follow the money, it’s the key. What is Pelo..."
4949,2017-10-29,16:11:40,4.0,"['Some of us come here to drop crumbs, just cr..."
4950,2017-10-29,15:47:07,3.0,['Open your eyes. It finally came out that Rod...
4951,2017-10-28,22:15:48,2.0,"['Mockingbird HRC detained, not arrested (yet..."


In [9]:
# Clean-up rules applied to every raw drop string, strictly in this order
# (later rules see the output of earlier ones). All patterns are raw strings:
# the former non-raw literals such as '\[' or '\?' are invalid escape sequences,
# which Python 3.12+ reports as SyntaxWarning.
# NOTE(review): '[png$]' and '[\d$]' are character classes, not "ends with png"
# / "ends with a digit" anchors; they are kept unchanged to preserve the output.
_DROP_CLEANUP_RULES = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'https?\S+[\w$]', ''),
        (r'>>\S+[png$]', ''),
        (r'>>\S+[\d$]', ''),
        (r'\[', ''),
        (r'\]', ''),
        (r'/', ' '),
        (r'\?', ''),
        (r'\(', ''),
        (r'\)', ''),
        (r'\!', ''),
        (r'>', ''),
        (r'…', ''),
        (r'\u2002\u2002', ' '),
        (r'\.', ''),
        (r'\*', ''),
        (r'_', ' '),
        (r'C19', 'covid'),
        (r'\$', ''),
        (r'\d\S+', ''),
    )
]


def _clean_drop(text):
    """Apply every clean-up rule to *text* in order and lower-case the result."""
    for pattern, replacement in _DROP_CLEANUP_RULES:
        text = pattern.sub(replacement, text)
    return text.lower()


# Iterate the column directly instead of indexing by position-as-label.
outter_list = [_clean_drop(text) for text in q_df.q_drop]

In [16]:
def _strip_links_and_punct(sentence):
    """Lower-case *sentence*, remove http(s) links, trim it, then delete punctuation."""
    without_links = re.sub(r'https?\S+', '', sentence.lower())
    return re.sub(r'[%s]' % re.escape(punct), '', without_links.strip())


# One list of cleaned sentences per drop.
q_drop_list = []
for raw_drop in q_df.q_drop:
    q_drop_list.append(
        [_strip_links_and_punct(sentence) for sentence in nltk.sent_tokenize(raw_drop)]
    )

In [28]:
q_drop_list

AttributeError: 'list' object has no attribute 'strip'

In [24]:
# Flatten the per-drop sentence lists, dropping the bare 'q' signature
# and trimming surrounding whitespace from everything that is kept.
edited_list = [
    sentence.strip()
    for sentences in q_drop_list
    for sentence in sentences
    if sentence != 'q'
]

In [30]:
[item for item in edited_list if item != 'q' and item]

['durham',
 'shall we play a game',
 'nothing can stop what is coming ncswic  who stepped down today forced',
 'more coming',
 'why is this relevant',
 'how do you show the public the truth',
 'how do you safeguard us elections postpotus',
 'how do you remove foreign interference and corruption and install usowned voter id laws and other safeguards',
 'it had to be this way',
 'sometimes you must walk through the darkness before you see the light',
 'nothing can stop what is coming',
 'nothing',
 'that this nation under god shall have a new birth of freedom  and that government of the people by the people for the people shall not perish from the earth',
 'abraham lincoln nov 1863  together we win',
 'are you ready to take back control of this country',
 'are you ready to hold the political elite protected accountable',
 'are you ready to finish what we started',
 'nothing can stop what is coming is not just a catchphrase',
 'fact checkers created in effort to reinforce propaganda diges

In [225]:
# Tokenise the first element of drops_pickled[-50], one text line at a time.
# Words starting with '>', '-', '@', 'http' or a digit are skipped; the rest are
# lower-cased and stripped of the characters ) ( ? ^ / ! _ .
# str.translate replaces the chain of re.sub calls whose non-raw patterns
# ('\)', '\/', ...) were invalid escape sequences (SyntaxWarning on Python 3.12+).
# NOTE(review): `string` rebinds the name of the imported `string` module; the
# name is kept because the following cell reads `string[0]`.
_UNWANTED_CHARS = str.maketrans('', '', ')(?^/!_')
_SKIPPED_PREFIXES = ('>', '-', '@', 'http')

for item in drops_pickled[-50][0].splitlines():
    tokens = []
    for word in item.split():
        if word.startswith(_SKIPPED_PREFIXES) or word[0] in digits:
            continue
        text = word.lower().translate(_UNWANTED_CHARS)
        if text.startswith('.'):
            continue
        tokens.append(text.strip('.').strip(','))
    # Rebuilt for every line, so after the loop it describes the last line only.
    string = [' '.join(tokens)]


    #print(sentences)

In [226]:
' '.join(string[0].split())

'who did potus meet with yesterday was ag sessions there how many mi generals were on the wh list to attend a separate meeting could those meetings have been combined why were certain rooms in the wh renovated where was the meeting on monday why aren’t phones allowed in this room one of many what firm was contracted to conduct the renovations'

In [205]:
# Print the third drop line by line, lower-cased, without square brackets and
# without any space-delimited chunk that contains an inner '.'.
for raw_line in q_df.q_drop[2].splitlines():
    without_dotted = re.sub(r'[^ ]+\.[^ ]+', '', raw_line)
    sentence = without_dotted.lower().replace('[', '').replace(']', '')
    print(sentence)

'shall we play a game?', 'nothing can stop what is coming', 'ncswic',  'who stepped down today forced?',  'more coming?', 'why is this relevant?', "how do you 'show' the public the truth?", "how do you 'safeguard' us elections post-potus?", "how do you 'remove' foreign interference and corruption and install us-owned voter id law(s) and other safeguards? ", 'it had to be this  'sometimes you must walk through the darkness before you see the light. ', 'q'


In [140]:
with open('cleaned_text.pickle', 'rb') as f:
    cleaned_text = pickle.load(f)

In [12]:
# Re-join every cleaned sentence with its stop words removed.
output = [
    ' '.join(word for word in sentence.split() if word and word not in stop_words)
    for sentence in cleaned_text
]

In [13]:
q_concacted_str = '. '.join(cleaned_text)

In [13]:
stops_removed = [out for out in output if out]

NameError: name 'output' is not defined

In [12]:
# Raw term counts for every stop-word-filtered drop.
q_counter = CountVectorizer()

q_matrix = q_counter.fit_transform(stops_removed)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
q_features = (
    q_counter.get_feature_names_out()
    if hasattr(q_counter, 'get_feature_names_out')
    else q_counter.get_feature_names()
)

# Transposed so that rows are terms and columns are drops.
q_freq_df = pd.DataFrame(q_matrix.T.todense(), index=q_features)

NameError: name 'stops_removed' is not defined

In [11]:
q_freq_df.sort_values(by=q_freq_df.columns[-12063], ascending=False)

NameError: name 'q_freq_df' is not defined

In [17]:
q_tokens = [nltk.word_tokenize(text) for text in cleaned_text]
q_tagged = [nltk.pos_tag(token) for token in q_tokens]

In [18]:
# Number of components requested from each of the three models below.
NUM_TOPICS = 10

# Count matrix built from the stop-word-filtered drops; `vectorizer` is reused
# later to label topics and to transform unseen text.
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True)
data_vectorized = vectorizer.fit_transform(stops_removed)

# Latent Dirichlet Allocation; lda_Z holds the transformed drops.
# NOTE(review): no random_state is passed to any model here, so results
# presumably differ between runs -- confirm if reproducibility matters.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)

# Non-negative matrix factorisation of the same count matrix.
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)

# Truncated SVD of the same count matrix (called "LSI" in the cells below).
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)

def print_topics(model, vectorizer, top_n=10):
    """Print the *top_n* highest-weighted terms of every topic in *model*.

    Args:
        model: fitted decomposition exposing ``components_`` (one row of
            term weights per topic).
        vectorizer: fitted vectorizer whose vocabulary labels the columns of
            ``components_``.
        top_n: number of terms printed per topic, heaviest first.

    For each topic a ``Topic <idx>:`` header is printed, followed by a list
    of ``(term, weight)`` tuples.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old method on older installations.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Fetch the vocabulary once instead of rebuilding it for every printed
    # term; str() keeps the printed labels plain strings for either API.
    feature_names = [str(name) for name in get_names()]

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk the last top_n indices backwards.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

In [19]:
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('think', 475.4546334165186), ('president', 191.70922845897235), ('follow', 182.98457072148815), ('world', 168.75126259196816), ('coming', 166.6714894564681), ('states', 157.43228993456924), ('united', 154.8755938716277), ('republican', 128.9995724148232), ('divided', 127.63300078967917), ('america', 113.14632196544625)]
Topic 1:
[('public', 380.8690652836086), ('bank', 234.20650388375282), ('truth', 210.66106253692934), ('define', 188.23082224079567), ('treason', 179.16454778087927), ('prevent', 161.65519366378905), ('central', 134.9844272539878), ('hussein', 131.72471313092345), ('events', 110.20795545531416), ('light', 99.94721458298396)]
Topic 2:
[('people', 457.3037778160853), ('time', 265.76906572702495), ('god', 211.1638716062722), ('new', 187.75056948225694), ('happens', 143.2735341362532), ('political', 138.38813524825073), ('order', 123.28362814500073), ('sessions', 119.39518714153989), ('war', 105.23634383079869), ('government', 104.10890332276101)]
Topi

In [20]:
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

LSI Model:
Topic 0:
[('bank', 0.8725848340652119), ('central', 0.439200757941844), ('national', 0.12941947044732277), ('states', 0.08637449096116853), ('reserve', 0.0823807525375856), ('republic', 0.07173768042409012), ('west', 0.043642317052234286), ('authority', 0.03778848415834826), ('monetary', 0.03772641227152957), ('new', 0.03140308929340845)]
Topic 1:
[('republican', 0.8672415719140729), ('democrat', 0.31789900231747764), ('house', 0.2612278915116428), ('john', 0.0857208654366074), ('potus', 0.08195329517495725), ('senate', 0.0819473051077891), ('ryan', 0.05045518903910443), ('bob', 0.049568131357534954), ('jr', 0.04942226140841578), ('important', 0.04298433927246466)]
Topic 2:
[('potus', 0.5603981082362045), ('relevant', 0.22280830485755046), ('sa', 0.20005102784533435), ('people', 0.18773788547502435), ('good', 0.16253155255567098), ('think', 0.16242080091491978), ('news', 0.1296568050524482), ('impeach', 0.1261954525803195), ('define', 0.10654700490576882), ('power', 0.105542

In [21]:
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)

NMF Model:
Topic 0:
[('bank', 8.292930869051586), ('central', 4.1711935194198935), ('national', 1.2220240447199209), ('states', 0.8109472653665443), ('reserve', 0.782490151749852), ('republic', 0.6812023221537348), ('west', 0.4147764396559338), ('monetary', 0.3585483550779582), ('authority', 0.3575475704266016), ('new', 0.28409295807666796)]
Topic 1:
[('republican', 6.752527798849936), ('democrat', 2.475289579116315), ('house', 1.9852413482787048), ('john', 0.603710195743942), ('senate', 0.5939927502773127), ('ryan', 0.3865791957209088), ('jr', 0.3834378335107594), ('bob', 0.3824853327905929), ('gowdy', 0.28484524576086034), ('committee', 0.20422155121528932)]
Topic 2:
[('potus', 6.78198500843284), ('good', 2.9597458601105875), ('impeach', 2.7048509264995895), ('sc', 0.503022439724751), ('believe', 0.49536429427114487), ('fight', 0.48865425835711407), ('twitter', 0.4732103476847585), ('mueller', 0.45653129957994926), ('military', 0.4501600077114966), ('attack', 0.35805332030980863)]
To

In [22]:
unseen_text = 'the idea of freedom is impossible of realization because no one knows how to use it with moderation.'
x = nmf_model.transform(vectorizer.transform([unseen_text]))[0]

In [3]:
document_1 = 'It was only in the Reich itself that the "chosen ones" saw nothing of all this. As if stricken with blindness, they walked by the side of the corpse, and in the indications of decomposition they thought they detected signs of "new" life.'

document_2 = 'Out of the temporary evil we are now compelled to commit will emerge the good of an unshakable rule, which will restore the regular course of the machinery of the national life, brought to naught by liberalism. The result justifies the means. Let us, however, in our plans, direct our attention not so much to what is good and moral as to what is necessary and useful.'

document_3 = 'Just as Nature concentrates, not on safeguarding that which exists, but on breeding the coming generation as the representative of the species, so in human life it is less a question of artificially cultivating the existing evils which, human nature being what it is, would be ninety-nine per cent impossible, but rather to assure healthier paths for future development from the start.'

corpus = [document_1, document_2, document_3]

# Lemmatized, stop-word-free version of each document.
processed_corpus = [preprocess_text(doc) for doc in corpus]

# norm=None keeps the raw tf-idf products instead of length-normalised rows.
tfidf_vectorizer = TfidfVectorizer(norm=None)

tf_idf_scores = tfidf_vectorizer.fit_transform(processed_corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
feature_names = (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)
# Each column is labelled with the processed text of its document.
corpus_index = list(processed_corpus)

# Transposed so that rows are terms and columns are documents.
df_tf_idf = pd.DataFrame(tf_idf_scores.T.todense(), index=feature_names, columns=corpus_index)

NameError: name 'preprocess_text' is not defined

In [24]:
df_tf_idf

Unnamed: 0,reich chosen one saw nothing stricken blindness walk side corpse indication decomposition think detect sign new life,temporary evil compel commit emerge good unshakable rule restore regular course machinery national life bring naught liberalism result justify mean let u however plan direct attention much good moral necessary useful,nature concentrate safeguard exist breeding come generation representative specie human life less question artificially cultivate exist evil human nature would ninety nine per cent impossible rather assure healthy path future development start
artificially,0.000000,0.000000,1.693147
assure,0.000000,0.000000,1.693147
attention,0.000000,1.693147,0.000000
blindness,1.693147,0.000000,0.000000
breeding,0.000000,0.000000,1.693147
...,...,...,...
think,1.693147,0.000000,0.000000
unshakable,0.000000,1.693147,0.000000
useful,0.000000,1.693147,0.000000
walk,1.693147,0.000000,0.000000


In [2]:
# All drop tokens in one flat list, then the same list without stop words.
flat = list(itertools.chain.from_iterable(q_tokens))
flat_stops_removed = [f for f in flat if f not in stop_words]

count_vec = CountVectorizer()

doc_2_processed = preprocess_text(document_2)

# Term counts for the single processed document.
term_frequencies = count_vec.fit_transform([doc_2_processed])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
tf_feature_names = (
    count_vec.get_feature_names_out()
    if hasattr(count_vec, 'get_feature_names_out')
    else count_vec.get_feature_names()
)

df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(), index=tf_feature_names, columns=['Term Frequency'])

NameError: name 'itertools' is not defined

In [1]:
df_term_frequencies

NameError: name 'df_term_frequencies' is not defined

In [27]:
# tf_idf_scores is documents x terms (df_tf_idf is built from its transpose), so
# the neighbour index must be fitted on the transpose to compare terms with
# terms. Fitting on the documents, as before, cannot be queried with a term row.
term_vectors = tf_idf_scores.T.tocsr()
nbrs = NearestNeighbors(n_neighbors=10).fit(term_vectors)


def closest_nbrs(word):
    """Return the terms whose tf-idf vectors lie closest to that of *word*.

    Args:
        word: a term present in the index of ``df_tf_idf``.

    Returns:
        DataFrame with a ``distance`` column and a ``word`` column, one row
        per neighbour in the order returned by ``nbrs.kneighbors``.

    Raises:
        KeyError: if *word* is not in the ``df_tf_idf`` index.
    """
    row = df_tf_idf.index.get_loc(word)
    distances, indices = nbrs.kneighbors(term_vectors[row])
    # Map neighbour row numbers back to terms through the frame's own index;
    # reset_index() yields no 'word' column, so the old lookup raised KeyError.
    neighbour_terms = df_tf_idf.index[indices.flatten()].to_numpy()
    result = pd.DataFrame({'distance': distances.flatten(), 'word': neighbour_terms})
    return result

In [28]:
def most_similar(x, model, top_n=5):
    """Return the *top_n* rows of *model* nearest to *x*.

    *x* is reshaped to a single row and compared with every row of *model*
    by Euclidean distance. The result is a list of ``(row_index, distance)``
    tuples sorted by increasing distance.
    """
    distances_to_rows = euclidean_distances(x.reshape(1, -1), model)[0]
    ranked = sorted(enumerate(distances_to_rows), key=lambda pair: pair[1])
    return ranked[:top_n]

In [29]:
similarities = most_similar(x, lsi_Z)
document_id, similarity = similarities[0]

In [30]:
# The last three entries of stops_removed, taken in reverse order.
q_corpus = [stops_removed[-1], stops_removed[-2], stops_removed[-3]]

# No preprocess_text pass is needed here: the text was already processed.

q_tfidf_vectorizer = TfidfVectorizer(norm=None)

q_tf_idf_scores = q_tfidf_vectorizer.fit_transform(q_corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
q_feature_names = (
    q_tfidf_vectorizer.get_feature_names_out()
    if hasattr(q_tfidf_vectorizer, 'get_feature_names_out')
    else q_tfidf_vectorizer.get_feature_names()
)
# Each column is labelled with the text of its drop.
q_corpus_index = list(q_corpus)

# Transposed so that rows are terms and columns are drops.
q_df_tf_idf = pd.DataFrame(q_tf_idf_scores.T.todense(), index=q_feature_names, columns=q_corpus_index)

In [340]:
tfidf_vec = TfidfVectorizer()

# tf-idf scores for every stop-word-filtered drop.
q_drops_vectorized = tfidf_vec.fit_transform(stops_removed)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
q_drops_features = (
    tfidf_vec.get_feature_names_out()
    if hasattr(tfidf_vec, 'get_feature_names_out')
    else tfidf_vec.get_feature_names()
)
# Each column is labelled with the text of its drop.
q_drops_index = list(stops_removed)

# Transposed so that rows are terms and columns are drops.
q_drops_df = pd.DataFrame(q_drops_vectorized.T.todense(), index=q_drops_features, columns=q_drops_index)

NameError: name 'stops_removed' is not defined

In [339]:
q_drops_df.sort_values(by=q_drops_df.columns[-1], ascending=False).T

NameError: name 'q_drops_df' is not defined

In [31]:
q_df_tf_idf.sort_values(by=q_df_tf_idf.columns[2], ascending=False)

Unnamed: 0,mockingbird hrc detained arrested yet huma follow huma nothing w russia yet potus surround w generals military intelligence go around letter agencies supreme court case allows use mi v congressional assembled approved agencies ultimate authority branches military wo approval conditions unless wartime conditions military code aw held potus go tv address nation potus must isolate prevent negative optics potus knew removing criminal rogue elements first step essential free pass legislation access everything classified believe hrc soros obama etc power trump fantasy whoever controls office presidecy controls great land never believed moment democrats republicans would lose control r v battle soros donate money recently would place funds rc mockingbird god bless fellow patriots,open eyes finally came rodbob key players uranium scandal dont think potus would tweeting removal given clear conflict potus meet bob cover fbi dir interview bob unable serve dir per law gowdy comments comey history potus everything everyone corrupt fewer think follow huma operation mockingbird priority clean bad actors unite people behind america first agenda many govt worship satan republicans v democrats stage hrc ng called across cities trust president god bless patriots,us come drop crumbs crumbs potus insulated discussion suggesting hes even target false potus addressing nation issues people begin indicted must remain neutral pure optical reasons suggest plan false common sense focus military intellingence state secrets might used vs three letter agency sc decision opened door sitting president activate must showed potus surrounded generals lot good people bad faith hostile takeover evil corrupt network players democrats dont fool thinking obama soros roths clintons etc power present day potus operation mockingbird patriots control sit back enjoy show
potus,4.000000,3.000000,4.000000
false,0.000000,0.000000,3.386294
crumbs,0.000000,0.000000,3.386294
must,1.287682,0.000000,2.575364
people,0.000000,1.287682,2.575364
...,...,...,...
funds,1.693147,0.000000,0.000000
given,0.000000,1.693147,0.000000
go,3.386294,0.000000,0.000000
god,1.287682,1.287682,0.000000


In [32]:
# Read the whole chapter into memory. The context manager closes the handle,
# which the bare open() call previously left open for the rest of the session.
with open('/Users/kylereaves/Desktop/conspiratorial_texts/mein_kampf/11_race_and_nation.txt', 'r') as chapter_11:
    content = chapter_11.read()

In [33]:
# Term counts over the chapter treated as one single document.
eleven_counter = CountVectorizer()

eleven_matrix = eleven_counter.fit_transform([content])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
eleven_features = (
    eleven_counter.get_feature_names_out()
    if hasattr(eleven_counter, 'get_feature_names_out')
    else eleven_counter.get_feature_names()
)

eleven_df = pd.DataFrame(eleven_matrix.T.todense(), index=eleven_features, columns=['Term Frequencies'])

In [34]:
sentences = nltk.sent_tokenize(content)
chapter_11_tokens = [nltk.word_tokenize(sentence) for sentence in sentences]

In [35]:
processed_sentences = [preprocess_text(sentence) for sentence in sentences]

In [36]:
# One lemmatized, stop-word-free string per sentence of the chapter.
chapter_11_cleaned = [preprocess_text(sentence) for sentence in sentences]

# norm=None keeps the raw tf-idf products instead of length-normalised rows.
eleven_count = TfidfVectorizer(norm=None)

chapter_11_vectorized = eleven_count.fit_transform(chapter_11_cleaned)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
chapter_11_features = (
    eleven_count.get_feature_names_out()
    if hasattr(eleven_count, 'get_feature_names_out')
    else eleven_count.get_feature_names()
)

# Rows are terms; each column is labelled with the cleaned sentence it scores.
chapter_11_df = pd.DataFrame(chapter_11_vectorized.T.todense(), index=chapter_11_features, columns=list(chapter_11_cleaned))

In [97]:
# Neighbour index over the rows of the transposed matrix, i.e. one row per term.
q_nbrs = NearestNeighbors(n_neighbors=20).fit(chapter_11_vectorized.T)

# Query the index with the row that belongs to the term 'russia'.
row = chapter_11_df.index.get_loc('russia')
distances, indices = q_nbrs.kneighbors(chapter_11_vectorized.T.getrow(row))
names_similar = pd.Series(indices.flatten())
# 'name' holds the neighbour's row number, 'topic' the term found at that row.
result = pd.DataFrame(
    data={
        'distance': distances.flatten(),
        'name': names_similar,
        'topic': [chapter_11_df.index[position] for position in indices.flatten()],
    }
)

In [98]:
result

Unnamed: 0,distance,name,topic
0,0.0,726,ferocity
1,0.0,1642,scribbler
2,0.0,409,crowd
3,0.0,1614,rulership
4,0.0,1054,kill
5,0.0,1774,starve
6,0.0,991,inhuman
7,0.0,1601,robber
8,0.0,1617,russia
9,0.0,1891,thirty


In [40]:
chapter_11_df.sort_values(by=chapter_11_df.columns[-3], ascending=False)

Unnamed: 0,statement truth obvious reason common world see least recognize,time world pass well know truism blindly astonish suddenly somebody discover everybody ought know,columbus egg lie hundred thousand columbus rarely see,thus without exception people wander nature garden think know almost everything yet exception walk blindly one outstanding principle nature work inner seclusion specie live be earth,even superficial observation show almost brazen basic principle countless form expression nature live limit form propagation increase limit,every animal mate representative specie,titmouse seek titmouse finch finch stork stork field mouse field mouse common mouse common mouse wolf wolf etc,exceptional circumstance change first compulsion captivity well impossibility mat within specie,nature begin resist help visible mean visible protest consist either deny bastard procreative faculty limit fertility come offspring case take away capacity resistance disease inimical attack,natural,...,activity condemn unproductiveness sole reason favorable case best saw try fight symptomatic form general sickness pass blindly germ,systematically follow political development old reich bind arrive upon quiet examination realization even time unity thus rise german nation inner decay already way despite apparent political success rise economic wealth general situation become bad year year,even election reichstag outward swell marxist vote announce rapidly approach internal external collapse,success call bourgeois party value unable check increase number marxist flood even call bourgeois electoral victory even harbor ferment deterioration,bourgeois world without know infect cadaveric poison marxist idea resistance originate frequently rather competitive envy ambitious leader rejection principle adversary determine fight extreme,one long year fight imperturbable regularity jew,star david rise high high measure people self preservation vanish,therefore august 1914 people determine attack 
rush battlefield take place last flare national instinct self preservation face progress pacifist marxist paralyzation national body,even fateful day one recognize internal enemy outward resistance vain providence give reward victorious sword follow law eternal revenge,inner realization form u lead principle well tendency new movement alone conviction enable bring decline german people standstill also create granite foundation upon one day exist state represent mechanism economic consideration interest alien people folkish organism germanic state german nation
national,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.710307,0.0,0.0
1914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.657739,0.0,0.0
rush,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.657739,0.0,0.0
flare,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.252273,0.0,0.0
august,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.252273,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
faint,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
fail,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
faculty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.657739,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
factory,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [51]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [64]:
# Reduce the transposed chapter matrix (one row per term) to two components.
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(chapter_11_vectorized.T)

# One row per term: its two coordinates plus the term itself as the label.
df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': chapter_11_features,
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
label_style = dict(text_font_size='8pt', text_color='#555555', text_align='center')
labels = LabelSet(x="x", y="y", text="document", y_offset=8, source=source, **label_style)

# Scatter plot with a text label attached to every point.
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [86]:
# Refit the existing two-component `svd` on the transposed Q-drop matrix.
q_documents_2d = svd.fit_transform(q_drops_vectorized.T)

# One row per term: its two coordinates plus the term itself as the label.
qdf = pd.DataFrame({
    'x': q_documents_2d[:, 0],
    'y': q_documents_2d[:, 1],
    'document': q_drops_features,
})

q_source = ColumnDataSource(ColumnDataSource.from_df(qdf))
q_label_style = dict(text_font_size='8pt', text_color='#555555', text_align='center')
q_labels = LabelSet(x="x", y="y", text="document", y_offset=8, source=q_source, **q_label_style)

# Scatter plot with a text label attached to every point.
qplot = figure(plot_width=600, plot_height=600)
qplot.circle("x", "y", size=12, source=q_source, line_color="black", fill_alpha=0.8)
qplot.add_layout(q_labels)
show(qplot, notebook_handle=True)

In [117]:
qdf[qdf.document.str.contains('awakening')]

Unnamed: 0,x,y,document
709,0.00496549,0.006292779,awakening
4186,2.960512e-13,7.197815e-14,greatawakening
9574,0.01404358,0.04249221,thegreatawakening


In [88]:
df

Unnamed: 0,x,y,document
0,0.380641,-0.209717,1914
1,0.080712,0.038437,1918
2,0.050968,-0.056008,abate
3,3.578097,0.300698,ability
4,4.225683,0.729187,able
...,...,...,...
2111,0.157034,-0.026401,yield
2112,1.189619,-0.072356,young
2113,0.078419,0.034106,zeitung
2114,0.258819,0.105940,zion
