# Parsing Madness:
## Analyzing linguistic and thematic patterns in QAnon 'drops'
## An exercise in web scraping, dataframe creation and management, and natural language processing
##### beautiful soup -> pandas dataframe
##### request -> soup -> imgs and text -> pandas dataframe
##### text -> cleaned -> tokenized -> remove stop words -> features dictionary

In [4]:
import matplotlib.pyplot as plt
import matplotlib.pyplot as mpim
import requests
import itertools
import re
import string 
import nltk
import os
import shutil
import urllib.request
import pandas as pd
import numpy as np
import datetime as dt
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from bs4 import BeautifulSoup
from itertools import islice, zip_longest 
from skimage import io
from nltk.corpus import stopwords
from nltk import RegexpParser, Tree
from nltk.util import ngrams
from urllib.request import Request, urlopen
from IPython.display import Image
from collections import Counter

In [5]:
os.getcwd()

'/Users/kylereaves/Documents/GitHub/parsing_madness'

## Scraping and Parsing HTML with BeautifulSoup

In [116]:
base_url = 'https://qposts.online'
# since there are 104 pages total, an object to capture them all is created
urls = ['https://qposts.online/page/{}'.format(i) for i in range(1, 105)]
# a list comprehension requesting each url
requests_urls = [requests.get(url) for url in urls]
# a soup object for all of the requests 
soups = [BeautifulSoup(url.text, 'html.parser') for url in requests_urls]

# the images and content of message will serve as the data for the dataframe
all_images = [soup.findAll('img') for soup in soups]
all_messages = [soup.findAll('div', 'message') for soup in soups]
# the information contained in meta will serve as the dataframe's index
all_meta = [soup.findAll('div', {'class': ['meta', 'lar']}) for soup in soups]

punct = string.punctuation + str('’') + str('“') + str('”') + str('‘') + str('–') + str('…')

In [4]:
all_nums = []
all_times = []
for meta in all_meta:
    for div in meta:
        for span in div.findAll('span', {'class': ['num']}):
            all_nums.append(span.text)
        for span in div.findAll('span', {'class': ['time']}):
            all_times.append(span.text)

all_dates = pd.to_datetime(all_times, unit='s').date
all_hours = pd.to_datetime(all_times, unit='s').time
# some days Q posted multiple times; a multiindex is useful here
all_multi = pd.MultiIndex.from_arrays([all_dates, all_hours], names=['date', 'hour'])


# the website contains inconsistent tag use; the earliest posts use <br>'s
# while the later posts use <p>; this needed to be accounted for with an if statement 
all_total = []
for message in all_messages:
    for msg in message:
        inner = []
        for br in msg.findAll('br'):
            br.replace_with(' ')
        for text in msg.findAll('div', class_='text'):
            if text.next_element.name == 'p':
                for p in text.findAll('p'):
                    inner.append(p.get_text())
            else:
                inner.append(text.get_text())
        for img in msg.findAll('img'):
            inner.append(base_url+img['data-src'])
        all_total.append([i for i in inner if i])

In [11]:
only_text = []
for message in all_messages:
    for msg in message:
        inner = []
        for br in msg.findAll('br'):
            br.replace_with(' ')
        for text in msg.findAll('div', class_='text'):
            if text.next_element.name == 'p':
                for p in text.findAll('p'):
                    inner.append(p.get_text())
        only_text.append([i for i in inner if i])

In [35]:
def text_cleaning(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[[\]]', '', text)
    text = re.sub('[%s]' % re.escape(punct), '', text)
    text = re.sub(r'\\u2002', '', text)
    text = re.sub(r'\u2002', '', text)
    text = re.sub(r'>>\d+', '', text)
    text = re.sub(r'\w+.jp(e*)g|\w+.png', '', text)
    text = re.sub(r'\t', '', text)
    text = re.sub(r'…|_|!', '', text)
    text = re.sub(r'\d', '', text)
    return text

In [7]:
with open('drops.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(all_total, f, pickle.HIGHEST_PROTOCOL)

## pandas dataframe

In [6]:
q_df = pd.DataFrame({'number': all_nums, 'q_drop': all_total}, index=all_multi)
q_df['number'] = q_df['number'].astype('float')
q_df['q_drop'] = q_df['q_drop'].astype('str')
pd.set_option('display.max_colwidth', None)  

def search(query):
    query_df = q_df.loc[q_df.q_drop.str.contains(query)].copy()
    return query_df

In [8]:
q_df.to_csv('q_drops.txt')

In [7]:
q_df = pd.read_csv('q_drops.txt')

In [29]:
q_df[q_df.q_drop.str.contains('https')]

Unnamed: 0,date,hour,number,q_drop
0,2020-12-08,22:05:50,4953.0,['https://www.youtube.com/watch?v=O1l-nR1Apj4'...
2,2020-11-13,03:20:17,4951.0,"['Shall we play a game?', '[N]othing [C]an [S]..."
3,2020-11-13,02:32:39,4950.0,"['Nothing can stop what is coming.', 'Nothing!..."
4,2020-11-03,06:27:36,4949.0,['https://www.youtube.com/watch?v=9tjdswqGGVg&...
5,2020-11-02,22:48:50,4948.0,['https://twitter.com/TimMurtaugh/status/13233...
...,...,...,...,...
4874,2017-11-05,04:16:50,79.0,['Graphic is right. Add above points to graphi...
4878,2017-11-05,04:14:37,75.0,['By the time POTUS returns from his trip the ...
4902,2017-11-02,18:12:06,51.0,['>>147642589one-nation-under-god-t-shirt_desi...
4925,2017-11-01,05:59:01,28.0,"["">>147450119Spy.png>>147441102 >What must be ..."


In [93]:
images_from_pickle = re.findall(r'https://qposts\S+(?:jpg|jpeg|png)', str(drops_pickled))
links_from_pickle = re.findall(r'\bhttps?://(?!\S+(?:jpe?g|png))\S+', str(drops_pickled))

In [30]:
with open('drops.pickle', 'rb') as f:
    drops_pickled = pickle.load(f)

### Instead of requesting and parsing soup objects each time, we can pickle outputs for optimization

In [8]:
#list(itertools.chain.from_iterable(total_pickled))

### method for cleaning a pickled outpit

In [142]:
pickled_only_text = []
for drop in drops_pickled:
    for line in drop:
        if line.startswith('https') or line.startswith('>>') or line == 'Q' or line.startswith('>'):
            pass
        else:
            line = re.sub('[%s]' % re.escape(punct), '', line)
            line = re.sub(r'https?\w+', '', line)
            line = re.sub(r'\d(th)?', '', line)
            line = re.sub(r'\u2002', ' ', line)
            line = re.sub(r'www\w+', '', line)
            line = line.lower()
            if not line.strip():
                pass
            else:
                pickled_only_text.append(line.strip())

In [143]:
with open('cleaned_text.pickle', 'wb') as f:
    pickle.dump(pickled_only_text, f, pickle.HIGHEST_PROTOCOL)

In [144]:
with open('cleaned_text.pickle', 'rb') as f:
    cleaned_text = pickle.load(f)

In [153]:
tokens = [nltk.word_tokenize(text) for text in cleaned_text]

## Natural Language Processing

**steps for cleaning data**:
* concact text into one string
* make the text lowercase
* remove punctuation 
* remove urls
* remove numbers

In [43]:
def text_to_bow(some_text):
    bow_dictionary = {}
    for text in some_text:
        if text in bow_dictionary:
            bow_dictionary[text] += 1
        else:
            bow_dictionary[text] = 1
    return bow_dictionary

In [44]:
flat_cleaned_text = [text for text in cleaned_text if text and text != 'Q']

In [45]:
stopped = [nltk.word_tokenize(tokens) for tokens in flat_cleaned_text if tokens not in stop_words]
tagged = [nltk.pos_tag(stop) for stop in stopped]

## Bag of Words dictionary 

In [56]:
bow_dict = text_to_bow(stopped_flat_words)

In [20]:
Counter(stopped_flat_words).most_common(50)

[('q', 401),
 ('people', 389),
 ('news', 319),
 ('us', 300),
 ('think', 297),
 ('potus', 284),
 ('control', 283),
 ('public', 266),
 ('treason', 232),
 ('media', 226),
 ('fake', 209),
 ('one', 195),
 ('comey', 195),
 ('narrative', 185),
 ('2', 180),
 ('would', 180),
 ('brennan', 171),
 ('election', 169),
 ('1', 165),
 ('see', 163),
 ('time', 163),
 ('power', 159),
 ('truth', 152),
 ('party', 146),
 ('wwg1wga', 145),
 ('w', 144),
 ('political', 143),
 ('james', 143),
 ('fbi', 141),
 ('president', 141),
 ('china', 138),
 ('mueller', 134),
 ('new', 130),
 ('post', 130),
 ('john', 130),
 ('must', 129),
 ('ds', 129),
 ('know', 128),
 ('coming', 122),
 ('believe', 120),
 ('many', 119),
 ('happens', 116),
 ('attack', 114),
 ('united', 114),
 ('push', 112),
 ('anons', 112),
 ('state', 109),
 ('ca', 109),
 ('god', 108),
 ('last', 107)]

In [21]:
def list_of_NPS(sentences):
    patterns = """
        NP: {<JJ.*>+<NN.*>+}
        """
    NPChunker = nltk.RegexpParser(patterns)
    nps = []
    for sent in sentences:
        tree = NPChunker.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'NP':
                t = subtree
                t = ' '.join(word for word, tag in t.leaves())
                nps.append(t)
    return nps

In [22]:
def list_of_VPS(sentences):
    patterns = """VP: {<VB.*><DT>?<JJ>*<NN><RB.?>?}"""
    VPChunker = nltk.RegexpParser(patterns)
    vps = []
    for sent in sentences:
        tree = VPChunker.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'VP':
                t = subtree
                t = ' '.join(word for word, tag in t.leaves())
                vps.append(t)
    return vps

In [23]:
Counter(list_of_VPS(tagged)).most_common(50)



[('Enjoy the show', 28),
 ('are the news now', 18),
 ('was a time', 14),
 ('is power', 13),
 ('Watch the news', 12),
 ('is a reason', 11),
 ('is everything', 11),
 ('re a desired topic', 11),
 ('Ds push', 10),
 ('leading the attack', 10),
 ('expend ammunition', 9),
 ('organized use', 9),
 ('See something', 9),
 ('evinces a design', 8),
 ('challenge political control', 8),
 ('is a philosophical viewpoint', 8),
 ('regarding truth', 8),
 ('Follow the money', 8),
 ('make the connection', 8),
 ('pose no threat', 7),
 ('TELL the public', 7),
 ('regain power', 7),
 ('becomes a threat', 7),
 ('control the narrative andor', 7),
 ('have free thought', 7),
 ('is the only way forward', 7),
 ('think this war', 7),
 ('considered a major threat', 7),
 ('is a direct attack', 7),
 ('embedded link', 7),
 ('was clear verify', 7),
 ('launch a massive domestic foreign surv', 7),
 ('elect president', 7),
 ('verified report', 7),
 ('considering the funding', 7),
 ('was the opposition', 7),
 ('granted auth', 

In [61]:
Counter(list_of_NPS(tagged)).most_common(50)



[('Logical thinking', 49),
 ('GREAT AWAKENING', 18),
 ('only way', 15),
 ('full armor', 15),
 ('desired topic', 11),
 ('first time', 9),
 ('American people', 9),
 ('last night', 9),
 ('original mandate', 9),
 ('many people', 8),
 ('public exposure', 8),
 ('new Government', 8),
 ('such principles', 8),
 ('such form', 8),
 ('long train', 8),
 ('same Object', 8),
 ('such Government', 8),
 ('new Guards', 8),
 ('political control', 8),
 ('Democratic Former Mayor', 8),
 ('social media', 8),
 ('Free thought', 8),
 ('philosophical viewpoint', 8),
 ('logic reason', 8),
 ('foreign entity', 8),
 ('highest levels', 7),
 ('mathematical probability', 7),
 ('greatest fear', 7),
 ('transient causes', 7),
 ('old girl', 7),
 ('deep state', 7),
 ('nondogmatic ’ information', 7),
 ('narrative andor', 7),
 ('stable ‘ groupthink ’ collective', 7),
 ('free thought ’', 7),
 ('future events', 7),
 ('dark world', 7),
 ('spiritual forces', 7),
 ('devil ’ s schemes', 7),
 ('’ t', 7),
 ('public opinion', 7),
 ('il

In [24]:
Counter([word.lower() for word in re.findall('\w+ly', str(tagged))]).most_common(20)

[('only', 133),
 ('family', 50),
 ('really', 45),
 ('carefully', 40),
 ('simply', 38),
 ('knowingly', 33),
 ('directly', 28),
 ('daily', 27),
 ('early', 24),
 ('immediately', 20),
 ('mathematically', 20),
 ('actually', 19),
 ('publicly', 19),
 ('rally', 18),
 ('currently', 18),
 ('effectively', 17),
 ('july', 16),
 ('probably', 16),
 ('apply', 15),
 ('actively', 15)]

In [42]:
Counter(filter(lambda x: (x[1] == 'RB'), flat_tagged)).most_common(50)

[(('not', 'RB'), 382),
 (('now', 'RB'), 132),
 (('very', 'RB'), 91),
 (('just', 'RB'), 85),
 (('only', 'RB'), 85),
 (('here', 'RB'), 78),
 (('still', 'RB'), 70),
 (('so', 'RB'), 68),
 (('never', 'RB'), 58),
 (('always', 'RB'), 55),
 (('then', 'RB'), 55),
 (('prior', 'RB'), 55),
 (('again', 'RB'), 54),
 (('no', 'RB'), 53),
 (('back', 'RB'), 51),
 (('Sometimes', 'RB'), 50),
 (('really', 'RB'), 43),
 (('carefully', 'RB'), 39),
 (('ever', 'RB'), 39),
 (('too', 'RB'), 37),
 (('simply', 'RB'), 36),
 (('also', 'RB'), 34),
 (('already', 'RB'), 34),
 (('well', 'RB'), 33),
 (('long', 'RB'), 32),
 (('Now', 'RB'), 30),
 (('soon', 'RB'), 30),
 (('So', 'RB'), 29),
 (('longer', 'RB'), 28),
 (('directly', 'RB'), 27),
 (('even', 'RB'), 26),
 (('right', 'RB'), 25),
 (('far', 'RB'), 24),
 (('yet', 'RB'), 24),
 (('as', 'RB'), 24),
 (('there', 'RB'), 23),
 (('Not', 'RB'), 22),
 (('ago', 'RB'), 22),
 (('instead', 'RB'), 22),
 (('Only', 'RB'), 20),
 (('immediately', 'RB'), 20),
 (('forward', 'RB'), 19),
 (('