# Parsing Madness:
## analyzing linguistic and thematic patterns in QAnon 'drops'
## An exercise in web scraping, database creation and management, and natural language processing
##### beautiful soup -> pandas dataframe
##### request -> soup -> imgs and text -> pandas dataframe

In [1]:
from bs4 import BeautifulSoup
from itertools import islice, zip_longest 
from skimage import io
import matplotlib.pyplot as plt
import matplotlib.pyplot as mpim
import requests
import itertools
import re
import string 
import nltk
from nltk.corpus import stopwords
from nltk import RegexpParser, Tree
from nltk.util import ngrams
import shutil
import urllib.request
from urllib.request import Request, urlopen
import os
from IPython.display import Image
import pandas as pd
import numpy as np
import datetime as dt
import pickle
from collections import Counter

In [2]:
os.getcwd()

'/Users/kylereaves/Documents/GitHub/parsing_madness'

In [3]:
base_url = 'https://qposts.online'
# The archive is paginated; pages 1..104 are requested (104 pages total when
# this notebook was written).
urls = ['https://qposts.online/page/{}'.format(i) for i in range(1, 105)]

# Fetch every page through a single Session so the underlying connection is
# reused across all requests. A timeout keeps one stalled request from
# hanging the notebook, and raise_for_status() stops the run on an HTTP error
# instead of silently parsing an error page as if it were a page of posts.
requests_urls = []
with requests.Session() as session:
    for url in urls:
        response = session.get(url, timeout=30)
        response.raise_for_status()
        requests_urls.append(response)

# one soup object per downloaded page
soups = [BeautifulSoup(response.text, 'html.parser') for response in requests_urls]

# the images and content of message will serve as the data for the dataframe
all_images = [soup.findAll('img') for soup in soups]
all_messages = [soup.findAll('div', 'message') for soup in soups]
# the information contained in meta will serve as the dataframe's index;
# a list of classes matches a div carrying either 'meta' or 'lar'
all_meta = [soup.findAll('div', {'class': ['meta', 'lar']}) for soup in soups]

In [4]:
# Collect, page by page and post by post, the text of the 'num' and 'time'
# spans found inside each meta div. Both lists are filled in document order
# and are later used side by side, so they are expected to stay aligned.
all_nums = []
all_times = []
for meta in all_meta:
    for div in meta:
        for span in div.findAll('span', {'class': ['num']}):
            all_nums.append(span.text)
        for span in div.findAll('span', {'class': ['time']}):
            all_times.append(span.text)

# The 'time' spans hold epoch seconds as *text*. Convert them to numbers
# explicitly instead of relying on to_datetime() casting strings when `unit`
# is given, and do the conversion once rather than once per derived column.
all_datetimes = pd.to_datetime(pd.to_numeric(all_times), unit='s')
all_dates = all_datetimes.date
all_hours = all_datetimes.time
# some days Q posted multiple times; a multiindex is useful here
all_multi = pd.MultiIndex.from_arrays([all_dates, all_hours], names=['date', 'hour'])


# Build one list per post holding its text fragments followed by its image
# URLs. The markup is not uniform: some posts separate lines with <br>,
# others wrap each line in <p>, so both layouts are handled below.
# NOTE(review): findAll searches recursively, so text inside a nested
# div.text (presumably a quoted reply) may be collected more than once --
# verify against the page markup.
all_total = []
for page_messages in all_messages:
    for post in page_messages:
        # Turn every <br> into a space so get_text() does not glue lines
        # together. This edits the parsed tree in place.
        for line_break in post.findAll('br'):
            line_break.replace_with(' ')

        pieces = []
        for body in post.findAll('div', class_='text'):
            if body.next_element.name == 'p':
                # <p>-style post: one entry per paragraph
                pieces.extend(p.get_text() for p in body.findAll('p'))
            else:
                # <br>-style post: the whole body as a single entry
                pieces.append(body.get_text())

        # image links are site-relative, so prefix them with the base URL
        pieces.extend(base_url + img['data-src'] for img in post.findAll('img'))

        # drop empty strings before storing the post
        all_total.append([piece for piece in pieces if piece])

In [5]:
'''with open('total_pickled.pkl', 'rb') as f:
    total_pickled = pickle.load(f)'''

"with open('total_pickled.pkl', 'rb') as f:\n    total_pickled = pickle.load(f)"

In [6]:
# Assemble the drops into a frame indexed by (date, hour). The post number is
# stored as a float and each drop's list of fragments is stored as its string
# form, which is what the text search below operates on.
q_df = pd.DataFrame(
    {'number': all_nums, 'q_drop': all_total},
    index=all_multi,
).astype({'number': 'float', 'q_drop': 'str'})
# show full cell contents instead of truncating long drops
pd.set_option('display.max_colwidth', None)

def search(query, df=None, *, case=True, regex=True):
    """Return the rows whose ``q_drop`` text contains *query*.

    Parameters
    ----------
    query : str
        Text (or regular expression, see *regex*) to look for.
    df : pandas.DataFrame, optional
        Frame to search; it must have a string column named ``q_drop``.
        Defaults to the module-level ``q_df``.
    case : bool, default True
        Match case-sensitively.
    regex : bool, default True
        Treat *query* as a regular expression. Pass ``False`` to search for
        literal text that contains regex metacharacters such as ``[``, ``?``
        or ``+``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, so editing the result does
        not touch the searched frame.
    """
    if df is None:
        df = q_df
    mask = df['q_drop'].str.contains(query, case=case, regex=regex)
    return df.loc[mask].copy()

In [7]:
#list(itertools.chain.from_iterable(total_pickled))

In [8]:
'''names = [name.text for name in soup.findAll('figcaption')]
figures = soup.findAll('figure')
figure_links = [(base_url + figure.a['href']) for figure in figures]'''

"names = [name.text for name in soup.findAll('figcaption')]\nfigures = soup.findAll('figure')\nfigure_links = [(base_url + figure.a['href']) for figure in figures]"

In [10]:
def image_downloader(url, folder):
    """Download the images referenced on the page at *url* into *folder*.

    *folder* is resolved relative to the current working directory and is
    created if it does not exist. Unlike the earlier draft, the working
    directory is NOT changed: files are written through explicit paths, so
    repeated calls do not nest folders inside each other.

    Parameters
    ----------
    url : str
        Address of the HTML page to scan for ``<img>`` tags.
    folder : str
        Name (or relative path) of the directory that receives the files.

    Returns
    -------
    list of str
        Paths of the files written by this call. Images that already exist
        on disk, have no usable link/name, or fail to download are skipped.

    Raises
    ------
    requests.HTTPError
        If the page at *url* itself cannot be fetched.
    """
    from urllib.parse import urljoin, urlparse

    target_dir = os.path.join(os.getcwd(), folder)
    os.makedirs(target_dir, exist_ok=True)

    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    saved = []
    for image in soup.find_all('img'):
        # Elsewhere in this notebook the image address is read from
        # 'data-src', so prefer it and fall back to the plain 'src'.
        link = image.get('data-src') or image.get('src')
        if not link:
            continue
        link = urljoin(url, link)  # resolve site-relative links

        # Name the file after the alt text, else after the last path segment
        # of the link; basename() keeps the name inside target_dir.
        name = os.path.basename(image.get('alt') or urlparse(link).path)
        if not name:
            continue

        destination = os.path.join(target_dir, name)
        if os.path.exists(destination):
            continue

        response = requests.get(link, timeout=30)
        if not response.ok:
            # do not store an error page under an image file name
            continue
        with open(destination, 'wb') as f:
            f.write(response.content)
        saved.append(destination)
    return saved

## function to download images

In [16]:
# Save every figure's linked image into the current directory, named after
# its caption.
for figure in figures:
    name = figure.figcaption.text  # full caption text, extension included
    link = base_url + figure.a['href']

    # Check for an existing file BEFORE opening anything: open(name, 'wb')
    # creates or truncates the file, which previously emptied every existing
    # image and made this test always succeed.
    if os.path.exists(name):
        print('Skipping: {0} is already in directory.'.format(name))
        continue

    # only hit the network for files that are actually missing
    im = requests.get(link)
    if not im.ok:
        # do not write an error page to disk under an image file name
        print('Failed: {0} returned HTTP {1}.'.format(link, im.status_code))
        continue

    with open(name, 'wb') as f:
        f.write(im.content)
    print('Writing: ', name)

Skipping: GodBlessAmerica.png is already in directory.
Skipping: largest_flying_flag_in_america.jpg is already in directory.
Skipping: Elf4vVlWoAAJHjQ.jpg is already in directory.
Skipping: Elf4vVoW0AUO87D.jpg is already in directory.
Skipping: Elf4vVkX0AAMM0b.jpg is already in directory.
Skipping: Elf4vVpXUAE04fm.jpg is already in directory.
Skipping: Screen_Shot_2020_10_22_at_6_48_37_PM.png is already in directory.
Skipping: 1603384746326.jpg is already in directory.
Skipping: EJ6EtDcWoAAnhTJ.png is already in directory.
Skipping: EJ6EtDjWsAE4e7H.png is already in directory.
Skipping: EJ6EtDYXYAEr4sZ.png is already in directory.
Skipping: Bishop.jpg is already in directory.
Skipping: David_Bowdich.jpg is already in directory.
Skipping: Haspel.jpg is already in directory.
Skipping: EdWDTBXVcAAwnm0.png is already in directory.
Skipping: Ek4G8urXIAEKasf.jpg is already in directory.
Skipping: EMAqDm9UUAE3wDQ.png is already in directory.
Skipping: ELT3m8uXYAAWTPV.jpg is already in directo

## request and display the links in links

In [15]:
# this loop is specifically for requesting image hyperlinks

"""number = 0
for r in r_s:
    if r.status_code == 200:
        with open(str(number)+'.jpg', 'wb') as f:
            number += 1
            r.raw_decode_content = True
            shutil.copyfileobj(r.raw, f)"""

"number = 0\nfor r in r_s:\n    if r.status_code == 200:\n        with open(str(number)+'.jpg', 'wb') as f:\n            number += 1\n            r.raw_decode_content = True\n            shutil.copyfileobj(r.raw, f)"

In [16]:
# image_url = plt.imread('https://qposts.online/assets/images/86995bcbf3259c3a976db841eebdafac17b46d8b2985574475a60a8608af1f0d.jpg')
# plt.imshow(image_url)
# plt.show()

In [17]:
'''img = mpim.imread('image1')
imgplot = plt.imshow(img)
plt.show()'''

"img = mpim.imread('image1')\nimgplot = plt.imshow(img)\nplt.show()"

In [18]:
'''plt.imshow(io.imread('democrat_party_crumbling_held_up_by_cnn_msnbc_cbs_nyt.jpg'))'''

"plt.imshow(io.imread('democrat_party_crumbling_held_up_by_cnn_msnbc_cbs_nyt.jpg'))"

In [19]:
# [display(Image(file)) for file in file_list]

In [20]:
'''plt.imshow(io.imread('Elf4vVlWoAAJHjQ.jpg'))'''

"plt.imshow(io.imread('Elf4vVlWoAAJHjQ.jpg'))"

## Natural Language Processing

**steps for cleaning data**:
* concatenate text into one string
* make the text lowercase
* remove punctuation 
* remove urls
* remove numbers

In [9]:
# Collect, for every post, only the paragraph text: posts whose text body
# does not start with a <p> contribute an empty list, and image links are
# left out entirely.
only_text = []
for page_messages in all_messages:
    for post in page_messages:
        # replace <br> tags with spaces (edits the parsed tree in place)
        for line_break in post.findAll('br'):
            line_break.replace_with(' ')

        paragraphs = [
            p.get_text()
            for body in post.findAll('div', class_='text')
            if body.next_element.name == 'p'
            for p in body.findAll('p')
        ]
        # keep non-empty paragraphs only
        only_text.append([chunk for chunk in paragraphs if chunk])

In [10]:
# Flatten the per-post paragraph lists into one list of paragraphs.
flat_only_text = [paragraph for post in only_text for paragraph in post]
# String form of the whole list: this is the list's repr, so it still carries
# the brackets, quotes and commas of the list literal.
str_flat_only_text = repr(flat_only_text)
# ASCII punctuation characters, used by the cleaning and token filtering steps
punct = string.punctuation

In [11]:
# Patterns that rely on punctuation being present; they must run BEFORE the
# punctuation is stripped, otherwise they can never match.
_URL_RE = re.compile(r'http\S+')
_REPLY_REF_RE = re.compile(r'>>\d+')                 # board reply markers, e.g. >>4134775
_IMAGE_FILE_RE = re.compile(r'\w+\.(?:jpe?g|png)')   # image file names, e.g. Bishop.jpg
# en space, either as the real character or as the escaped text '\u2002'
# that appears when a list of strings is converted with str()/repr()
_EN_SPACE_RE = re.compile(r'\\u2002|\u2002')
# characters deleted outright: ASCII punctuation, the ellipsis character, tabs
_STRIP_TABLE = str.maketrans('', '', string.punctuation + '…\t')


def text_cleaning(text):
    """Strip URLs, reply markers, image file names and punctuation from *text*.

    Steps, in order:

    1. remove URLs (``http...`` up to the next whitespace);
    2. remove reply markers of the form ``>>12345``;
    3. remove image file names ending in ``.jpg``, ``.jpeg`` or ``.png``;
    4. replace en spaces (real or escaped as ``\\u2002``) with a normal space
       so neighbouring words are not glued together;
    5. delete ASCII punctuation, the ellipsis character and tabs.

    Steps 2-4 run before step 5 on purpose: they match on ``>``, ``.`` and
    the backslash, all of which step 5 deletes.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Case and digits are left unchanged.
    """
    text = _URL_RE.sub('', text)
    text = _REPLY_REF_RE.sub('', text)
    text = _IMAGE_FILE_RE.sub('', text)
    text = _EN_SPACE_RE.sub(' ', text)
    return text.translate(_STRIP_TABLE)

In [12]:
# Clean the stringified corpus, split it into word tokens, drop tokens that
# occur inside the punctuation string, then part-of-speech tag what is left.
cleaned_text = text_cleaning(str_flat_only_text)
raw_tokens = nltk.word_tokenize(cleaned_text)
tokens = [tok for tok in raw_tokens if tok not in punct]
pos_tags = nltk.pos_tag(tokens)

In [13]:
# Rebind cleaned_text to a list: every paragraph of every post, cleaned
# individually and kept in the original order.
cleaned_text = [
    text_cleaning(paragraph)
    for post_paragraphs in only_text
    for paragraph in post_paragraphs
]

'Q' is a stopword that would obscure the analysis of the text

In [21]:
# keep every cleaned paragraph except empty strings and the bare 'Q'
flat_cleaned_text = [entry for entry in cleaned_text if entry not in ('', 'Q')]

In [27]:
# Tokenise each cleaned paragraph, then part-of-speech tag each token list.
# Despite its name, `stopped` holds plain token lists: no stop-word
# filtering happens here.
stopped = list(map(nltk.word_tokenize, flat_cleaned_text))
tagged = list(map(nltk.pos_tag, stopped))

In [18]:
search('the Storm')

Unnamed: 0_level_0,Unnamed: 1_level_0,number,q_drop
date,hour,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-11-11,23:23:51,3581.0,"['""Calm Before the Storm."" - POTUS', ""Month/Day 'Q' public campaign initiated? "", ""Month/Day 'Durham' initiated?"", 'What famous crime family did Durham target?', '""Also spearheaded mob prosecutions of the [Gambino], Genovese and Patriarca crime families.""', 'http://content.time.com/time/nation/article/0,8599,1918738,00.html', 'What AB[C] agency did Durham target?', 'How are messages sent?', 'https://www.rollingstone.com/culture/culture-news/frank-cali-murder-mafia-boss-qanon-motive-anthony-comello-861777/', '[Dec 12 2018] ', '""What if there\'s another prosecutor (outside of DC) assigned by SESSIONS w/ the same mandate/authority?"" - Q', 'Do you believe in coincidences?', '5:5?', 'Be ready, Patriots.', 'Q']"
2018-12-06,23:56:20,2561.0,"['>>4186896https://thehill.com/opinion/white-house/420131-feds-received-whistleblower-evidence-in-2017-alleging-clinton-foundationRead carefully.Why is ""The Clinton Foundation"" back in the news?Q', 'https://thehill.com/opinion/white-house/420131-feds-received-whistleblower-evidence-in-2017-alleging-clinton-foundation', 'Read carefully.', 'Why is ""The Clinton Foundation"" back in the news?', 'Q', 'When did POTUS make the statement ""Calm Before the Storm?""', 'When was HUBER activated by SESSIONS?', 'Who was/is assigned to HUBER?', 'ACTING AG PRIMARY PURPOSE?', 'SCARAMUCCI MODEL?', 'PUBLIC OPINION (OPTICS) DO NOT MATTER.', 'What was leaked today (on purpose?)?', 'https://twitter.com/johnrobertsFox/status/1070749777334292481', 'HUBER to testify re: Clinton Foundation?', ""HUBER to reveal 'active' probe actively underway into organization?"", ""OIG to release report #2 [overview indicating many 'potentially criminal referrals' made]?"", '""We do not discuss active/ongoing DOJ / FBI investigations.""', 'MIL INTEL', 'FISA', 'THE WORLD IS WATCHING.', 'Q', 'https://thehill.com/opinion/white-house/420131-feds-received-whistleblower-evidence-in-2017-alleging-clinton-foundation', 'Read carefully.', 'Why is ""The Clinton Foundation"" back in the news?', 'Q']"
2018-12-03,22:41:45,2546.0,"['>>4134817>>4134775Okay. Is the plot moving forward? I think we all understand the characters and conflict at this point. Time for the plot twist? Declas, FINALLY?', '>>4134775', 'Okay. Is the plot moving forward? I think we all understand the characters and conflict at this point. Time for the plot twist? Declas, FINALLY?', 'The President of the United States initiated and confirmed the order when he stated ""The Calm Before the Storm.""', 'When was the statement made?', 'When did ""Q"" go active?', 'Watch the News.', 'Watch the FBI.', 'Watch the DOJ.', 'Q', '>>4134775', 'Okay. Is the plot moving forward? I think we all understand the characters and conflict at this point. Time for the plot twist? Declas, FINALLY?']"
2018-06-28,16:33:42,1621.0,"['We remember you, Mr. VIP! https://mobile.twitter.com/Q_ANONBaby/status/1012232994646581248 WWG1WGA! Where did the Storm derive from? Some things leave lasting impressions. Listen carefully. https://m.youtube.com/watch?v=B5T7Gr5oJbM&feature=youtu.be When did POTUS make the statement? When did we arrive to start the awakening? You have more than you know. Fireworks. Q ']"
2017-11-22,01:18:17,179.0,"['>>150398185>>150395774 >What stringer was provided (2) days prior to event? _Conf_D-TT_^_v891_0600_yes _green1_0600 Bunker Apple Yellow Sky [… + 1] confirm 0600 (time) yes Green 0600 (time) Base Green Yellow (condition yellow?) Air >What were the keywords in the stringer? confirm green Yellow Sky >Guide to reading map? legend, past provides the future, questions provide the answers >Lord d R. ++ target >Who was the pilot of the plane? Green >What was countered? Unknown to us >Who was on the ground (outside) shortly before the collision? ""Unnamed"" Rothschild >Who was in the home shortly before the collision? Unknown to us now, was ""dog grooming event"" >Learn to read the map. trying really hard, is like herding kittens in here sometimes >We may have overestimated your ability. you came to us for certain strengths but there are weaknesses as well, some being exploited not enough focus answer the questions build the big picture break it back down make memes for the normies to calm & educate so we\'ll be ready for the StormShadow war. Act II, Scene IV. (Movie idea – thoughts?) (Characters) Good guy (pilot of helicopter). Bad guy (pilot of plane). Targets (on ground and in home). (Story) Upon receipt of the ‘go’ code - Good guy flies during a blackout window provided by unknown agency w/ unknowns (ordinary people by the look of it) to a select location (re: highly classified mission) who was given the \'go\' order by \'x\' to execute (delivery – (3) for care_). Bad guy intercepts message due to rogue operator embedded in tactical observation unit and takes out Good guy by top down invisible attack. Mission failure. Encore: What has since occurred by Targets? Q']"
2017-11-02,18:44:21,55.0,"['Look to Twitter: Exactly this: ""My fellow Americans, the Storm is upon us......."" God bless.']"
