# 1. Scraping qdrops.online with BeautifulSoup and parsing its content
## 1.1 capturing
the text we want is contained within this tag hiearcharchy: 
- div message -> text -> string if not None & p 
- div message -> div op -> string 
- div message -> abbr title, abbr.text
- meta lar -> span: time, name, source, num

### prematurely calling text or get_text() on div_text will render unnecessary text, text we'd later have to clean

### tags to extract():
- hyperlinks: https?\S+\b, www, twitter, instagram, etc (inevitably will have to regex)
- a href
- figure
- figcaption
- images
- div op containing no text or string
- replace punct with a single space, then replace spaces longer than 1 space with a single space
- it also might make things easier tokenizing them before hand
### cleaning 
- sub hyperlinks
- lower text
- split text 
- sub punctuation
- sub digits 
- join split words back into string
- append strings to list if strings 
- return list of cleaned strings 
## Recurring Problems
### inconsistent tag use: br, p, text, abbr 
- many more tags could have been abbreviated or propertied with its value
- pickling exceeds maximum recursion; solved by sys.get and set a higher recursion limit
- runtime of requests is > 1 min: solved by loading the pickled object
- unwanted text from hyperlinks, figcaptions, etc; solved by using BeautifulSoup extract() on unwanted objects

In [1]:
import requests
import nltk
import os, sys
import itertools
import re, string
import pandas as pd
import pickle
import timeit

from string import punctuation, digits
from collections import Counter
from bs4 import BeautifulSoup, NavigableString, Tag
from string import punctuation, digits
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import RegexpParser, Tree
from nltk.util import ngrams

punctuation += str('’‘–…“”')
pd.set_option('display.max_colwidth', None)

We don't have to execute the following cell, we can skip to opening the pickled object

In [3]:
'''%%time
base_url = 'https://qposts.online/page/' 
urls = [base_url+str(i) for i in range(1, 105)]
page_requests = [requests.get(url) for url in urls]
soups = [BeautifulSoup(page.text, 'html.parser') for page in page_requests]
messages_original = [soups[i].findAll('div', 'message') for i in range(0, len(soups))]
messages_flat = list(itertools.chain.from_iterable(messages))
meta_lar = [soups[i].findAll('div', 'meta lar') for i in range(0, len(soups))]'''

"%%time\nbase_url = 'https://qposts.online/page/' \nurls = [base_url+str(i) for i in range(1, 105)]\npage_requests = [requests.get(url) for url in urls]\nsoups = [BeautifulSoup(page.text, 'html.parser') for page in page_requests]\nmessages_original = [soups[i].findAll('div', 'message') for i in range(0, len(soups))]\nmessages_flat = list(itertools.chain.from_iterable(messages))\nmeta_lar = [soups[i].findAll('div', 'meta lar') for i in range(0, len(soups))]"

In [2]:
os.getcwd()

'/Users/kylereaves/Documents/GitHub/parsing_madness'

In [3]:
index_df = pd.read_pickle('index_df.pkl')
index_df.number = index_df.number.astype('int')
date = [index_df.datetime[i].date() for i in range(0, len(index_df.datetime))]
time = [index_df.datetime[i].time() for i in range(0, len(index_df.datetime))]
dt_index = pd.MultiIndex.from_arrays([date, time])

In [4]:
%%time 
with open('messages_flat.pkl', 'rb') as f:
    messages = pickle.load(f)
with open('meta_flat.pkl', 'rb') as f:
    meta_lar = pickle.load(f)

CPU times: user 11.9 s, sys: 358 ms, total: 12.3 s
Wall time: 12.3 s


In [5]:
%%time
with open('names_joined.pkl', 'rb') as f:
    names = pickle.load(f)
with open('sources_joined.pkl', 'rb') as f:
    sources = pickle.load(f)

CPU times: user 1.84 ms, sys: 1.36 ms, total: 3.2 ms
Wall time: 8.14 ms


In [5]:
# if an item in Messages.split(i) is in replace_dict.keys(), replace i (the key) with its value
replace_dict = {' w ': 'with',
 '&': 'and',
 'AUTH': 'authorization',
 'BRENNAN': 'John Brennan',
 'COVID19': 'covid',
 'D': 'Democrats',
 'GOOG': 'Google',
 "Gov't": 'government',
 'HUSSEIN': 'Barack Obama',
 'ID': 'identification',
 'KERRY': 'John Kerry',
 "M's": 'marshalls',
 'MERKEL': 'Angela Merkel',
 'MS-13': 'ms thirteen',
 'MSM': 'mainstream media',
 'No Such Agency': 'NSA',
 'Russia>D': 'Russia Democrats',
 'SA': 'Saudi Arabia',
 'U S Gov t': 'United States government',
 'U.S.': 'United States',
 'US': 'United States',
 '[D]': 'Democratic',
 'comms': 'communications',
 "d's": 'democrats',
 'ds': 'democrats',
 'financial T': 'financial transactions',
 'r v d': 'republicans vs democrats',
 "r's": 'republicans',
 'rs': 'republicans',
 'v': 'versus',
 'v2': 'version two',
 'w/': 'with'}

In [6]:
class Spans:
    def __init__(self):
        pass

    def nums():
        nums = [meta_lar[i].find('span', 'num').get_text() for i in range(0, len(meta_lar))]
        return nums
                
    def sources():
        sources = [meta_lar[i].find('span', 'source').get_text() for i in range(0, len(meta_lar))]
        links = [meta_lar[i].find('span', 'source').contents[-1].get('href') for i in range(0, len(meta_lar))]
        return sources
    
    def names():
        names = [meta_lar[i].find('span', 'name').get_text() for i in range(0, len(meta_lar))]
        return names
                      
    def dates():
        date_list = [meta_lar[i].find('span', 'time').get_text()for i in range(0, len(meta_lar))]
        dt_idx = pd.to_datetime(date_list, origin='unix', unit='s')
        return dt_idx

In [72]:
class Messages(object):
    def __init__(self):
        pass

    def get(integer: int):
        msg_list = []

        for div_images in messages[integer].findAll('div', class_='images'):
            div_images.extract()
        for a_ref in messages[integer].findAll('a', class_='ref'):
            a_ref.extract()
        for a_href in messages[integer].findAll('a', class_='href'):
            a_ref.extract()
        for empty_line in messages[integer].findAll('p', class_='body-line empty'):
            empty_line.extract()
        for br_tag in messages[integer].findAll('br'):
            br_tag.replace_with(' ')

        for item in messages[integer]:
            if isinstance(item, NavigableString) and item.name is None:
                msg_list.append(item)
            if isinstance(item, Tag) and item.name == 'p':
                msg_list.append(item.string)
            if isinstance(item, Tag) and item.name == 'div' and item.attrs == {'class': ['text']}:
                msg_list.append(item.text)
            if isinstance(item, Tag) and item.name == 'abbr':
                msg_list.append(item.text)


        cleaned = [item for item in msg_list if item != ' ' and item is not None]
        cleaned2x = [item for item in cleaned if not item.startswith('https') and not item.startswith('in.') and not item.startswith('www')]
        return cleaned2x

    def dataframe(integer: int):
        df = pd.DataFrame({'type': [type(i) for i in messages[integer]],
                           'name': [i.name for i in messages[integer]],
                           'attrs': [i.attrs for i in messages[integer]],
                           'content': [i for i in messages[integer]]})
        return df

    def get_abbr(integer:int): 
        return Messages.dataframe(integer)[Messages.dataframe(integer).name == 'abbr']
    
    def info(integer:int):
        return index_df[index_df.number == integer]

    def sents(integer: int):
        return nltk.sent_tokenize(' '.join(Messages.get(integer))) 
      
    def joined(integer: int):
        return ' '.join(Messages.get(integer))
        
    def split(integer: int):
        return Messages.joined(integer).split()

### using string with p tags saves us subing hyperlinks
### add a conditional statement on the end to pop None from list

In [8]:
def flatten(integer: int):
    outter_list, cleaned_list = [], []
    for items in Messages.split(integer):
        if items.startswith('https') or items.startswith('in.') or items.startswith('www'):
            continue
        for char in items:
            if char == '/' or char == '-' or char == '][' or char == '>':
                items = re.sub(char, ' ', items)
        else:
            outter_list.append(items)

    for i in nltk.sent_tokenize(' '.join(outter_list)):
        i = re.sub('[%s]' % re.escape(string.punctuation), '', i)
        i = re.sub('[%s]' % re.escape(string.digits), '', i)
        cleaned_list.append(i)
    words = [nltk.word_tokenize(i.strip()) for i in cleaned_list]
    flat = list(itertools.chain.from_iterable(words))
    return flat

In [9]:
def clean(integer: int):
    cleaned = [re.sub('/', ' ', item) for item in Messages.get(integer)]
    cleaned = [re.sub('-', ' ', item) for item in cleaned]
    cleaned = [re.sub('\]\[', ' ', item) for item in cleaned]
    cleaned = [re.sub('[%s]' % re.escape(string.punctuation), '', item) for item in cleaned]
    cleaned = [re.sub('[%s]' % re.escape(string.digits), '', item) for item in cleaned]
    cleaned = [item.strip() for item in cleaned]
    cleaned = [item for item in cleaned if item]
    cleaned = [nltk.word_tokenize(item) for item in cleaned]
    cleaned = [' '.join(item) for item in cleaned]
    return cleaned

In [178]:
Messages.sents(2600)

['WELCOME TO THE DEMOCRAT PARTY.',
 'The Party of threats, violence, intimidation, name-calling, racism, fascism, division, ….. #WALKAWAY #VOTEREPUBLICAN Q']

In [156]:
Messages.get(2602)

['Look HERE [RUSSIA]',
 'DO NOT LOOK HERE [CHINA]',
 'Worth 43 minutes of your time.',
 'FAKE NEWS WILL NEVER REPORT.',
 'Important to understand going forward.',
 'FACTS MATTER.',
 'Q']

In [165]:
[(i.name, i.attrs) for i in messages[2].children]

[('div', {'class': ['text']})]

In [169]:
[i for i in messages[2].children]

[<div class="text"><p>Shall we play a game?</p><p>[N]othing [C]an [S]top [W]hat [I]s [C]oming</p><p>NCSWIC</p><p>https:<em>//</em>www.cisa.gov/safecom/NCSWIC</p><p>Who stepped down today [forced]?</p><p>https:<em>//</em>www.cisa.gov/bryan-s-ware</p><p>More coming?</p><p>Why is this relevant?</p><p>How do you 'show' the public the truth?</p><p>How do you 'safeguard' US elections post-POTUS?</p><p>How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? </p><p>It had to be this way.</p><p>Sometimes you must walk through the darkness before you see the light. </p><p>Q</p></div>]

In [175]:
messages[2].next_element.name, messages[2].next_element.attrs

('div', {'class': ['text']})

In [172]:
[i.string for i in messages[2].div if i.string is not None]

['Shall we play a game?',
 '[N]othing [C]an [S]top [W]hat [I]s [C]oming',
 'NCSWIC',
 'Who stepped down today [forced]?',
 'More coming?',
 'Why is this relevant?',
 "How do you 'show' the public the truth?",
 "How do you 'safeguard' US elections post-POTUS?",
 "How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? ",
 'It had to be this way.',
 'Sometimes you must walk through the darkness before you see the light. ',
 'Q']

In [17]:
for i in messages[10].div:
    print(i.name == 'p')

True
True
True
True
True
True
True
True
True
True


In [18]:
[i.name == 'p' for i in messages[10].div]

[True, True, True, True, True, True, True, True, True, True]

In [19]:
[i.name == 'p' for i in messages[10].div.children]

[True, True, True, True, True, True, True, True, True, True]

In [20]:
[i.string for i in messages[10].div if i.string is not None]

['Fact checkers created in effort to reinforce propaganda [digestion]?',
 'The battle to prevent truth from reaching the people.',
 'The battle to maintain and push division.',
 'Divided you are weak.',
 'Divided you fight each other.',
 'Divided you pose no threat.',
 'System of control.',
 'Information warfare.',
 'Q']

In [64]:
[i.string for i in messages[2].div if i.string is not None]

['Shall we play a game?',
 '[N]othing [C]an [S]top [W]hat [I]s [C]oming',
 'NCSWIC',
 'Who stepped down today [forced]?',
 'More coming?',
 'Why is this relevant?',
 "How do you 'show' the public the truth?",
 "How do you 'safeguard' US elections post-POTUS?",
 "How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? ",
 'It had to be this way.',
 'Sometimes you must walk through the darkness before you see the light. ',
 'Q']

In [73]:
Messages.dataframe(2)

Unnamed: 0,type,name,attrs,content
0,<class 'bs4.element.Tag'>,div,{'class': ['text']},"[[Shall we play a game?], [[N]othing [C]an [S]top [W]hat [I]s [C]oming], [NCSWIC], [https:, [//], www.cisa.gov/safecom/NCSWIC], [Who stepped down today [forced]?], [https:, [//], www.cisa.gov/bryan-s-ware], [More coming?], [Why is this relevant?], [How do you 'show' the public the truth?], [How do you 'safeguard' US elections post-POTUS?], [How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? ], [It had to be this way.], [Sometimes you must walk through the darkness before you see the light. ], [Q]]"


In [76]:
 # dropping rows with the index accessor if their name == 'br
 Messages.dataframe(3005).drop(Messages.dataframe(3005)[Messages.dataframe(3005).name == 'br'].index)

Unnamed: 0,type,name,attrs,content
0,<class 'bs4.element.Tag'>,div,{'class': ['text']},"[https:, [//], in.reuters.com/article/iran-economy-rouhani-sanctions/iran-parliament-censures-rouhani-in-sign-pragmatists-losing-sway-idINKCN1LD0DL, , [Hassan Rouhani], , Who paid HUSSEIN to attend HARVARD LAW SCHOOL?, , Who is Prince Alwaleed bin Talal?, , Why would Prince Alwaleed bin Talal (Saudi Royal) pay HUSSEIN to attend HARVARD LAW SCHOOL?, , Was HUSSEIN a prominent political figure or a person of influence at the time?, , No., , Who is Valerie Jarrett? , , Where was she born?, , When did Valerie Jarrett hire Michelle Robinson?, , 1991, , Timeline., , https:, [//], www.thisisinsider.com/how-did-barack-and-michelle-obama-meet-2017-10#1991-they-got-engaged-in-a-simple-and-sweet-way-3, , Who is Mayor (former) Richard Daley?, , Who is Mayor (current) Rahm Emanuel?, , HUSSEIN should be VERY nervous., , BRENNAN should be VERY nervous., , KERRY should be VERY nervous., , MERKEL should be VERY nervous., , +29, , How were the pallets of cash divided?, , How many planes were used to transport? , , Who operated the planes?, , What 'shadow' agency directed operations?, , Why wasn't the money [simply] wire transferred?, , US had AUTH to open bank-to-bank transfers., , How do you prevent financial T logs?, , How were the cash withdrawals in , [EU], categorized/labeled? , , Where did the cash originate from?, , What time of day did the withdrawals occur? , , Who provided SECURITY?, , Why wasn't Congress notified?, , Why was the U.S. Gov't kept in the DARK?, , US law broken?, , Did ALL planes land in the same location (airport)? , , Why did [1] particular plane land outside of Iran?, , Why was a helicopter involved?, , [WHO] did the money go to?, , HOW DO YOU AUDIT A FOREIGN AID BIG BLOCK TRANSFER?, , Did Rouhani keep 'unknown' comms as insurance?, , What agency collects ALL FORMS OF DATA?, , What agency did , [@Snowden], work for orig?, , Did he train on THE FARM?, , When did , [@Snowden], join No Such Agency?, , Define 'Contractor'., , ...]"


In [77]:
[p.string for p in messages[1].div.contents if p.string is not None]

['Durham.', 'Q']

In [78]:
messages[1].attrs == {'class': ['message']}

True

In [121]:
if messages[3005].attrs == {'class': ['message']}:
    for i in messages[3005]:
        print(nltk.sent_tokenize(i.text))

['https://in.reuters.com/article/iran-economy-rouhani-sanctions/iran-parliament-censures-rouhani-in-sign-pragmatists-losing-sway-idINKCN1LD0DL [Hassan Rouhani] Who paid HUSSEIN to attend HARVARD LAW SCHOOL?', 'Who is Prince Alwaleed bin Talal?', 'Why would Prince Alwaleed bin Talal (Saudi Royal) pay HUSSEIN to attend HARVARD LAW SCHOOL?', 'Was HUSSEIN a prominent political figure or a person of influence at the time?', 'No.', 'Who is Valerie Jarrett?', 'Where was she born?', 'When did Valerie Jarrett hire Michelle Robinson?', '1991 Timeline.', 'https://www.thisisinsider.com/how-did-barack-and-michelle-obama-meet-2017-10#1991-they-got-engaged-in-a-simple-and-sweet-way-3 Who is Mayor (former) Richard Daley?', 'Who is Mayor (current) Rahm Emanuel?', 'HUSSEIN should be VERY nervous.', 'BRENNAN should be VERY nervous.', 'KERRY should be VERY nervous.', 'MERKEL should be VERY nervous.', '+29 How were the pallets of cash divided?', 'How many planes were used to transport?', 'Who operated the 

In [122]:
Messages.get(3005)

[]

In [127]:
messages[3005].name == 'div' and messages[3005].attrs == {'class': ['message']}

True

In [128]:
messages[0].name == 'div' and messages[0].attrs == {'class': ['message']}

True

In [143]:
[i.attrs for i in messages[0].children]

[{'class': ['text']}]

In [79]:
clean(4800)

['For the coming days ahead Ask yourself an honest question why would a billionaire who has it all fame fortune a warm and loving family friends etc want to endanger himself and his family by becoming POTUS Why would he want to target himself and those he cares about Does he need money Does he need fame What does he get out of this Does he want to make the US world a better place for his family and for those good and decent people who have long been taken advantage of Perhaps he could not stomach the thought of mass murders occurring to satisfy Moloch Perhaps he could not stomach the thought of children being kidnapped drugged and raped while leaders law enforcement of the world turn a blind eye Perhaps he was tired of seeing how certain races countries were being constantly abused and kept in need poor and suffering all for a specific purpose Perhaps he could not in good conscious see the world burn Why hours after the election did seven people travel to an undisclosed location to hol

In [45]:
index_list = [i for i,e in enumerate(Messages.split(4800)) if e in replace_dict.keys()]

In [47]:
[e for i,e in enumerate(Messages.split(4800)) if i in index_list]

['&', 'MSM', 'ID', 'SA']

In [50]:
index_df[index_df.number == 4000]

Unnamed: 0,number,datetime,name,source
953,4000,2020-04-29 00:58:18,Q !!Hs1Jq13jV6,8kun/qresearch8953725


In [144]:
[i for i in Messages.sents(3009) if not i.startswith('https')]

['FISA (ABCs) v INSCOM NOFORN NSA INSCOM BRIDGE FISA = FISC Who is accountable to Congress (civilian body)?',
 "Define 'State Secrets'.",
 'Process of obtaining a FISA warrant?',
 'What must be DEMONSTRATED to be GRANTED?',
 'Who must SIGN OFF?',
 "Can 'select' individuals in senior positions of power be SHADOW BANNED from ACTIVE FISA WARRANTS / SURV?",
 'NAT SEC Sufficient evidence shown to demonstrate rogue elements of intelligence apparatus illegally violated FISA law (tenets) in coordinated effort w/ d+foreign allies to impact/mod the outcome of the 2016 Presidential election & safeguard against future uncover / criminal prosecution?',
 'Who must sign off on FISA warrants?',
 'Who directs the signers?',
 'Given magnitude of spy campaign (U.S. Presidential Election Republican Party Nominee Candidate + President Elect + President of the United States) would HUSSEIN be required to DIRECT ORDER?',
 'How would updates occur?',
 'PDB?',
 'Who has access to the PDB?',
 'On-site CLAS brief

In [145]:
'ID' in replace_dict.keys()

True

In [321]:
# sorted dict by creating a dict comprehension
{k:v for k,v in zip(sorted(replace_dict.keys()), [replace_dict.get(i) for i in sorted(replace_dict.keys())])}

{' w ': 'with',
 '&': 'and',
 'AUTH': 'authorization',
 'BRENNAN': 'John Brennan',
 'COVID19': 'covid',
 'D': 'Democrats',
 'GOOG': 'Google',
 "Gov't": 'government',
 'HUSSEIN': 'Barack Obama',
 'ID': 'identification',
 'KERRY': 'John Kerry',
 "M's": 'marshalls',
 'MERKEL': 'Angela Merkel',
 'MS-13': 'ms thirteen',
 'MSM': 'mainstream media',
 'No Such Agency': 'NSA',
 'Russia>D': 'Russia Democrats',
 'SA': 'Saudi Arabia',
 'U S Gov t': 'United States government',
 'U.S.': 'United States',
 'US': 'United States',
 '[D]': 'Democratic',
 'comms': 'communications',
 "d's": 'democrats',
 'ds': 'democrats',
 'financial T': 'financial transactions',
 'r v d': 'republicans vs democrats',
 "r's": 'republicans',
 'rs': 'republicans',
 'v': 'versus',
 'v2': 'version two',
 'w/': 'with'}

In [146]:
messages[-1].find('div', class_='op').text

'>>146981635 Hillary Clinton will be arrested between 7:45 AM - 8:30 AM EST on Monday - the morning on Oct 30, 2017.'

In [88]:
messages[-4].contents

[<div class="text">Some of us come here to drop crumbs, just crumbs.<br/><abbr title="President of the United States">POTUS</abbr> is 100% insulated - any discussion suggesting he’s even a target is false.<br/><abbr title="President of the United States">POTUS</abbr> will not be addressing nation on any of these issues as people begin to be indicted and must remain neutral for pure optical reasons. To suggest this is the plan is false and should be common sense.<br/>Focus on Military Intellingence/ State Secrets and why might that be used vs any three letter agency <br/>What <abbr title="Supreme Court, Special Counsel">SC</abbr> decision opened the door for a sitting President to activate - what must be showed?<br/>Why is <abbr title="President of the United States">POTUS</abbr> surrounded by generals ^^<br/>Again, there are a lot more good people than bad so have faith. This was a hostile takeover from an evil corrupt network of players (not just Democrats).<br/>Don’t fool yourself in

In [179]:
Messages.get(2601)

['Justice K >>> Highest Court in the Land.',
 'Law & Order [majority] [U.S. Constitution] safeguarded. ',
 'IT WAS OUR LAST CHANCE TO SAVE IT [Non-Force]',
 'Now comes the real PAIN.',
 'Now comes the real TRUTH.',
 'BOOM',
 '\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002BOOM',
 '\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002BOOM',
 '\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002BOOM',
 'They want you DIVIDED.',
 'DIVIDED by RACE.',
 'DIVIDED by RELIGION.',
 'DIVIDED by CULTURE.',
 'DIVIDED by CLASS.',
 'DIVIDED by POLITICAL AFFILIATION. ',
 'DIVIDED YOU ARE WEAK.',
 'TOGETHER YOU ARE STRONG.',
 'This movement challenges their ‘forced’ narrative. ',
 'This movement challenges people to not simply trust what is being reported.',
 'Research for yourself.',
 'Think for yourself.',
 'Trust yourself.',
 'This movement is not about one person o

In [180]:
clean(2600)

['WELCOME TO THE DEMOCRAT PARTY',
 'The Party of threats violence intimidation name calling racism fascism division …',
 'WALKAWAY',
 'VOTEREPUBLICAN',
 'Q']

In [150]:
clean(2)

['Shall we play a gameNothing Can Stop What Is ComingNCSWIChttps wwwcisagov safecom NCSWICWho stepped down today forcedhttps wwwcisagov bryan s wareMore comingWhy is this relevantHow do you show the public the truthHow do you safeguard US elections post POTUSHow do you remove foreign interference and corruption and install US owned voter ID laws and other safeguards It had to be this waySometimes you must walk through the darkness before you see the light Q']

In [461]:
[i.string for i in messages[2600]]

['WELCOME TO THE DEMOCRAT PARTY.',
 'The Party of threats, violence, intimidation, name-calling, racism, fascism, division, …..',
 '#WALKAWAY',
 '#VOTEREPUBLICAN',
 'Q']

In [181]:
Messages.get(2)

["Shall we play a game?[N]othing [C]an [S]top [W]hat [I]s [C]omingNCSWIChttps://www.cisa.gov/safecom/NCSWICWho stepped down today [forced]?https://www.cisa.gov/bryan-s-wareMore coming?Why is this relevant?How do you 'show' the public the truth?How do you 'safeguard' US elections post-POTUS?How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? It had to be this way.Sometimes you must walk through the darkness before you see the light. Q"]

In [182]:
[i.text for i in messages[1].div.contents]

['Durham.', 'Q']

In [100]:
[p.string for p in messages[0].div.contents]

[None, 'Q']

In [102]:
len(messages[4].div.contents)

4

In [103]:
[p.string for p in messages[2].div.contents if p.string is not None]

['Shall we play a game?',
 '[N]othing [C]an [S]top [W]hat [I]s [C]oming',
 'NCSWIC',
 'Who stepped down today [forced]?',
 'More coming?',
 'Why is this relevant?',
 "How do you 'show' the public the truth?",
 "How do you 'safeguard' US elections post-POTUS?",
 "How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? ",
 'It had to be this way.',
 'Sometimes you must walk through the darkness before you see the light. ',
 'Q']

In [571]:
[i for i in messages[2603].contents]

[<div class="images"><figure><figcaption>78c1a07f-8252-4e76-bb81-8bbd003aae79.jpg</figcaption></figure><figure><figcaption>Whidbey-Island-missile-Mystery-streak-of-light-is-spotted-over-Whidbey.jpg</figcaption></figure></div>,
 <p><div class="op" id="op972"><div class="images"><figure><figcaption>DVx_4xgVoAAYvkH.jpg</figcaption></figure></div><p>&gt;&gt;343</p><p></p><p>And so it begins..</p><p>How do you start a WAR?</p><p>[Markers] matter.</p><p>Playbook [FAIL].</p><p>Q</p></div></p>,
 <p>Post Justice K confirmation.</p>,
 <p>RUSSIA RUSSIA RUSSIA</p>,
 <p>Think <abbr title="United Kingdom">UK</abbr> / <abbr title="Australia">AUS</abbr> &gt;&gt; RUSSIA RUSSIA RUSSIA</p>,
 <p>DEFLECT DECLAS </p>,
 <p>DEFLECT BLAME</p>,
 <p>[FOCUS RUSSIA &gt;&gt;&gt; GLOBAL ENEMY]</p>,
 <p>"The Hunt For" dropped - why?</p>,
 <p>When was the unauthorized missile fired?</p>,
 <p>Was it found/discovered prior to [Hunt]?</p>,
 <p>RUSSIA SUB THREAT PUSH?</p>,
 <p>RED_OCTOBER</p>,
 <p>Double meanings exist.</

In [540]:
[i.text for i in messages[2602].contents]

['Look HERE [RUSSIA]',
 'DO NOT LOOK HERE [CHINA]',
 'https://www.youtube.com/watch?v=aeVrMniBjSc',
 'Worth 43 minutes of your time.',
 'FAKE NEWS WILL NEVER REPORT.',
 'Important to understand going forward.',
 'FACTS MATTER.',
 'Q']

In [104]:
messages[2602].name, messages[2602].attrs

('div', {'class': ['message']})

In [105]:
messages[-1].contents[0].attrs

{'class': ['text']}

In [106]:
Messages.get(2603)

['Post Justice K confirmation.',
 'RUSSIA RUSSIA RUSSIA',
 'DEFLECT DECLAS ',
 'DEFLECT BLAME',
 '[FOCUS RUSSIA >>> GLOBAL ENEMY]',
 '"The Hunt For" dropped - why?',
 'When was the unauthorized missile fired?',
 'Was it found/discovered prior to [Hunt]?',
 'RUSSIA SUB THREAT PUSH?',
 'RED_OCTOBER',
 'Double meanings exist.',
 '……………..',
 'Q']

In [593]:
[i.text for i in messages[2603].find('div').contents]

['>>343',
 '',
 'And so it begins..',
 'How do you start a WAR?',
 '[Markers] matter.',
 'Playbook [FAIL].',
 'Q']

In [107]:
messages[2603].find('div').attrs

{'id': 'op972', 'class': ['op']}

In [218]:
# this loop gets both the div op contents and the contents of the message which aren't in div op
if messages[2603].attrs == {'class': ['message']}:
    for p in messages[2603].contents:
        for div_op in p.findAll('div', class_='op'):
            for item in div_op.contents:
                print(item.string)
        if p.string is not None:
            print(p.string)

>>343
https://twitter.com/CNNPolitics/status/1048324715469783040
And so it begins..
How do you start a WAR?
[Markers] matter.
Playbook [FAIL].
Q
Post Justice K confirmation.
RUSSIA RUSSIA RUSSIA
DEFLECT DECLAS 
DEFLECT BLAME
[FOCUS RUSSIA >>> GLOBAL ENEMY]
"The Hunt For" dropped - why?
When was the unauthorized missile fired?
Was it found/discovered prior to [Hunt]?
RUSSIA SUB THREAT PUSH?
RED_OCTOBER
Double meanings exist.
……………..
Q


In [109]:
messages[2603].find('div', class_=['op'])

<div class="op" id="op972"><p>&gt;&gt;343</p><p><a href="https://twitter.com/CNNPolitics/status/1048324715469783040" rel="nofollow" target="_blank">https://twitter.com/CNNPolitics/status/1048324715469783040</a></p><p>And so it begins..</p><p>How do you start a WAR?</p><p>[Markers] matter.</p><p>Playbook [FAIL].</p><p>Q</p></div>

In [608]:
# if messages[i].attrs == {'class': ['op']}
messages[2603].attrs == {'class': ['op']}

True

In [111]:
[p.string for p in messages[2603].p.div.contents]

['>>343',
 'https://twitter.com/CNNPolitics/status/1048324715469783040',
 'And so it begins..',
 'How do you start a WAR?',
 '[Markers] matter.',
 'Playbook [FAIL].',
 'Q']

In [221]:
if messages[2601].attrs == {'class': ['message']}:
    for p in messages[2601].contents:
        for div_op in p.findAll('div', class_='op'):
            for item in div_op.contents:
                print(item.string)
        if p.string is not None:
            print(p.string)

Justice K >>> Highest Court in the Land.
Law & Order [majority] [U.S. Constitution] safeguarded. 
IT WAS OUR LAST CHANCE TO SAVE IT [Non-Force]
https://www.youtube.com/watch?v=G2qIXXafxCQ
Now comes the real PAIN.
Now comes the real TRUTH.
BOOM
        BOOM
                BOOM
                        BOOM
They want you DIVIDED.
DIVIDED by RACE.
DIVIDED by RELIGION.
DIVIDED by CULTURE.
DIVIDED by CLASS.
DIVIDED by POLITICAL AFFILIATION. 
DIVIDED YOU ARE WEAK.
TOGETHER YOU ARE STRONG.
This movement challenges their ‘forced’ narrative. 
This movement challenges people to not simply trust what is being reported.
Research for yourself.
Think for yourself.
Trust yourself.
This movement is not about one person or a group of people.
WE, the PEOPLE.
Save the Republic!
Hatred and Dissension in the Nation will Heal.
WHERE WE GO ONE, WE GO ALL.
Q


In [222]:
Messages.info(2601)

Unnamed: 0,number,datetime,name,source
2352,2601,2018-12-12 22:22:57,Q !!mG7VJxZNCI,8ch/qresearch


In [301]:
def alt_get(integer: int):
    inner = []
    if messages[integer].attrs == {'class': ['message']}:
        for p in messages[integer].contents:
            for div_op in p.findAll('div', class_='op'):
                for item in div_op.contents:
                    inner.append(item.string)
            if p.string is not None:
                inner.append(p.string)

    if messages[integer].name == 'div' and messages[integer].next_element.attrs == {'class': ['text']}:
        for p_tag in messages[integer].div.findAll('p'):
            if p_tag.string is not None:
                inner.append(p_tag.string)
    
    if messages[integer].next_element.attrs == {'class': ['text']}:
        if 'p' not in [i.name for i in messages[integer].next_element.children]:
            inner.append(messages[integer].text)

    return inner

In [307]:
[i.name for i in messages[-1].next_element.children]

['div', None, 'abbr', None, 'abbr', None, 'abbr', None]

In [303]:
alt_get(-1)

['>>146981635',
 ' ',
 'Hillary Clinton will be arrested between 7:45 AM - 8:30 AM EST on Monday - the morning on Oct 30, 2017.',
 '>>146981635 Hillary Clinton will be arrested between 7:45 AM - 8:30 AM EST on Monday - the morning on Oct 30, 2017. HRC extradition already in motion effective yesterday with several countries in case of cross border run. Passport approved to be flagged effective 10/30 @ 12:01am. Expect massive riots organized in defiance and others fleeing the US to occur. US M’s will conduct the operation while NG activated. Proof check: Locate a NG member and ask if activated for duty 10/30 across most major cities.']

In [302]:
alt_get(2601)

['Justice K >>> Highest Court in the Land.',
 'Law & Order [majority] [U.S. Constitution] safeguarded. ',
 'IT WAS OUR LAST CHANCE TO SAVE IT [Non-Force]',
 'https://www.youtube.com/watch?v=G2qIXXafxCQ',
 'Now comes the real PAIN.',
 'Now comes the real TRUTH.',
 'BOOM',
 '\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002BOOM',
 '\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002BOOM',
 '\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002BOOM',
 'They want you DIVIDED.',
 'DIVIDED by RACE.',
 'DIVIDED by RELIGION.',
 'DIVIDED by CULTURE.',
 'DIVIDED by CLASS.',
 'DIVIDED by POLITICAL AFFILIATION. ',
 'DIVIDED YOU ARE WEAK.',
 'TOGETHER YOU ARE STRONG.',
 'This movement challenges their ‘forced’ narrative. ',
 'This movement challenges people to not simply trust what is being reported.',
 'Research for yourself.',
 'Think for yourself.',
 'Trust yours

In [250]:
messages[2].name == 'div' and messages[2].next_element.attrs == {'class': ['text']}

True

In [269]:
[p.string for p in messages[-1].div if p.string is not None]

[' ',
 'HRC',
 ' extradition already in motion effective yesterday with several countries in case of cross border run. Passport approved to be flagged effective 10/30 @ 12:01am. Expect massive riots organized in defiance and others fleeing the US to occur. US M’s will conduct the operation while ',
 'NG',
 ' activated. Proof check: Locate a ',
 'NG',
 ' member and ask if activated for duty 10/30 across most major cities.']

In [277]:
'p' in [i.name for i in messages[-1].next_element.children]

False

In [310]:
if messages[2600].next_element.attrs == {'class': ['text']}:
    if 'p' in [i.name for i in messages[-1].next_element.children]:
        print(messages[-1].text)
    if 'p' not in [i.name for i in messages[-1].next_element.children]:
        print(messages[-1].text)

In [331]:
if messages[2600].attrs == {'class': ['message']}:
    if 'p' in [i.name for i in messages[2600].children]:
        for p_tag in messages[2600].findAll('p'):
            print(p_tag.string)

WELCOME TO THE DEMOCRAT PARTY.
The Party of threats, violence, intimidation, name-calling, racism, fascism, division, …..
#WALKAWAY
#VOTEREPUBLICAN
Q
