# 1. Scraping qdrops.online with BeautifulSoup and parsing its content
## 1.1 capturing
the text we want is contained within this tag hiearcharchy: 
- div message -> text -> string if not None & p 
- div message -> div op -> string 
- div message -> abbr title, abbr.text
- meta lar -> span: time, name, source, num

### prematurely calling text or get_text() on div_text will render unnecessary text, text we'd later have to clean

### tags to extract():
- hyperlinks: https?\S+\b, www, twitter, instagram, etc (inevitably will have to regex)
- a href
- figure
- figcaption
- images
- div op containing no text or string
- replace punct with a single space, then replace spaces longer than 1 space with a single space
- it also might make things easier tokenizing them before hand
### cleaning 
- sub hyperlinks
- lower text
- split text 
- sub punctuation
- sub digits 
- join split words back into string
- append strings to list if strings 
- return list of cleaned strings 
## Recurring Problems
### inconsistent tag use: br, p, text, abbr 
- many more tags could have been abbreviated or propertied with its value
- pickling exceeds maximum recursion; solved by sys.get and set a higher recursion limit
- runtime of requests is > 1 min: solved by loading the pickled object
- unwanted text from hyperlinks, figcaptions, etc; solved by using BeautifulSoup extract() on unwanted objects

In [1]:
import requests
import nltk
import os, sys
import itertools
import re, string
import pandas as pd
import pickle

from string import punctuation, digits
from collections import Counter
from bs4 import BeautifulSoup, NavigableString, Tag
from string import punctuation, digits
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

punctuation += str('’‘–…“”')
pd.set_option('display.max_colwidth', None)

We don't have to execute the following cell, we can skip to opening the pickled object

In [3]:
'''%%time
base_url = 'https://qposts.online/page/' 
urls = [base_url+str(i) for i in range(1, 105)]
page_requests = [requests.get(url) for url in urls]
soups = [BeautifulSoup(page.text, 'html.parser') for page in page_requests]
messages_original = [soups[i].findAll('div', 'message') for i in range(0, len(soups))]
messages_flat = list(itertools.chain.from_iterable(messages))
meta_lar = [soups[i].findAll('div', 'meta lar') for i in range(0, len(soups))]'''

"%%time\nbase_url = 'https://qposts.online/page/' \nurls = [base_url+str(i) for i in range(1, 105)]\npage_requests = [requests.get(url) for url in urls]\nsoups = [BeautifulSoup(page.text, 'html.parser') for page in page_requests]\nmessages_original = [soups[i].findAll('div', 'message') for i in range(0, len(soups))]\nmessages_flat = list(itertools.chain.from_iterable(messages))\nmeta_lar = [soups[i].findAll('div', 'meta lar') for i in range(0, len(soups))]"

In [2]:
index_df = pd.read_pickle('index_df.pkl')
index_df.number = index_df.number.astype('int')
date = [index_df.datetime[i].date() for i in range(0, len(index_df.datetime))]
time = [index_df.datetime[i].time() for i in range(0, len(index_df.datetime))]
dt_index = pd.MultiIndex.from_arrays([date, time])

In [3]:
%%time 
with open('messages_flat.pkl', 'rb') as f:
    messages = pickle.load(f)
with open('meta_flat.pkl', 'rb') as f:
    meta_lar = pickle.load(f)

CPU times: user 12 s, sys: 400 ms, total: 12.4 s
Wall time: 12.5 s


In [4]:
%%time
with open('names_joined.pkl', 'rb') as f:
    names = pickle.load(f)
with open('sources_joined.pkl', 'rb') as f:
    sources = pickle.load(f)
with open('flat_df_list.pkl', 'rb') as f:
    flat_df_list = pickle.load(f)

CPU times: user 6.95 s, sys: 150 ms, total: 7.1 s
Wall time: 7.13 s


In [5]:
for message in messages:
    for a_ref in message.findAll('a', class_='ref'):
        a_ref.extract()
    for images in message.findAll('div', class_='images'):
        images.extract()
    for br_tags in message.findAll('br'):
        br_tags.extract()

In [6]:
# if an item in Messages.split(i) is in replace_dict.keys(), replace i (the key) with its value
replace_dict = {' w ': 'with',
 '&': 'and',
 'AUTH': 'authorization',
 'BRENNAN': 'John Brennan',
 'COVID19': 'covid',
 'D': 'Democrats',
 'GOOG': 'Google',
 "Gov't": 'government',
 'HUSSEIN': 'Barack Obama',
 'ID': 'identification',
 'KERRY': 'John Kerry',
 "M's": 'marshalls',
 'MERKEL': 'Angela Merkel',
 'MS-13': 'ms thirteen',
 'MSM': 'mainstream media',
 'No Such Agency': 'NSA',
 'Russia>D': 'Russia Democrats',
 'SA': 'Saudi Arabia',
 'U S Gov t': 'United States government',
 'U.S.': 'United States',
 'US': 'United States',
 '[D]': 'Democratic',
 'comms': 'communications',
 "d's": 'democrats',
 'ds': 'democrats',
 'financial T': 'financial transactions',
 'r v d': 'republicans vs democrats',
 "r's": 'republicans',
 'rs': 'republicans',
 'v': 'versus',
 'v2': 'version two',
 'w/': 'with'}

In [7]:
class Spans:
    def __init__(self):
        pass

    def nums():
        nums = [meta_lar[i].find('span', 'num').get_text() for i in range(0, len(meta_lar))]
        return nums
                
    def sources():
        sources = [meta_lar[i].find('span', 'source').get_text() for i in range(0, len(meta_lar))]
        links = [meta_lar[i].find('span', 'source').contents[-1].get('href') for i in range(0, len(meta_lar))]
        return sources
    
    def names():
        names = [meta_lar[i].find('span', 'name').get_text() for i in range(0, len(meta_lar))]
        return names
                      
    def dates():
        date_list = [meta_lar[i].find('span', 'time').get_text()for i in range(0, len(meta_lar))]
        dt_idx = pd.to_datetime(date_list, origin='unix', unit='s')
        return dt_idx

In [8]:
class Messages(object):
    def __init__(self):
        pass

    def get(integer: int):
        msg_list = []

        for div_images in messages[integer].findAll('div', class_='images'):
            div_images.extract()
        for a_ref in messages[integer].findAll('a', class_='ref'):
            a_ref.extract()
        for a_href in messages[integer].findAll('a', class_='href'):
            a_ref.extract()
        for empty_line in messages[integer].findAll('p', class_='body-line empty'):
            empty_line.extract()
        for br_tag in messages[integer].findAll('br'):
            br_tag.replace_with(' ')

        for item in messages[integer]:
            if isinstance(item, NavigableString) and item.name is None:
                msg_list.append(item)
            if isinstance(item, Tag) and item.name == 'p':
                msg_list.append(item.string)
            if isinstance(item, Tag) and item.name == 'div' and item.attrs == {'class': ['text']}:
                msg_list.append(item.text)
            if isinstance(item, Tag) and item.name == 'abbr':
                msg_list.append(item.text)


        cleaned = [item for item in msg_list if item != ' ' and item is not None]
        cleaned2x = [item for item in cleaned if not item.startswith('https') and not item.startswith('in.') and not item.startswith('www')]
        return cleaned2x

    def dataframe(integer: int):
        df = pd.DataFrame({'type': [type(i) for i in messages[integer]],
                           'name': [i.name for i in messages[integer]],
                           'attrs': [i.attrs for i in messages[integer]],
                           'content': [i for i in messages[integer]]})
        return df

    def get_abbr(integer:int): 
        return Messages.dataframe(integer)[Messages.dataframe(integer).name == 'abbr']
    
    def info(integer:int):
        return index_df[index_df.number == integer]

    def sents(integer: int):
        return nltk.sent_tokenize(' '.join(Messages.get(integer))) 
      
    def joined(integer: int):
        return ' '.join(Messages.get(integer))
        
    def split(integer: int):
        return Messages.joined(integer).split()
    
    def children(integer: int): 
        name = messages[integer].div.name
        attr = messages[integer].div.attrs
        parent = name, attr
        children = [i.name for i in messages[integer].div.children]
        return parent, children

In [12]:
'''def clean(integer: str):
    cleaned = [re.sub('/', ' ', item) for item in integer]
    cleaned = [re.sub('-', ' ', item) for item in cleaned]
    cleaned = [re.sub('\]\[', ' ', item) for item in cleaned]
    cleaned = [re.sub('[%s]' % re.escape(string.punctuation), '', item) for item in cleaned]
    cleaned = [re.sub('[%s]' % re.escape(string.digits), '', item) for item in cleaned]
    cleaned = [item.strip() for item in cleaned]
    cleaned = [item for item in cleaned if item]
    #cleaned = [nltk.word_tokenize(item) for item in cleaned]
    #cleaned = [' '.join(item) for item in cleaned]
    return cleaned'''

In [13]:
'''def get(integer: int):
    inner_list = []
    if messages[integer].attrs == {'class': ['message']}:
     
        for item in messages[integer].contents:
       
            for div_op in item.findAll('div', class_='op'):
                for itm in div_op.contents:
                    if itm not in inner_list:
                        inner_list.append(itm.string)
            for abbr_tag in item.findAll('abbr'):
                abbr_tag.replace_with(abbr_tag.text)
        if messages[integer].div.name == 'div' and messages[integer].div.attrs == {'class': ['text']}:
            if messages[integer].div.next_element.name == 'p':
                for p_tag in messages[integer].div.findAll('p'):
                    if p_tag.string is not None:
                        inner_list.append(p_tag.string)
            
            else:
                for content in item.contents:
                    if isinstance(content, NavigableString):
                        if content in inner_list:
                            continue
                        else:
                            inner_list.append(content)

    return inner_list'''              

In [9]:
def alt_get(integer: int):
    inner = []
    for items in messages[integer]:
        # creating this exception made the function iterable
        if isinstance(items, NavigableString):
            continue
        else:
            for div_op in items.findAll('div', class_='op'):
                for ops_content in div_op.contents:  
                    inner.append(ops_content.string)
            else:
                if items.string is not None:
                    inner.append(items.string)
                else:
                    for item_content in items.contents:
                        if item_content.string is not None:
                            inner.append(item_content.string)
    cleaned = [item for item in inner if item and item != ' ']
    return cleaned 

In [56]:
def clean_string(integer: int):
    inner = []
    for line in alt_get(integer):
        if line.startswith('https'):
            continue
        else:
            cleaned = re.sub('/', ' ', line)
            cleaned = re.sub('-', ' ', cleaned)
            cleaned = re.sub('_', ' ', cleaned)
            cleaned = re.sub('\]\[', ' ', cleaned)
            cleaned = re.sub('…', '', cleaned)
            cleaned = re.sub('[%s]' % re.escape(string.punctuation), '', cleaned)
            cleaned = re.sub('[%s]' % re.escape(string.digits), '', cleaned)
            inner.append(cleaned.strip())
    stripped = [item for item in inner if item]
    words = [nltk.word_tokenize(word) for word in stripped]
    joined = [' '.join(inner_list) for inner_list in words]
    return joined

In [54]:
clean_string(2624)

['DOITQ',
 'honored at the you Q we are at the ready and the anon morale is very very high lets do this',
 'UNITED WE STAND',
 'STRONG TOGETHER',
 'We are honored',
 'ANONS are selfless nameless faceless fameless and FEARLESS',
 'We are grateful',
 'Q']

In [63]:
flat_cleaned_list = list(itertools.chain.from_iterable(pd.DataFrame({'cleaned_strings': [clean_string(i) for i in range(0, len(messages))]})['cleaned_strings'].tolist()))

In [64]:
with open('flat_cleaned_list.pkl', 'wb') as f:
    pickle.dump(flat_cleaned_list, f, pickle.HIGHEST_PROTOCOL)

In [41]:
Messages.info(2607)

Unnamed: 0,number,datetime,name,source
2344,2609,2018-12-12 23:43:49,Q !!mG7VJxZNCI,8ch/qresearch


# as string 

In [22]:
' '.join([' '.join(nested_list) for nested_list in clean_string(4952)])

'Hillary Clinton will be arrested between AM AM EST on Monday the morning on Oct HRC extradition already in motion effective yesterday with several countries in case of cross border run Passport approved to be flagged effective am Expect massive riots organized in defiance and others fleeing the US to occur US M ’ s will conduct the operation while NG activated Proof check Locate a NG member and ask if activated for duty across most major cities'

In [536]:
for items in messages[2603]:
    for div_op in items.findAll('div', class_='op'):
        for ops_content in div_op.contents:  
            print(ops_content.string)
    else:
        if items.string is not None:
            print(items.string)

>>343
https://twitter.com/CNNPolitics/status/1048324715469783040
And so it begins..
How do you start a WAR?
[Markers] matter.
Playbook [FAIL].
Q
Post Justice K confirmation.
RUSSIA RUSSIA RUSSIA
DEFLECT DECLAS 
DEFLECT BLAME
[FOCUS RUSSIA >>> GLOBAL ENEMY]
"The Hunt For" dropped - why?
When was the unauthorized missile fired?
Was it found/discovered prior to [Hunt]?
RUSSIA SUB THREAT PUSH?
RED_OCTOBER
Double meanings exist.
……………..
Q


In [63]:
#with open('flat_df_list.pkl', 'wb') as f:
#    pickle.dump(flat_df['drop_contents'].tolist(), f, pickle.HIGHEST_PROTOCOL)

In [64]:
%%time 
with open('flat_df_list.pkl', 'rb') as f:
    flat_df_list = pickle.load(f)

CPU times: user 5.96 s, sys: 150 ms, total: 6.11 s
Wall time: 6.13 s


In [28]:
Messages.get(3000)

['>>2770076Double meanings exist.QStay LOCAL (U.S.)GLOBAL = reflection of LOCAL.Think MIRROR.Know your enemy."Every battle is won before it\'s ever fought."Knowledge is POWER.Q']

In [17]:
# implemention if there are p strings in div if div == message or div == text
[i.string for i in messages[10].div.children if i.string is not None]

['Fact checkers created in effort to reinforce propaganda [digestion]?',
 'The battle to prevent truth from reaching the people.',
 'The battle to maintain and push division.',
 'Divided you are weak.',
 'Divided you fight each other.',
 'Divided you pose no threat.',
 'System of control.',
 'Information warfare.',
 'Q']

In [18]:
[i.name == 'p' for i in messages[10].div]

[True, True, True, True, True, True, True, True, True, True]

In [19]:
[i.name == 'p' for i in messages[10].div.children]

[True, True, True, True, True, True, True, True, True, True]

In [59]:
[i.string for i in messages[3600].div if i.string is not None]

['>>1393269', 'Well done, ', 'Anon', '.', 'Q']

In [450]:
[i for i in messages[3600].div.contents]

[<div class="op" id="op235">https:<em>//</em>nationalsecurityaction.org/who-we-are/We are Americans—former senior officials and policy experts, academics and civil society leaders—who have seen first-hand how the United States is stronger, safer and more respected in the world when we stand strong with our allies, pursue principled diplomacy, and stay true to the values that have long defined America at home and abroad.</div>,
 'Well done, ',
 'Anon',
 '.',
 'Q']

In [67]:
for div_op in messages[3600].findAll('div', class_='op'):
    for images in div_op.findAll('div', class_='images'):
        images.extract()

In [69]:
for a_ref in messages[3600].findAll('a', 'ref'):
    a_ref.extract()

In [451]:
[i.string for i in messages[3600].div.div.contents]

['https:',
 '//',
 'nationalsecurityaction.org/who-we-are/',
 'We are Americans—former senior officials and policy experts, academics and civil society leaders—who have seen first-hand how the United States is stronger, safer and more respected in the world when we stand strong with our allies, pursue principled diplomacy, and stay true to the values that have long defined America at home and abroad.']

In [452]:
[i.string for i in messages[2].div if i.string is not None]

['Shall we play a game?',
 '[N]othing [C]an [S]top [W]hat [I]s [C]oming',
 'NCSWIC',
 'Who stepped down today [forced]?',
 'More coming?',
 'Why is this relevant?',
 "How do you 'show' the public the truth?",
 "How do you 'safeguard' US elections post-POTUS?",
 "How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? ",
 'It had to be this way.',
 'Sometimes you must walk through the darkness before you see the light. ',
 'Q']

In [453]:
[i.text for i in messages[3005].contents]

["https://in.reuters.com/article/iran-economy-rouhani-sanctions/iran-parliament-censures-rouhani-in-sign-pragmatists-losing-sway-idINKCN1LD0DL [Hassan Rouhani] Who paid HUSSEIN to attend HARVARD LAW SCHOOL? Who is Prince Alwaleed bin Talal? Why would Prince Alwaleed bin Talal (Saudi Royal) pay HUSSEIN to attend HARVARD LAW SCHOOL? Was HUSSEIN a prominent political figure or a person of influence at the time? No. Who is Valerie Jarrett?  Where was she born? When did Valerie Jarrett hire Michelle Robinson? 1991 Timeline. https://www.thisisinsider.com/how-did-barack-and-michelle-obama-meet-2017-10#1991-they-got-engaged-in-a-simple-and-sweet-way-3 Who is Mayor (former) Richard Daley? Who is Mayor (current) Rahm Emanuel? HUSSEIN should be VERY nervous. BRENNAN should be VERY nervous. KERRY should be VERY nervous. MERKEL should be VERY nervous. +29 How were the pallets of cash divided? How many planes were used to transport?  Who operated the planes? What 'shadow' agency directed operations?

In [49]:
[i for i in Messages.sents(3009) if not i.startswith('https')]

['FISA (ABCs) v INSCOM NOFORN NSA INSCOM BRIDGE FISA = FISC Who is accountable to Congress (civilian body)?',
 "Define 'State Secrets'.",
 'Process of obtaining a FISA warrant?',
 'What must be DEMONSTRATED to be GRANTED?',
 'Who must SIGN OFF?',
 "Can 'select' individuals in senior positions of power be SHADOW BANNED from ACTIVE FISA WARRANTS / SURV?",
 'NAT SEC Sufficient evidence shown to demonstrate rogue elements of intelligence apparatus illegally violated FISA law (tenets) in coordinated effort w/ d+foreign allies to impact/mod the outcome of the 2016 Presidential election & safeguard against future uncover / criminal prosecution?',
 'Who must sign off on FISA warrants?',
 'Who directs the signers?',
 'Given magnitude of spy campaign (U.S. Presidential Election Republican Party Nominee Candidate + President Elect + President of the United States) would HUSSEIN be required to DIRECT ORDER?',
 'How would updates occur?',
 'PDB?',
 'Who has access to the PDB?',
 'On-site CLAS brief

In [526]:
# sorted dict by creating a dict comprehension
{k:v for k,v in zip(sorted(replace_dict.keys()), [replace_dict.get(i) for i in sorted(replace_dict.keys())])}

{' w ': 'with',
 '&': 'and',
 'AUTH': 'authorization',
 'BRENNAN': 'John Brennan',
 'COVID19': 'covid',
 'D': 'Democrats',
 'GOOG': 'Google',
 "Gov't": 'government',
 'HUSSEIN': 'Barack Obama',
 'ID': 'identification',
 'KERRY': 'John Kerry',
 "M's": 'marshalls',
 'MERKEL': 'Angela Merkel',
 'MS-13': 'ms thirteen',
 'MSM': 'mainstream media',
 'No Such Agency': 'NSA',
 'Russia>D': 'Russia Democrats',
 'SA': 'Saudi Arabia',
 'U S Gov t': 'United States government',
 'U.S.': 'United States',
 'US': 'United States',
 '[D]': 'Democratic',
 'comms': 'communications',
 "d's": 'democrats',
 'ds': 'democrats',
 'financial T': 'financial transactions',
 'r v d': 'republicans vs democrats',
 "r's": 'republicans',
 'rs': 'republicans',
 'v': 'versus',
 'v2': 'version two',
 'w/': 'with'}

In [145]:
for i in messages[-4].contents:
    if i.attrs == {'class': ['text']}:
        print([i.string for i in messages[-4].div.contents if i.string is not None])

['Some of us come here to drop crumbs, just crumbs.', 'POTUS', ' is 100% insulated - any discussion suggesting he’s even a target is false.', 'POTUS', ' will not be addressing nation on any of these issues as people begin to be indicted and must remain neutral for pure optical reasons. To suggest this is the plan is false and should be common sense.', 'Focus on Military Intellingence/ State Secrets and why might that be used vs any three letter agency ', 'What ', 'SC', ' decision opened the door for a sitting President to activate - what must be showed?', 'Why is ', 'POTUS', ' surrounded by generals ^^', 'Again, there are a lot more good people than bad so have faith. This was a hostile takeover from an evil corrupt network of players (not just Democrats).', 'Don’t fool yourself into thinking Obama, Soros, Roth’s, Clinton’s etc have more power present day than ', 'POTUS', '. ', 'Operation Mockingbird ', 'Patriots are in control. Sit back and enjoy the show.']


In [146]:
[i.string for i in messages[-4].div.contents if i.string is not None]

['Some of us come here to drop crumbs, just crumbs.',
 'POTUS',
 ' is 100% insulated - any discussion suggesting he’s even a target is false.',
 'POTUS',
 ' will not be addressing nation on any of these issues as people begin to be indicted and must remain neutral for pure optical reasons. To suggest this is the plan is false and should be common sense.',
 'Focus on Military Intellingence/ State Secrets and why might that be used vs any three letter agency ',
 'What ',
 'SC',
 ' decision opened the door for a sitting President to activate - what must be showed?',
 'Why is ',
 'POTUS',
 ' surrounded by generals ^^',
 'Again, there are a lot more good people than bad so have faith. This was a hostile takeover from an evil corrupt network of players (not just Democrats).',
 'Don’t fool yourself into thinking Obama, Soros, Roth’s, Clinton’s etc have more power present day than ',
 'POTUS',
 '. ',
 'Operation Mockingbird ',
 'Patriots are in control. Sit back and enjoy the show.']

In [179]:
Messages.get(2601)

['Justice K >>> Highest Court in the Land.',
 'Law & Order [majority] [U.S. Constitution] safeguarded. ',
 'IT WAS OUR LAST CHANCE TO SAVE IT [Non-Force]',
 'Now comes the real PAIN.',
 'Now comes the real TRUTH.',
 'BOOM',
 '\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002BOOM',
 '\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002BOOM',
 '\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002\u2002BOOM',
 'They want you DIVIDED.',
 'DIVIDED by RACE.',
 'DIVIDED by RELIGION.',
 'DIVIDED by CULTURE.',
 'DIVIDED by CLASS.',
 'DIVIDED by POLITICAL AFFILIATION. ',
 'DIVIDED YOU ARE WEAK.',
 'TOGETHER YOU ARE STRONG.',
 'This movement challenges their ‘forced’ narrative. ',
 'This movement challenges people to not simply trust what is being reported.',
 'Research for yourself.',
 'Think for yourself.',
 'Trust yourself.',
 'This movement is not about one person o

In [212]:
if messages[2600].children:
    print([i.string for i in messages[2600].children])

['WELCOME TO THE DEMOCRAT PARTY.', 'The Party of threats, violence, intimidation, name-calling, racism, fascism, division, …..', '#WALKAWAY', '#VOTEREPUBLICAN', 'Q']


In [213]:
[i.string for i in messages[2600]]

['WELCOME TO THE DEMOCRAT PARTY.',
 'The Party of threats, violence, intimidation, name-calling, racism, fascism, division, …..',
 '#WALKAWAY',
 '#VOTEREPUBLICAN',
 'Q']

In [463]:
Messages.get(2)

["Shall we play a game?[N]othing [C]an [S]top [W]hat [I]s [C]omingNCSWIChttps://www.cisa.gov/safecom/NCSWICWho stepped down today [forced]?https://www.cisa.gov/bryan-s-wareMore coming?Why is this relevant?How do you 'show' the public the truth?How do you 'safeguard' US elections post-POTUS?How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? It had to be this way.Sometimes you must walk through the darkness before you see the light. Q"]

In [464]:
[p.string for p in messages[2].div.contents if p.string is not None]

['Shall we play a game?',
 '[N]othing [C]an [S]top [W]hat [I]s [C]oming',
 'NCSWIC',
 'Who stepped down today [forced]?',
 'More coming?',
 'Why is this relevant?',
 "How do you 'show' the public the truth?",
 "How do you 'safeguard' US elections post-POTUS?",
 "How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? ",
 'It had to be this way.',
 'Sometimes you must walk through the darkness before you see the light. ',
 'Q']

In [465]:
[i for i in messages[2603].contents]

[<p><div class="op" id="op972"><p>&gt;&gt;343</p><p><a href="https://twitter.com/CNNPolitics/status/1048324715469783040" rel="nofollow" target="_blank">https://twitter.com/CNNPolitics/status/1048324715469783040</a></p><p>And so it begins..</p><p>How do you start a WAR?</p><p>[Markers] matter.</p><p>Playbook [FAIL].</p><p>Q</p></div></p>,
 <p>Post Justice K confirmation.</p>,
 <p>RUSSIA RUSSIA RUSSIA</p>,
 <p>Think UK / AUS &gt;&gt; RUSSIA RUSSIA RUSSIA</p>,
 <p>DEFLECT DECLAS </p>,
 <p>DEFLECT BLAME</p>,
 <p>[FOCUS RUSSIA &gt;&gt;&gt; GLOBAL ENEMY]</p>,
 <p>"The Hunt For" dropped - why?</p>,
 <p>When was the unauthorized missile fired?</p>,
 <p>Was it found/discovered prior to [Hunt]?</p>,
 <p>RUSSIA SUB THREAT PUSH?</p>,
 <p>RED_OCTOBER</p>,
 <p>Double meanings exist.</p>,
 <p>……………..</p>,
 <p>Q</p>]

In [540]:
[i.text for i in messages[2602].contents]

['Look HERE [RUSSIA]',
 'DO NOT LOOK HERE [CHINA]',
 'https://www.youtube.com/watch?v=aeVrMniBjSc',
 'Worth 43 minutes of your time.',
 'FAKE NEWS WILL NEVER REPORT.',
 'Important to understand going forward.',
 'FACTS MATTER.',
 'Q']

In [104]:
messages[2602].name, messages[2602].attrs

('div', {'class': ['message']})

In [105]:
messages[-1].contents[0].attrs

{'class': ['text']}

In [467]:
Messages.get(2603)

['Post Justice K confirmation.',
 'RUSSIA RUSSIA RUSSIA',
 'DEFLECT DECLAS ',
 'DEFLECT BLAME',
 '[FOCUS RUSSIA >>> GLOBAL ENEMY]',
 '"The Hunt For" dropped - why?',
 'When was the unauthorized missile fired?',
 'Was it found/discovered prior to [Hunt]?',
 'RUSSIA SUB THREAT PUSH?',
 'RED_OCTOBER',
 'Double meanings exist.',
 '……………..',
 'Q']

We know that Messages.get() isn't returning all of the text

these are a contents of a div-op within a message:

In [375]:
[i.text for i in messages[2603].find('div').contents]

['>>343',
 'https://twitter.com/CNNPolitics/status/1048324715469783040',
 'And so it begins..',
 'How do you start a WAR?',
 '[Markers] matter.',
 'Playbook [FAIL].',
 'Q']

In [472]:
get(2603)

['>>343',
 'https://twitter.com/CNNPolitics/status/1048324715469783040',
 'And so it begins..',
 'How do you start a WAR?',
 '[Markers] matter.',
 'Playbook [FAIL].',
 'Q',
 '>>343',
 'https://twitter.com/CNNPolitics/status/1048324715469783040',
 'And so it begins..',
 'How do you start a WAR?',
 '[Markers] matter.',
 'Playbook [FAIL].',
 'Q']

In [470]:
# this loop gets both the div op contents and the contents of the message which aren't in div op
if messages[0].attrs == {'class': ['message']}:
    for p in messages[0].contents:
        for div_op in p.findAll('div', class_='op'):
            for item in div_op.contents:
                print(item.text)
        if p.string is not None:
            print(p.string)

In [149]:
# this loop gets both the div op contents and the contents of the message which aren't in div op
if messages[3600].attrs == {'class': ['message']}:
    for p in messages[3600].contents:
        for div_op in p.findAll('div', class_='op'):
            for item in div_op.contents:
                if item.string is not None:
                    print(item.string)
        if p.string is not None:
            print(p.string)

https:
//
nationalsecurityaction.org/who-we-are/
We are Americans—former senior officials and policy experts, academics and civil society leaders—who have seen first-hand how the United States is stronger, safer and more respected in the world when we stand strong with our allies, pursue principled diplomacy, and stay true to the values that have long defined America at home and abroad.


In [471]:
alt_get(3600)

['https:',
 '//',
 'nationalsecurityaction.org/who-we-are/',
 'We are Americans—former senior officials and policy experts, academics and civil society leaders—who have seen first-hand how the United States is stronger, safer and more respected in the world when we stand strong with our allies, pursue principled diplomacy, and stay true to the values that have long defined America at home and abroad.',
 'https://nationalsecurityaction.org/who-we-are/We are Americans—former senior officials and policy experts, academics and civil society leaders—who have seen first-hand how the United States is stronger, safer and more respected in the world when we stand strong with our allies, pursue principled diplomacy, and stay true to the values that have long defined America at home and abroad.Well done, Anon.Q']

In [377]:
messages[2603].find('div', class_=['op'])

<div class="op" id="op972"><p>&gt;&gt;343</p><p><a href="https://twitter.com/CNNPolitics/status/1048324715469783040" rel="nofollow" target="_blank">https://twitter.com/CNNPolitics/status/1048324715469783040</a></p><p>And so it begins..</p><p>How do you start a WAR?</p><p>[Markers] matter.</p><p>Playbook [FAIL].</p><p>Q</p></div>

In [378]:
if messages[2].attrs == {'class': ['message']}:
    for item in messages[2].contents:
        for div_op in item.findAll('div', class_='op'):
            for itm in div_op.contents:
                print(itm.string)
        for p_tag in item.findAll('p'):
            if p_tag.string is not None:
                print(p_tag.string)

Shall we play a game?
[N]othing [C]an [S]top [W]hat [I]s [C]oming
NCSWIC
Who stepped down today [forced]?
More coming?
Why is this relevant?
How do you 'show' the public the truth?
How do you 'safeguard' US elections post-POTUS?
How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? 
It had to be this way.
Sometimes you must walk through the darkness before you see the light. 
Q


In [352]:
[i for i in messages[2].div.children]

[<p>Shall we play a game?</p>,
 <p>[N]othing [C]an [S]top [W]hat [I]s [C]oming</p>,
 <p>NCSWIC</p>,
 <p>https:<em>//</em>www.cisa.gov/safecom/NCSWIC</p>,
 <p>Who stepped down today [forced]?</p>,
 <p>https:<em>//</em>www.cisa.gov/bryan-s-ware</p>,
 <p>More coming?</p>,
 <p>Why is this relevant?</p>,
 <p>How do you 'show' the public the truth?</p>,
 <p>How do you 'safeguard' US elections post-POTUS?</p>,
 <p>How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? </p>,
 <p>It had to be this way.</p>,
 <p>Sometimes you must walk through the darkness before you see the light. </p>,
 <p>Q</p>]

In [427]:
if messages[3600].div.div.has_attr('class'):
    print([i.string for i in messages[3600].div.div if i.string is not None])

['https:', '//', 'nationalsecurityaction.org/who-we-are/', 'We are Americans—former senior officials and policy experts, academics and civil society leaders—who have seen first-hand how the United States is stronger, safer and more respected in the world when we stand strong with our allies, pursue principled diplomacy, and stay true to the values that have long defined America at home and abroad.']


In [195]:
messages[2].name == 'div' and messages[2].next_element.attrs == {'class': ['text']}

True

In [196]:
[p.string for p in messages[-1].div if p.string is not None]

['>>147005381',
 'HRC',
 ' extradition already in motion effective yesterday with several countries in case of cross border run. Passport approved to be flagged effective 10/30 @ 12:01am. Expect massive riots organized in defiance and others fleeing the US to occur. US M’s will conduct the operation while ',
 'NG',
 ' activated. Proof check: Locate a ',
 'NG',
 ' member and ask if activated for duty 10/30 across most major cities.']

In [428]:
'p' in [i.name for i in messages[-1].next_element.children]

False

In [198]:
if messages[-1].next_element.attrs == {'class': ['text']}:
    if 'p' in [i.name for i in messages[-1].next_element.children]:
        print(messages[-1].text)
    if 'p' not in [i.name for i in messages[-1].next_element.children]:
        print(messages[-1].text)

>>147005381>>146981635Hillary Clinton will be arrested between 7:45 AM - 8:30 AM EST on Monday - the morning on Oct 30, 2017.HRC extradition already in motion effective yesterday with several countries in case of cross border run. Passport approved to be flagged effective 10/30 @ 12:01am. Expect massive riots organized in defiance and others fleeing the US to occur. US M’s will conduct the operation while NG activated. Proof check: Locate a NG member and ask if activated for duty 10/30 across most major cities.


In [429]:
if messages[2600].attrs == {'class': ['message']}:
    if 'p' in [i.name for i in messages[2600].children]:
        for p_tag in messages[2600].findAll('p'):
            print(p_tag.string)

WELCOME TO THE DEMOCRAT PARTY.
The Party of threats, violence, intimidation, name-calling, racism, fascism, division, …..
#WALKAWAY
#VOTEREPUBLICAN
Q


In [430]:
messages[3]

<div class="message"><div class="text"><p>Nothing can stop what is coming.</p><p>Nothing!</p><p>Q</p></div></div>

In [431]:
[item for item in messages[3099].div.contents]

[<div class="op" id="op1"><br/></div>,
 <br/>,
 'The choice to know will be yours.',
 <br/>,
 'https:',
 <em>//</em>,
 'twitter.com/s8n/status/671489910281498624?lang=en',
 <br/>,
 'Q',
 <br/>]