# 1. Scraping qdrops.online with BeautifulSoup and parsing its content
## 1.1 capturing
the text we want is contained within this tag hiearcharchy: 
- div message -> text -> string if not None & p 
- div message -> div op -> string 
- div message -> abbr title, abbr.text
- meta lar -> span: time, name, source, num

### prematurely calling text or get_text() on div_text will render unnecessary text, text we'd later have to clean

### tags to extract():
- hyperlinks: https?\S+\b, www, twitter, instagram, etc (inevitably will have to regex)
- a href
- figure
- figcaption
- images
- div op containing no text or string
- replace punct with a single space, then replace spaces longer than 1 space with a single space
- it also might make things easier tokenizing them before hand
### cleaning 
- sub hyperlinks
- lower text
- split text 
- sub punctuation
- sub digits 
- join split words back into string
- append strings to list if strings 
- return list of cleaned strings 
## Recurring Problems
### inconsistent tag use: br, p, text, abbr 
- many more tags could have been abbreviated or propertied with its value
- pickling exceeds maximum recursion; solved by sys.get and set a higher recursion limit
- runtime of requests is > 1 min: solved by loading the pickled object
- unwanted text from hyperlinks, figcaptions, etc; solved by using BeautifulSoup extract() on unwanted objects

In [1]:
import requests
import nltk
import os, sys
import itertools
import re, string
import pandas as pd
import pickle
import timeit

from string import punctuation, digits
from collections import Counter
from bs4 import BeautifulSoup, NavigableString, Tag
from string import punctuation, digits
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import RegexpParser, Tree
from nltk.util import ngrams

punctuation += str('’‘–…“”')
pd.set_option('display.max_colwidth', None)

We don't have to execute the following cell, we can skip to opening the pickled object

In [3]:
'''%%time
base_url = 'https://qposts.online/page/' 
urls = [base_url+str(i) for i in range(1, 105)]
page_requests = [requests.get(url) for url in urls]
soups = [BeautifulSoup(page.text, 'html.parser') for page in page_requests]
messages_original = [soups[i].findAll('div', 'message') for i in range(0, len(soups))]
messages_flat = list(itertools.chain.from_iterable(messages))
meta_lar = [soups[i].findAll('div', 'meta lar') for i in range(0, len(soups))]'''

"%%time\nbase_url = 'https://qposts.online/page/' \nurls = [base_url+str(i) for i in range(1, 105)]\npage_requests = [requests.get(url) for url in urls]\nsoups = [BeautifulSoup(page.text, 'html.parser') for page in page_requests]\nmessages_original = [soups[i].findAll('div', 'message') for i in range(0, len(soups))]\nmessages_flat = list(itertools.chain.from_iterable(messages))\nmeta_lar = [soups[i].findAll('div', 'meta lar') for i in range(0, len(soups))]"

In [2]:
os.getcwd()

'/Users/kylereaves/Documents/GitHub/parsing_madness'

In [3]:
index_df = pd.read_pickle('index_df.pkl')
index_df.number = index_df.number.astype('int')
date = [index_df.datetime[i].date() for i in range(0, len(index_df.datetime))]
time = [index_df.datetime[i].time() for i in range(0, len(index_df.datetime))]
dt_index = pd.MultiIndex.from_arrays([date, time])

In [139]:
%%time 
with open('messages_flat.pkl', 'rb') as f:
    messages = pickle.load(f)
with open('meta_flat.pkl', 'rb') as f:
    meta_lar = pickle.load(f)

CPU times: user 15.8 s, sys: 428 ms, total: 16.3 s
Wall time: 16.4 s


In [5]:
%%time
with open('names_joined.pkl', 'rb') as f:
    names = pickle.load(f)
with open('sources_joined.pkl', 'rb') as f:
    sources = pickle.load(f)

CPU times: user 1.84 ms, sys: 1.36 ms, total: 3.2 ms
Wall time: 8.14 ms


In [7]:
class Spans:
    def __init__(self):
        pass

    def nums():
        nums = [meta_lar[i].find('span', 'num').get_text() for i in range(0, len(meta_lar))]
        return nums
                
    def sources():
        sources = [meta_lar[i].find('span', 'source').get_text() for i in range(0, len(meta_lar))]
        links = [meta_lar[i].find('span', 'source').contents[-1].get('href') for i in range(0, len(meta_lar))]
        return sources
    
    def names():
        names = [meta_lar[i].find('span', 'name').get_text() for i in range(0, len(meta_lar))]
        return names
                      
    def dates():
        date_list = [meta_lar[i].find('span', 'time').get_text()for i in range(0, len(meta_lar))]
        dt_idx = pd.to_datetime(date_list, origin='unix', unit='s')
        return dt_idx

In [143]:
class Messages(object):
    def __init__(self):
        pass

    def get(integer: int):
        msg_list = []

        for item in messages[integer].div:
            
            if isinstance(item, NavigableString) and item.name is None:
                msg_list.append(item)
            
            if isinstance(item, Tag) and item.name == 'p':
                msg_list.append(item.string)
            
            if isinstance(item, Tag) and item.name == 'div' and item.attrs == {'class': ['text']}:
                msg_list.append(item.text)

            if isinstance(item, Tag) and item.name == 'abbr':
                msg_list.append(item.text)
            
            for div_images in messages[integer].findAll('div', class_='images'):
                div_images.extract()
            for a_ref in messages[integer].findAll('a', class_='ref'):
                a_ref.extract()
            for a_href in messages[integer].findAll('a', class_='href'):
                a_ref.extract()
            for empty_line in messages[integer].findAll('p', class_='body-line empty'):
                empty_line.extract()
            for br_tag in messages[integer].findAll('br'):
                br_tag.replace_with(' ')

        cleaned = [item for item in msg_list if item != ' ' and item is not None]
        cleaned2x = [item for item in cleaned if not item.startswith('https') and not item.startswith('in.') and not item.startswith('www')]
        return cleaned2x

    def dataframe(integer: int):
        df = pd.DataFrame({'type': [type(i) for i in messages[integer].div.children],
                           'name': [i.name for i in messages[integer].div.children],
                           'content': [i for i in messages[integer].div.children]})
        return df

    def get_abbr(integer:int): 
        return Messages.dataframe(integer)[Messages.dataframe(integer).name == 'abbr']
    
    def info(integer:int):
        return index_df[index_df.number == integer]

    def sents(integer: int):
        return nltk.sent_tokenize(' '.join(Messages.get(integer))) 
      
    def joined(integer: int):
        return ' '.join(Messages.get(integer))
        
    def split(integer: int):
        return Messages.joined(integer).split()

In [8]:
# if an item in Messages.split(i) is in replace_dict.keys(), replace i (the key) with its value
replace_dict = {'r v d': 'republicans vs democrats',
 'rs': 'republicans',
 "r's": 'republicans',
 "d's": 'democrats',
 'ds': 'democrats',
 '[D]': 'Democratic',
 'US': 'United States',
 ' w ': 'with',
 'w/': 'with',
 '&': 'and',
 'MSM': 'mainstream media',
 'ID': 'identification',
 'SA': 'Saudi Arabia',
 'MS-13': 'ms thirteen',
 'COVID19': 'covid',
 "M's": 'marshalls',
 'HUSSEIN': 'Barack Obama',
 'BRENNAN': 'John Brennan',
 'MERKEL': 'Angela Merkel',
 'KERRY': 'John Kerry',
 'v2': 'version two',
 'U.S.': 'United States',
 "Gov't": 'government'}

### using string with p tags saves us subing hyperlinks
### add a conditional statement on the end to pop None from list

In [86]:
def swap(integer: int):
    outter_list, cleaned_list = [], []
    for items in Messages.split(integer):
        if items.startswith('https') or items.startswith('in.') or items.startswith('www'):
            continue
        for char in items:
            if char == '/' or char == '-' or char == '][' or char == '>':
                items = re.sub(char, ' ', items)
        else:
            outter_list.append(items)

    for i in nltk.sent_tokenize(' '.join(outter_list)):
        i = re.sub('[%s]' % re.escape(string.punctuation), '', i)
        i = re.sub('[%s]' % re.escape(string.digits), '', i)
        cleaned_list.append(i)
    words = [nltk.word_tokenize(i.strip()) for i in cleaned_list]
    flat = list(itertools.chain.from_iterable(words))
    return flat

In [169]:
def strip(integer: int):
    sents = []
    for item_list in [nltk.word_tokenize(word) for word in Messages.sents(integer)]:
        for item in item_list:
            if item in string.punctuation or item in string.digits or item in string.digits:
                continue
            else:
                sents.append(item)
    return ' '.join(sents)

In [190]:
def clean(integer: int):
    cleaned = [re.sub('[%s]' % re.escape(string.punctuation), ' ', item) for item in Messages.get(integer)]
    cleaned = [re.sub('[%s]' % re.escape(string.digits), '', item) for item in cleaned]
    cleaned = [item.strip() for item in cleaned]
    cleaned = [item for item in cleaned if item]
    cleaned = [nltk.word_tokenize(item) for item in cleaned]
    cleaned = [' '.join(item) for item in cleaned]
    return cleaned

In [191]:
clean(3005)

['Hassan Rouhani',
 'Who paid HUSSEIN to attend HARVARD LAW SCHOOL',
 'Who is Prince Alwaleed bin Talal',
 'Why would Prince Alwaleed bin Talal Saudi Royal pay HUSSEIN to attend HARVARD LAW SCHOOL',
 'Was HUSSEIN a prominent political figure or a person of influence at the time',
 'No',
 'Who is Valerie Jarrett',
 'Where was she born',
 'When did Valerie Jarrett hire Michelle Robinson',
 'Timeline',
 'Who is Mayor former Richard Daley',
 'Who is Mayor current Rahm Emanuel',
 'HUSSEIN should be VERY nervous',
 'BRENNAN should be VERY nervous',
 'KERRY should be VERY nervous',
 'MERKEL should be VERY nervous',
 'How were the pallets of cash divided',
 'How many planes were used to transport',
 'Who operated the planes',
 'What shadow agency directed operations',
 'Why wasn t the money simply wire transferred',
 'US had AUTH to open bank to bank transfers',
 'How do you prevent financial T logs',
 'How were the cash withdrawals in',
 'EU',
 'categorized labeled',
 'Where did the cash orig

In [171]:
strip(3005)

"Hassan Rouhani Who paid HUSSEIN to attend HARVARD LAW SCHOOL Who is Prince Alwaleed bin Talal Why would Prince Alwaleed bin Talal Saudi Royal pay HUSSEIN to attend HARVARD LAW SCHOOL Was HUSSEIN a prominent political figure or a person of influence at the time No Who is Valerie Jarrett Where was she born When did Valerie Jarrett hire Michelle Robinson 1991 Timeline Who is Mayor former Richard Daley Who is Mayor current Rahm Emanuel HUSSEIN should be VERY nervous BRENNAN should be VERY nervous KERRY should be VERY nervous MERKEL should be VERY nervous +29 How were the pallets of cash divided How many planes were used to transport Who operated the planes What 'shadow agency directed operations Why was n't the money simply wire transferred US had AUTH to open bank-to-bank transfers How do you prevent financial T logs How were the cash withdrawals in EU categorized/labeled Where did the cash originate from What time of day did the withdrawals occur Who provided SECURITY Why was n't Congre

In [41]:
[i.string for i in messages[10].div if i.string is not None]

['Fact checkers created in effort to reinforce propaganda [digestion]?',
 'The battle to prevent truth from reaching the people.',
 'The battle to maintain and push division.',
 'Divided you are weak.',
 'Divided you fight each other.',
 'Divided you pose no threat.',
 'System of control.',
 'Information warfare.',
 'Q']

In [45]:
for i in messages[10].div:
    print(i.name == 'p')

True
True
True
True
True
True
True
True
True
True


In [48]:
[i.name == 'p' for i in messages[10].div]

[True, True, True, True, True, True, True, True, True, True]

In [53]:
[i.name == 'p' for i in messages[10].div.children]

[True, True, True, True, True, True, True, True, True, True]

In [9]:
[i.string for i in messages[10].div if i.string is not None]

['Fact checkers created in effort to reinforce propaganda [digestion]?',
 'The battle to prevent truth from reaching the people.',
 'The battle to maintain and push division.',
 'Divided you are weak.',
 'Divided you fight each other.',
 'Divided you pose no threat.',
 'System of control.',
 'Information warfare.',
 'Q']

In [10]:
[i.string for i in messages[2].div if i.string is not None]

['Shall we play a game?',
 '[N]othing [C]an [S]top [W]hat [I]s [C]oming',
 'NCSWIC',
 'Who stepped down today [forced]?',
 'More coming?',
 'Why is this relevant?',
 "How do you 'show' the public the truth?",
 "How do you 'safeguard' US elections post-POTUS?",
 "How do you 'remove' foreign interference and corruption and install US-owned voter ID law(s) and other safeguards? ",
 'It had to be this way.',
 'Sometimes you must walk through the darkness before you see the light. ',
 'Q']

In [11]:
Messages.dataframe(10)

Unnamed: 0,type,name,content
0,<class 'bs4.element.Tag'>,p,"[https:, [//], twitter.com/BrentScher/status/1322015793593360384]"
1,<class 'bs4.element.Tag'>,p,[Fact checkers created in effort to reinforce propaganda [digestion]?]
2,<class 'bs4.element.Tag'>,p,[The battle to prevent truth from reaching the people.]
3,<class 'bs4.element.Tag'>,p,[The battle to maintain and push division.]
4,<class 'bs4.element.Tag'>,p,[Divided you are weak.]
5,<class 'bs4.element.Tag'>,p,[Divided you fight each other.]
6,<class 'bs4.element.Tag'>,p,[Divided you pose no threat.]
7,<class 'bs4.element.Tag'>,p,[System of control.]
8,<class 'bs4.element.Tag'>,p,[Information warfare.]
9,<class 'bs4.element.Tag'>,p,[Q]


In [12]:
Messages.dataframe(2500)

Unnamed: 0,type,name,content
0,<class 'bs4.element.Tag'>,div,"[[[<img data-src=""/assets/images/127cd473f34edb4b3107629dcd3ec4cc73b1b306993e5877397c9cf7d6fe5c2c.png"" src=""/img/loading.svg""/>], [sgrZzEo.png]]]"
1,<class 'bs4.element.Tag'>,p,"[Thank you for your service to our Country, Mr. Trey Gowdy!]"
2,<class 'bs4.element.Tag'>,p,"[Thank you for your service to our Country, Mr. Bob Goodlatte!]"
3,<class 'bs4.element.Tag'>,p,[Your sacrifices will never be forgotten. ]
4,<class 'bs4.element.Tag'>,p,[Q+]


In [13]:
Messages.dataframe(3005)

Unnamed: 0,type,name,content
0,<class 'bs4.element.NavigableString'>,,https:
1,<class 'bs4.element.Tag'>,em,[//]
2,<class 'bs4.element.NavigableString'>,,in.reuters.com/article/iran-economy-rouhani-sanctions/iran-parliament-censures-rouhani-in-sign-pragmatists-losing-sway-idINKCN1LD0DL
3,<class 'bs4.element.Tag'>,br,[]
4,<class 'bs4.element.NavigableString'>,,[Hassan Rouhani]
...,...,...,...
127,<class 'bs4.element.Tag'>,br,[]
128,<class 'bs4.element.NavigableString'>,,Nothing to See Here.
129,<class 'bs4.element.Tag'>,br,[]
130,<class 'bs4.element.NavigableString'>,,Q


In [14]:
 Messages.dataframe(3005).drop(Messages.dataframe(3005)[Messages.dataframe(3005).name == 'br'].index)

Unnamed: 0,type,name,content
0,<class 'bs4.element.NavigableString'>,,https:
1,<class 'bs4.element.Tag'>,em,[//]
2,<class 'bs4.element.NavigableString'>,,in.reuters.com/article/iran-economy-rouhani-sanctions/iran-parliament-censures-rouhani-in-sign-pragmatists-losing-sway-idINKCN1LD0DL
4,<class 'bs4.element.NavigableString'>,,[Hassan Rouhani]
6,<class 'bs4.element.NavigableString'>,,Who paid HUSSEIN to attend HARVARD LAW SCHOOL?
...,...,...,...
122,<class 'bs4.element.NavigableString'>,,When was the Joint Plan of Action (IRAN DEAL) executed?
124,<class 'bs4.element.NavigableString'>,,Late 2013?
126,<class 'bs4.element.NavigableString'>,,Do you believe in coincidences?
128,<class 'bs4.element.NavigableString'>,,Nothing to See Here.


In [98]:
Messages.get(10)

['Fact checkers created in effort to reinforce propaganda [digestion]?',
 'The battle to prevent truth from reaching the people.',
 'The battle to maintain and push division.',
 'Divided you are weak.',
 'Divided you fight each other.',
 'Divided you pose no threat.',
 'System of control.',
 'Information warfare.',
 'Q']

In [97]:
list(itertools.chain.from_iterable([nltk.sent_tokenize(i) for i in Messages.get(10)]))

['Fact checkers created in effort to reinforce propaganda [digestion]?',
 'The battle to prevent truth from reaching the people.',
 'The battle to maintain and push division.',
 'Divided you are weak.',
 'Divided you fight each other.',
 'Divided you pose no threat.',
 'System of control.',
 'Information warfare.',
 'Q']

In [17]:
[i.name for i in meta_lar[1].find('span', 'name').children]

['strong', None]

In [18]:
[(i.name, i.attrs) for i in meta_lar[0].children]

[('span', {'class': ['num']}),
 ('span', {'class': ['time']}),
 ('span', {'class': ['name']}),
 ('span', {'class': ['source']}),
 ('span', {'class': ['copy']})]

In [19]:
#drop entire row if name == br?
type(Messages.dataframe(4950))

pandas.core.frame.DataFrame

In [21]:
index_list = [i for i,e in enumerate(Messages.split(4800)) if e in replace_dict.keys()]

In [22]:
[e for i,e in enumerate(Messages.split(4800)) if i == 182]

['highly']

In [23]:
[e for i,e in enumerate(Messages.split(4800)) if i in index_list]

['&', 'MSM', 'ID', 'SA']

In [24]:
replace_dict.get('&')

'and'

Replace an abbr with its value in the replacement_dict: 

for scanner function: 

In [25]:
# How we will implement the swap when the words are broken down as far as they can go, ie when there aren't any - or / separating words
for item in Messages.split(4000):
    if item in replace_dict.keys():
        item =  replace_dict.get(item)
        print(item)
    else:
        print(item)

How
bad
is
the
corruption?
FBI
(past/present)#1#1
#2+29
(16)DOJ
(past/present)#1#1#2
+18
STATE
(past/present)#1#1+41Removal
is
the
least
of
their
problems.Projection.Russia>D/HRCTwitter
Bots>GOOG
operated
(not
Russia)/Narrative
and
Political
SLANTBIDEN
/
CHINA.BIG
DEVELOPMENT.TRAITORS
EVERYWHERE.AMERICA
FOR
SALE.FLYNN.Targeted.Why?Who
knows
where
the
bodies
are
buried?CLEARED
OF
ALL
CHARGES.TRUMP
ADMIN
v2?Election
theft.Last
hope.Congressional
focus.Impeach.They
think
you
are
STUPID.They
think
you
will
follow
the
STARS.They
openly
call
you
SHEEP/CATTLE.THERE
WILL
COME
A
TIME
NONE
OF
THEM
WILL
BE
ABLE
TO
WALK
DOWN
THE
STREET.BIGGEST
FEAR.PUBLIC
AWAKENING.Q


In [87]:
for item in swap(4000):
    if item in replace_dict.keys():
        item =  replace_dict.get(item)
        print(item)
    else:
        print(item)

How
bad
is
the
corruption
FBI
past
present
DOJ
past
present
STATE
past
present
Removal
is
the
least
of
their
problems
Projection
Russia
D
HRC
Twitter
Bots
GOOG
operated
not
Russia
Narrative
Political
SLANT
BIDEN
CHINA
BIG
DEVELOPMENT
TRAITORS
EVERYWHERE
AMERICA
FOR
SALE
FLYNN
Targeted
Why
Who
knows
where
the
bodies
are
buried
CLEARED
OF
ALL
CHARGES
TRUMP
ADMIN
v
Election
theft
Last
hope
Congressional
focus
Impeach
They
think
you
are
STUPID
They
think
you
will
follow
the
STARS
They
openly
call
you
SHEEP
CATTLE
THERE
WILL
COME
A
TIME
NONE
OF
THEM
WILL
BE
ABLE
TO
WALK
DOWN
THE
STREET
BIGGEST
FEAR
PUBLIC
AWAKENING
Q


In [88]:
replace_dict.update({'v': 'versus', 'D': 'Democrats', 'GOOG': 'Google'})

In [91]:
replace_dict

{'r v d': 'republicans vs democrats',
 'rs': 'republicans',
 "r's": 'republicans',
 "d's": 'democrats',
 'ds': 'democrats',
 '[D]': 'Democratic',
 'US': 'United States',
 ' w ': 'with',
 'w/': 'with',
 '&': 'and',
 'MSM': 'mainstream media',
 'ID': 'identification',
 'SA': 'Saudi Arabia',
 'MS-13': 'ms thirteen',
 'COVID19': 'covid',
 "M's": 'marshalls',
 'HUSSEIN': 'Barack Obama',
 'BRENNAN': 'John Brennan',
 'MERKEL': 'Angela Merkel',
 'KERRY': 'John Kerry',
 'v2': 'version two',
 'U.S.': 'United States',
 "Gov't": 'government',
 'comms': 'communications',
 'financial T': 'financial transactions',
 'No Such Agency': 'NSA',
 'v': 'versus',
 'D': 'Democrats',
 'GOOG': 'Google'}

In [85]:
index_df[index_df.number == 4000]

Unnamed: 0,number,datetime,name,source
953,4000,2020-04-29 00:58:18,Q !!Hs1Jq13jV6,8kun/qresearch8953725


In [26]:
Messages.get_abbr(3005)['content'].explode()

59           EU
89     @Snowden
95     @Snowden
103    @Snowden
109         NSA
119          OP
Name: content, dtype: object

this should be the new split logic; what happens otherwise is that the link will be broken up

this way, we can edit it out in one pass 


In [119]:
[i for i in Messages.get(integer) if not i.startswith('https') and not i.startswith('in.') and not i.startswith('www.')]

['[Hassan Rouhani]',
 'Who paid HUSSEIN to attend HARVARD LAW SCHOOL?',
 'Who is Prince Alwaleed bin Talal?',
 'Why would Prince Alwaleed bin Talal (Saudi Royal) pay HUSSEIN to attend HARVARD LAW SCHOOL?',
 'Was HUSSEIN a prominent political figure or a person of influence at the time?',
 'No.',
 'Who is Valerie Jarrett? ',
 'Where was she born?',
 'When did Valerie Jarrett hire Michelle Robinson?',
 '1991',
 'Timeline.',
 'Who is Mayor (former) Richard Daley?',
 'Who is Mayor (current) Rahm Emanuel?',
 'HUSSEIN should be VERY nervous.',
 'BRENNAN should be VERY nervous.',
 'KERRY should be VERY nervous.',
 'MERKEL should be VERY nervous.',
 '+29',
 'How were the pallets of cash divided?',
 'How many planes were used to transport? ',
 'Who operated the planes?',
 "What 'shadow' agency directed operations?",
 "Why wasn't the money [simply] wire transferred?",
 'US had AUTH to open bank-to-bank transfers.',
 'How do you prevent financial T logs?',
 'How were the cash withdrawals in ',
 '

In [109]:
[Messages.sents(3005)[i].split() for i in range(0, len(Messages.sents(3005)))]

[['https:',
  'in.reuters.com/article/iran-economy-rouhani-sanctions/iran-parliament-censures-rouhani-in-sign-pragmatists-losing-sway-idINKCN1LD0DL',
  '[Hassan',
  'Rouhani]',
  'Who',
  'paid',
  'HUSSEIN',
  'to',
  'attend',
  'HARVARD',
  'LAW',
  'SCHOOL?'],
 ['Who', 'is', 'Prince', 'Alwaleed', 'bin', 'Talal?'],
 ['Why',
  'would',
  'Prince',
  'Alwaleed',
  'bin',
  'Talal',
  '(Saudi',
  'Royal)',
  'pay',
  'HUSSEIN',
  'to',
  'attend',
  'HARVARD',
  'LAW',
  'SCHOOL?'],
 ['Was',
  'HUSSEIN',
  'a',
  'prominent',
  'political',
  'figure',
  'or',
  'a',
  'person',
  'of',
  'influence',
  'at',
  'the',
  'time?'],
 ['No.'],
 ['Who', 'is', 'Valerie', 'Jarrett?'],
 ['Where', 'was', 'she', 'born?'],
 ['When', 'did', 'Valerie', 'Jarrett', 'hire', 'Michelle', 'Robinson?'],
 ['1991', 'Timeline.'],
 ['https:',
  'www.thisisinsider.com/how-did-barack-and-michelle-obama-meet-2017-10#1991-they-got-engaged-in-a-simple-and-sweet-way-3',
  'Who',
  'is',
  'Mayor',
  '(former)',
  '

In [105]:
[i.split() for i in Messages.get(10)]

[['Fact',
  'checkers',
  'created',
  'in',
  'effort',
  'to',
  'reinforce',
  'propaganda',
  '[digestion]?'],
 ['The',
  'battle',
  'to',
  'prevent',
  'truth',
  'from',
  'reaching',
  'the',
  'people.'],
 ['The', 'battle', 'to', 'maintain', 'and', 'push', 'division.'],
 ['Divided', 'you', 'are', 'weak.'],
 ['Divided', 'you', 'fight', 'each', 'other.'],
 ['Divided', 'you', 'pose', 'no', 'threat.'],
 ['System', 'of', 'control.'],
 ['Information', 'warfare.'],
 ['Q']]

In [None]:
# Messages.split() should return a flat list of words that've been split
# Tokenize the sentences into words
# are any of the items in the list also in replace_dict.keys()? If they are, swap out?

In [81]:
[i for i in Messages.sents(3009) if not i.startswith('https')]

['FISA  (ABCs) v INSCOM NOFORN NSA  INSCOM BRIDGE FISA  = FISC Who is accountable to Congress (civilian body)?',
 "Define 'State Secrets'.",
 'Process of obtaining a  FISA  warrant?',
 'What must be DEMONSTRATED to be GRANTED?',
 'Who must SIGN OFF?',
 "Can 'select' individuals in senior positions of power be SHADOW BANNED from ACTIVE  FISA  WARRANTS / SURV?",
 'NAT  SEC Sufficient evidence shown to demonstrate rogue elements of intelligence apparatus illegally violated  FISA  law (tenets) in coordinated effort w/ d+foreign allies to impact/mod the outcome of the 2016 Presidential election & safeguard against future uncover / criminal prosecution?',
 'Who must sign off on  FISA  warrants?',
 'Who directs the signers?',
 'Given magnitude of spy campaign (U.S. Presidential Election Republican Party Nominee Candidate + President Elect + President of the United States) would HUSSEIN be required to DIRECT ORDER?',
 'How would updates occur?',
 'PDB?',
 'Who has access to the PDB?',
 'On-sit

In [77]:
swap(-1)

['HRC',
 'extradition',
 'already',
 'in',
 'motion',
 'effective',
 'yesterday',
 'with',
 'several',
 'countries',
 'in',
 'case',
 'of',
 'cross',
 'border',
 'run',
 'Passport',
 'approved',
 'to',
 'be',
 'flagged',
 'effective',
 'am',
 'Expect',
 'massive',
 'riots',
 'organized',
 'in',
 'defiance',
 'and',
 'others',
 'fleeing',
 'the',
 'US',
 'to',
 'occur',
 'US',
 'M',
 '’',
 's',
 'will',
 'conduct',
 'the',
 'operation',
 'while',
 'NG',
 'activated',
 'Proof',
 'check',
 'Locate',
 'a',
 'NG',
 'member',
 'and',
 'ask',
 'if',
 'activated',
 'for',
 'duty',
 'across',
 'most',
 'major',
 'cities']

In [90]:
replace_dict.update({'comms': 'communications'})

In [None]:
# after the chars are joined back together, if the item is in the replace_dict.keys():
# item = replace_dict.get(item)
# THEN the items can be joined back together to be sent tokenized

In [33]:
replace_dict.update({'financial T': 'financial transactions'})

In [34]:
replace_dict.update({'No Such Agency': 'NSA'})

In [35]:
'No Such Agency' in replace_dict.keys()

True

In [36]:
"Gov't" in replace_dict.keys()

True

In [144]:
def check_string(integer: int):
    length = len(Messages.split(integer))
    
    for s in Messages.split(integer):  
        str_length = [len(s) for s in Messages.split(3005)]
        word_idx = Messages.split(integer).index(s)

        for char in s:           
            for idx in range(0, length):
                char_list = [char for char in Messages.split(integer)[idx] if char in string.punctuation]
                char_idx = [i for i,e in enumerate(Messages.split(integer)[idx]) if e in string.punctuation]

        return str_length

index of characters in the string which are punctuation

In [38]:
[i for i,e in enumerate(swap(3005)) if e in replace_dict.keys()]

[4, 25, 32, 74, 79, 84, 89, 124, 173, 179, 224]

In [39]:
[e for i,e in enumerate(swap(3005)) if e in replace_dict.keys()]

['HUSSEIN',
 'HUSSEIN',
 'HUSSEIN',
 'HUSSEIN',
 'BRENNAN',
 'KERRY',
 'MERKEL',
 'US',
 'US',
 'US',
 'comms']

index and characters on the first string as a tuple

In [107]:
[(i,e) for i,e in enumerate(Messages.split(3005)[0]) if e in string.punctuation]

[(5, ':'),
 (6, '/'),
 (7, '/'),
 (10, '.'),
 (18, '.'),
 (22, '/'),
 (30, '/'),
 (35, '-'),
 (43, '-'),
 (51, '-'),
 (61, '/'),
 (66, '-'),
 (77, '-'),
 (86, '-'),
 (94, '-'),
 (97, '-'),
 (102, '-'),
 (114, '-'),
 (121, '-'),
 (126, '-')]

In [93]:
'AUTH' in replace_dict.keys()

False

In [92]:
# sorted dict by creating a dict comprehension
{k:v for k,v in zip(sorted(replace_dict.keys()), [replace_dict.get(i) for i in sorted(replace_dict.keys())])}

{' w ': 'with',
 '&': 'and',
 'BRENNAN': 'John Brennan',
 'COVID19': 'covid',
 'D': 'Democrats',
 'GOOG': 'Google',
 "Gov't": 'government',
 'HUSSEIN': 'Barack Obama',
 'ID': 'identification',
 'KERRY': 'John Kerry',
 "M's": 'marshalls',
 'MERKEL': 'Angela Merkel',
 'MS-13': 'ms thirteen',
 'MSM': 'mainstream media',
 'No Such Agency': 'NSA',
 'SA': 'Saudi Arabia',
 'U.S.': 'United States',
 'US': 'United States',
 '[D]': 'Democratic',
 'comms': 'communications',
 "d's": 'democrats',
 'ds': 'democrats',
 'financial T': 'financial transactions',
 'r v d': 'republicans vs democrats',
 "r's": 'republicans',
 'rs': 'republicans',
 'v': 'versus',
 'v2': 'version two',
 'w/': 'with'}