# 1. Scraping qposts.online with BeautifulSoup and parsing its content
## 1.1 capturing
The text we want is contained within this tag hierarchy:
- div message -> text -> string if not None & p 
- div message -> div op -> string 
- div message -> abbr title, abbr.text
- meta lar -> span: time, name, source, num

### prematurely calling text or get_text() on div_text will render unnecessary text, text we'd later have to clean

### tags to extract():
- hyperlinks: https?\S+\b, www, twitter, instagram, etc (inevitably will have to regex)
- a href
- figure
- figcaption
- images
- div op containing no text or string
- replace punctuation with a single space, then collapse any run of multiple spaces into a single space
- it might also make things easier to tokenize the text beforehand
### cleaning 
- sub hyperlinks
- lower text
- split text 
- sub punctuation
- sub digits 
- join split words back into string
- append strings to list if strings 
- return list of cleaned strings 
## Recurring Problems
### inconsistent tag use: br, p, text, abbr 
- many more tags could have been abbreviated or propertied with its value
- pickling exceeds the maximum recursion depth; solved by using sys.getrecursionlimit() and sys.setrecursionlimit() to set a higher recursion limit
- runtime of requests is > 1 min: solved by loading the pickled object
- unwanted text from hyperlinks, figcaptions, etc; solved by using BeautifulSoup extract() on unwanted objects

In [1]:
import requests
import nltk
import os, sys
import itertools
import re, string
import pandas as pd
import pickle
import timeit

from string import punctuation, digits
from collections import Counter
from bs4 import BeautifulSoup, NavigableString, Tag
from string import punctuation, digits
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import RegexpParser, Tree
from nltk.util import ngrams

# Extend the ASCII-only string.punctuation with typographic quotes, the en
# dash and the ellipsis character.
punctuation = punctuation + '’‘–…“”'
# Never truncate wide text columns when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

We don't have to execute the following cell; we can skip ahead to loading the pickled objects.

In [3]:
# Disabled scraping cell: the code is wrapped in a string literal, so running
# the cell only displays the text instead of issuing the page requests.
# NOTE(review): if this is re-enabled, the `messages_flat` line chains over
# `messages`, a name this snippet never defines (it builds
# `messages_original`) -- confirm which name was intended.
'''%%time
base_url = 'https://qposts.online/page/' 
urls = [base_url+str(i) for i in range(1, 105)]
page_requests = [requests.get(url) for url in urls]
soups = [BeautifulSoup(page.text, 'html.parser') for page in page_requests]
messages_original = [soups[i].findAll('div', 'message') for i in range(0, len(soups))]
messages_flat = list(itertools.chain.from_iterable(messages))
meta_lar = [soups[i].findAll('div', 'meta lar') for i in range(0, len(soups))]'''

"%%time\nbase_url = 'https://qposts.online/page/' \nurls = [base_url+str(i) for i in range(1, 105)]\npage_requests = [requests.get(url) for url in urls]\nsoups = [BeautifulSoup(page.text, 'html.parser') for page in page_requests]\nmessages_original = [soups[i].findAll('div', 'message') for i in range(0, len(soups))]\nmessages_flat = list(itertools.chain.from_iterable(messages))\nmeta_lar = [soups[i].findAll('div', 'meta lar') for i in range(0, len(soups))]"

In [2]:
# Check the working directory: the pickle files used by this notebook are
# opened by relative path.
os.getcwd()

'/Users/kylereaves/Documents/GitHub/parsing_madness'

In [3]:
# Load the saved index DataFrame and store its `number` column as integers.
index_df = pd.read_pickle('index_df.pkl')
index_df['number'] = index_df['number'].astype('int')

# Split every timestamp into its calendar date and its time of day.
# Iterating the column walks the values in row order, so this works for any
# row index; `index_df.datetime[i]` is a label lookup and only works when the
# frame happens to carry the default 0..n-1 index.
date = [stamp.date() for stamp in index_df['datetime']]
time = [stamp.time() for stamp in index_df['datetime']]

# Two-level (date, time) index built from the two parallel lists.
dt_index = pd.MultiIndex.from_arrays([date, time])

In [4]:
%%time 
# Load the pickled `messages` and `meta_lar` objects from disk rather than
# re-running the page requests.
# NOTE: pickle.load can execute arbitrary code while loading; only open
# pickle files that you created yourself.
with open('messages_flat.pkl', 'rb') as f:
    messages = pickle.load(f)
with open('meta_flat.pkl', 'rb') as f:
    meta_lar = pickle.load(f)

CPU times: user 12.8 s, sys: 477 ms, total: 13.2 s
Wall time: 13.4 s


In [5]:
%%time
# Load the previously pickled `names` and `sources` objects.
# NOTE: pickle.load can execute arbitrary code while loading; only open
# pickle files that you created yourself.
with open('names_joined.pkl', 'rb') as f:
    names = pickle.load(f)
with open('sources_joined.pkl', 'rb') as f:
    sources = pickle.load(f)

CPU times: user 2.02 ms, sys: 1.43 ms, total: 3.45 ms
Wall time: 8.54 ms


In [8]:
class Messages:
    """Text accessors for one entry of the module-level ``messages`` sequence.

    Every method takes ``integer``, a position into ``messages``, and is a
    static method, so it is called on the class itself, e.g.
    ``Messages.get(10)``.
    """

    @staticmethod
    def _strip_unwanted(message) -> None:
        """Remove markup that should not contribute text, in place.

        ``div.images``, ``a.ref``, ``a.href`` and ``p`` tags whose class is
        ``body-line empty`` are extracted from the tree, and every ``<br>`` is
        replaced by a single space. The parsed tree itself is modified, so the
        removal persists for later reads of the same ``messages`` entry.
        """
        unwanted = (
            ('div', 'images'),
            ('a', 'ref'),
            ('a', 'href'),
            ('p', 'body-line empty'),
        )
        for tag_name, css_class in unwanted:
            for tag in message.find_all(tag_name, class_=css_class):
                tag.extract()
        for br_tag in message.find_all('br'):
            br_tag.replace_with(' ')

    @staticmethod
    def get(integer: int) -> list:
        """Return the text fragments of ``messages[integer]`` in document order.

        Collected fragments are:

        * bare strings that are direct children of the message,
        * ``.string`` of direct ``<p>`` children,
        * for direct ``<div>`` children that have a ``class`` attribute, each
          of their children: ``.string`` of ``<p>``, ``.text`` of ``<div>``
          and ``<abbr>``, and bare strings as they are.

        ``None`` values and fragments equal to a single space are dropped.

        Side effect: unwanted markup is stripped from ``messages[integer]``
        in place before reading (see ``_strip_unwanted``).
        """
        message = messages[integer]

        # Clean exactly once and BEFORE reading. Doing it while looping over
        # the children would edit the tree that is being iterated and would
        # let the first child be collected before any cleanup has happened.
        Messages._strip_unwanted(message)

        fragments = []
        for item in message:
            if isinstance(item, NavigableString) and item.name is None:
                fragments.append(item)
            elif isinstance(item, Tag) and item.name == 'p':
                # .string is None when the <p> has no single string child;
                # those entries are filtered out below.
                fragments.append(item.string)
            elif (isinstance(item, Tag) and item.name == 'div'
                    and item.has_attr('class')):
                for content in item.contents:
                    if content.name == 'p':
                        fragments.append(content.string)
                    elif content.name in ('div', 'abbr'):
                        fragments.append(content.text)
                    elif isinstance(content, NavigableString):
                        fragments.append(content)

        return [frag for frag in fragments if frag is not None and frag != ' ']

    @staticmethod
    def dataframe(integer: int) -> pd.DataFrame:
        """Tabulate the children of the first ``<div>`` of ``messages[integer]``.

        Returns a DataFrame with one row per child and the columns ``type``
        (the Python type of the node), ``name`` (its tag name, ``None`` for
        strings) and ``content`` (the node itself).
        """
        # Materialise once so the three columns are built from the same nodes.
        children = list(messages[integer].div.children)
        return pd.DataFrame({'type': [type(child) for child in children],
                             'name': [child.name for child in children],
                             'content': children})

    @staticmethod
    def get_abbr(integer: int) -> pd.DataFrame:
        """Return the rows of ``dataframe(integer)`` whose ``name`` is ``'abbr'``."""
        frame = Messages.dataframe(integer)
        return frame[frame.name == 'abbr']

    @staticmethod
    def sents(integer: int) -> list:
        """Return ``joined(integer)`` split into sentences by ``nltk.sent_tokenize``."""
        return nltk.sent_tokenize(Messages.joined(integer))

    @staticmethod
    def joined(integer: int) -> str:
        """Return the fragments from ``get(integer)`` joined by single spaces."""
        return ' '.join(Messages.get(integer))

    @staticmethod
    def split(integer: int) -> list:
        """Return ``joined(integer)`` split on whitespace into tokens."""
        return Messages.joined(integer).split()

In [9]:
class Spans:
    """Accessors that read one ``<span>`` class from every ``meta_lar`` entry.

    All methods are static and read the module-level ``meta_lar`` sequence;
    call them on the class itself, e.g. ``Spans.nums()``.
    """

    @staticmethod
    def _span_texts(css_class: str) -> list:
        """Return the text of the first ``<span class=css_class>`` of each entry."""
        return [meta.find('span', css_class).get_text() for meta in meta_lar]

    @staticmethod
    def nums() -> list:
        """Return the text of the ``num`` span of every entry."""
        return Spans._span_texts('num')

    @staticmethod
    def sources() -> list:
        """Return the text of the ``source`` span of every entry."""
        return Spans._span_texts('source')

    @staticmethod
    def names() -> list:
        """Return the text of the ``name`` span of every entry."""
        return Spans._span_texts('name')

    @staticmethod
    def dates():
        """Return the ``time`` spans of every entry converted by ``pd.to_datetime``.

        The span texts are passed with ``unit='s'`` and ``origin='unix'``.
        """
        date_list = Spans._span_texts('time')
        # NOTE(review): get_text() yields strings; this presumably relies on
        # each `time` span holding a number of seconds -- confirm against the
        # scraped markup.
        return pd.to_datetime(date_list, origin='unix', unit='s')

In [10]:
# Shorthand -> expansion table. A token from Messages.split(i) that equals one
# of these keys is meant to be swapped for the corresponding value.
# NOTE(review): Messages.split() tokenises with str.split(), so its tokens
# never contain whitespace; the keys "r v d" and " w " therefore cannot match
# a single token -- confirm how those two entries are meant to be applied.
replace_dict = {
    "r v d":   "republicans vs democrats",
    "rs":      "republicans",
    "r's":     "republicans",
    "d's":     "democrats",
    "ds":      "democrats",
    "[D]":     "Democratic",
    "US":      "United States",
    " w ":     "with",
    "w/":      "with",
    "&":       "and",
    "MSM":     "mainstream media",
    "ID":      "identification",
    "SA":      "Saudi Arabia",
    "MS-13":   "ms thirteen",
    "COVID19": "covid",
    "M's":     "marshalls",
}

### Using .string on the p tags saves us from having to substitute out hyperlinks
### Add a conditional on the end to drop None values from the list

In [11]:
# .text of every child of the first <div> of messages[10].
[child.text for child in messages[10].div.children]

['https://twitter.com/BrentScher/status/1322015793593360384',
 'Fact checkers created in effort to reinforce propaganda [digestion]?',
 'The battle to prevent truth from reaching the people.',
 'The battle to maintain and push division.',
 'Divided you are weak.',
 'Divided you fight each other.',
 'Divided you pose no threat.',
 'System of control.',
 'Information warfare.',
 'Q']

In [12]:
# Same children, but through .string, keeping only those where it is not None.
[child.string for child in messages[10].div.children if child.string is not None]

['Fact checkers created in effort to reinforce propaganda [digestion]?',
 'The battle to prevent truth from reaching the people.',
 'The battle to maintain and push division.',
 'Divided you are weak.',
 'Divided you fight each other.',
 'Divided you pose no threat.',
 'System of control.',
 'Information warfare.',
 'Q']

In [13]:
# Tag names of the nodes inside the `name` span of meta_lar[1]
# (None marks a bare string).
[node.name for node in meta_lar[1].find('span', 'name').children]

['strong', None]

In [14]:
# Tag name and attributes of each direct child of meta_lar[0].
[(node.name, node.attrs) for node in meta_lar[0].children]

[('span', {'class': ['num']}),
 ('span', {'class': ['time']}),
 ('span', {'class': ['name']}),
 ('span', {'class': ['source']}),
 ('span', {'class': ['copy']})]

In [15]:
# Open question: should an entire row be dropped when its name is 'br'?
# Check the type of the object Messages.dataframe returns.
type(Messages.dataframe(4950))

pandas.core.frame.DataFrame

Now that we have the index taken care of, we can focus on how to replace

abbr keys with their values in the dictionary we've been adding to.

Remember, there are abbrs we want to skip as exceptions: POTUS, FBI, HRC — abbrs

whose meaning is obvious and likely won't cloud our analysis of the language.

.index() returns the index of the value's first appearance in a list, not all of its indices

In [16]:
# Print each token of messages[4800] that is a key of replace_dict, together
# with the position list.index() reports for it. list.index() returns only the
# FIRST occurrence, so a repeated token prints the same position every time.
# The token list is built once; calling Messages.split(4800) again for every
# match would re-walk the parsed message each time.
tokens_4800 = Messages.split(4800)
for item in tokens_4800:
    if item in replace_dict:
        print(item, tokens_4800.index(item))

& 182
MSM 216
ID 246
ID 246
SA 290


In [24]:
# Positions of every token of messages[4800] that is a key of replace_dict.
index_list = [
    pos
    for pos, token in enumerate(Messages.split(4800))
    if token in replace_dict
]

In [23]:
# The token at position 182 as a one-element list (empty when the message has
# fewer than 183 tokens).
Messages.split(4800)[182:183]

['&']

In [25]:
# The tokens of messages[4800] found at the positions stored in index_list.
[token for pos, token in enumerate(Messages.split(4800)) if pos in index_list]

['&', 'ID', 'ID']

In [26]:
# Single lookup: the replacement stored for the '&' key.
replace_dict.get('&')

'and'

In [51]:
# Print every token of messages[4000], showing the replace_dict value instead
# of the token whenever the token is one of its keys.
for item in Messages.split(4000):
    print(replace_dict.get(item, item))

How
bad
is
the
corruption?
FBI
(past/present)
#1
#1
#2
+29
(16)
DOJ
(past/present)
#1
#1
#2
+18
STATE
(past/present)
#1
#1
+41
Removal
is
the
least
of
their
problems.
Projection.
Russia>D/
HRC
Twitter
Bots>
GOOG
operated
(not
Russia)/Narrative
and
Political
SLANT
BIDEN
/
CHINA.
BIG
DEVELOPMENT.
TRAITORS
EVERYWHERE.
AMERICA
FOR
SALE.
FLYNN.
Targeted.
Why?
Who
knows
where
the
bodies
are
buried?
CLEARED
OF
ALL
CHARGES.
TRUMP
ADMIN
v2?
Election
theft.
Last
hope.
Congressional
focus.
Impeach.
They
think
you
are
STUPID.
They
think
you
will
follow
the
STARS.
They
openly
call
you
SHEEP/CATTLE.
THERE
WILL
COME
A
TIME
NONE
OF
THEM
WILL
BE
ABLE
TO
WALK
DOWN
THE
STREET.
BIGGEST
FEAR.
PUBLIC
AWAKENING.
Q


In [18]:
# Show only the <abbr> rows among the children of the first <div> of
# messages[4800].
Messages.get_abbr(4800)

Unnamed: 0,type,name,content
3,<class 'bs4.element.Tag'>,abbr,[POTUS]
27,<class 'bs4.element.Tag'>,abbr,[HRC]
33,<class 'bs4.element.Tag'>,abbr,[MSM]
43,<class 'bs4.element.Tag'>,abbr,[ID]
47,<class 'bs4.element.Tag'>,abbr,[ID]
57,<class 'bs4.element.Tag'>,abbr,[SA]


In [20]:
# Sentence-tokenise the joined text of messages[4000].
Messages.sents(4000)

['How bad is the corruption?',
 'FBI  (past/present) #1 #1  #2 +29 (16) DOJ  (past/present) #1 #1 #2  +18  STATE (past/present) #1 #1 +41 Removal is the least of their problems.',
 'Projection.',
 'Russia>D/ HRC Twitter Bots> GOOG  operated (not Russia)/Narrative & Political SLANT BIDEN / CHINA.',
 'BIG DEVELOPMENT.',
 'TRAITORS EVERYWHERE.',
 'AMERICA FOR SALE.',
 'FLYNN.',
 'Targeted.',
 'Why?',
 'Who knows where the bodies are buried?',
 'CLEARED OF ALL CHARGES.',
 'TRUMP ADMIN v2?',
 'Election theft.',
 'Last hope.',
 'Congressional focus.',
 'Impeach.',
 'They think you are STUPID.',
 'They think you will follow the STARS.',
 'They openly call you SHEEP/CATTLE.',
 'THERE WILL COME A TIME NONE OF THEM WILL BE ABLE TO WALK DOWN THE STREET.',
 'BIGGEST FEAR.',
 'PUBLIC AWAKENING.',
 'Q']

In [33]:
# Select the index_df row whose `number` column equals 4000.
# NOTE(review): Messages.*(4000) addresses `messages` by list position, while
# this filters on the `number` column -- confirm the two refer to the same post.
index_df[index_df.number == 4000]

Unnamed: 0,number,datetime,name,source
953,4000,2020-04-29 00:58:18,Q !!Hs1Jq13jV6,8kun/qresearch8953725
