# Scraping Content from Results

This section will be used to parse the results file in `data/` and create a csv with the required outputs.

In [None]:
import numpy as np
import pandas as pd
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

## Sputnik Globe

In [None]:
# Load the search results and add empty placeholder columns for the scraped fields
df = pd.read_csv('../data/results_sputnikglobe_20201201_20211130.csv')
for new_col in ('text', 'pull_time', 'keywords', 'analytics_keywords'):
    df[new_col] = None

# Draw one log-normally distributed sleep time per row of df
mu, sigma = 0.5, 1.  # parameters passed to np.random.lognormal (mean, sigma)
s = np.random.lognormal(mean=mu, sigma=sigma, size=df.shape[0])

# Optionally plot sleep times
def plot_sleep_times(s=s):
    """Show a histogram of the sleep times with the log-normal pdf drawn on top.

    Parameters
    ----------
    s : array-like
        Sleep times to plot; defaults to the notebook-level ``s`` as it was
        when this function was defined.

    The pdf curve is computed from the notebook-level ``mu`` and ``sigma``.
    Returns None; the figure is displayed via ``plt.show()``.
    """
    # Imported here so matplotlib is only needed when the plot is requested.
    import matplotlib.pyplot as plt

    # Density-normalised histogram; only the bin edges are needed afterwards.
    _, bin_edges, _ = plt.hist(s, 600, density=True, align='mid')

    # Evaluate the log-normal pdf on a fine grid spanning the histogram range.
    grid = np.linspace(min(bin_edges), max(bin_edges), 10000)
    log_deviation = np.log(grid) - mu
    normaliser = grid * sigma * np.sqrt(2 * np.pi)
    density = np.exp(-log_deviation**2 / (2 * sigma**2)) / normaliser

    plt.plot(grid, density, linewidth=2, color='r')
    plt.axis('tight')
    plt.show()
# plot_sleep_times()

In [119]:
def refresh_driver(driver:webdriver.Firefox=None, e:Exception=None):
    """Quit ``driver`` (if one is given) and return a newly started Firefox driver.

    Parameters
    ----------
    driver : webdriver.Firefox, optional
        Existing driver to shut down before the new one is started.
    e : Exception, optional
        Exception that triggered the refresh; only used in the log message.

    Returns
    -------
    webdriver.Firefox
        The newly created driver.
    """
    if e is not None:
        print(f'Refreshing driver after exception {e}.')
    else:
        print('Refreshing driver.')
    if driver is not None:
        try:
            driver.quit()
        except WebDriverException as quit_error:
            # Bound to its own name so the parameter `e` is not rebound (and
            # then unbound) by the except clause. Quitting is best-effort: a
            # new driver is started regardless.
            print(f'Could not quit the old driver cleanly: {quit_error}')
    new_driver = webdriver.Firefox()
    return new_driver

In [None]:
# Initialize driver
driver = refresh_driver()

# Keep only the rows that still lack at least one scraped field.
# NOTE: `column == None` is evaluated element-wise by pandas and is False for
# every row, so missing values have to be detected with isna() instead.
scraped_cols = ['text', 'pull_time', 'keywords', 'analytics_keywords']
temp_df = df.loc[df[scraped_cols].isna().any(axis=1)]
n_to_pull = temp_df.shape[0]
unreachable_msg = '[ERROR] The url is not reachable at this time.'

# One by one, visit each site, pull its source, and store the tags of interest
# in the original dataframe. The try/finally guarantees the browser is closed
# even if an unexpected error interrupts the loop.
try:
    # .items() yields (index label, url), so each row is written through its
    # own scalar label, which is what DataFrame.at expects.
    for i, (df_index, row_url) in enumerate(temp_df['url'].items()):
        df.at[df_index, 'pull_time'] = time.strftime('%Y-%m-%dT%H:%M:%S', time.gmtime())
        try:
            driver.get(row_url)
        except WebDriverException as e:
            df.at[df_index, 'text'] = unreachable_msg
            df.at[df_index, 'keywords'] = unreachable_msg
            df.at[df_index, 'analytics_keywords'] = unreachable_msg
            driver = refresh_driver(driver=driver, e=e)
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        url_keywords = soup.find(name='meta', attrs={'name':'keywords'})
        if url_keywords:
            df.at[df_index, 'keywords'] = url_keywords.get('content', 'No keywords found.')
        else:
            df.at[df_index, 'keywords'] = 'No keywords found.'

        url_analytics_keywords = soup.find(name='meta', attrs={'name':'analytics:keyw'})
        if url_analytics_keywords:
            df.at[df_index, 'analytics_keywords'] = url_analytics_keywords.get(
                'content', 'No analytics keywords found.')
        else:
            df.at[df_index, 'analytics_keywords'] = 'No analytics keywords found.'

        # Either div can be absent on some pages; concatenate whatever exists
        # instead of crashing the whole run on a missing tag.
        text_header = soup.find(name='div', attrs={'class':'article__announce-text'})
        text_body = soup.find(name='div', attrs={'class':'article__body'})
        text_parts = [tag.text for tag in (text_header, text_body) if tag is not None]
        df.at[df_index, 'text'] = ''.join(text_parts) if text_parts else 'No text found.'

        if i % 200 == 0:
            # Report progress as a real percentage of the rows being pulled.
            print(f'{round(100 * i / n_to_pull, 2)}% of {n_to_pull} completed...')
            if i > 0:
                # Restart the browser periodically; at i == 0 it is brand new.
                driver = refresh_driver(driver=driver)
        time.sleep(s[i])
finally:
    driver.quit()


0.0% completed...
0.04441483455474128% completed...


NameError: name 'WebDriverException' is not defined

Checking to see if things look okay.

In [118]:
# Peek at the first five rows to sanity-check the scraped columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,date,url,title,text,pull_time,keywords,analytics_keywords
0,0,20210430,https://sputnikglobe.com/20210430/biden-us-mil...,Biden: US Military Brass Will Decide Whether t...,"Since the onset of the COVID-19 pandemic, the ...",2025-04-12T21:46:33,"military & intelligence, newsfeed, joe biden, ...","military, news, tag_JoeBiden, organization_US_..."
1,1,20210430,https://sputnikglobe.com/20210430/russian-vacc...,Russian Vaccines Protect From All Known Strain...,"Since the onset of the COVID-19 pandemic, the ...",2025-04-12T21:46:45,,"military, news, tag_JoeBiden, organization_US_..."
2,2,20210430,https://sputnikglobe.com/20210430/hope-theres-...,‘Hope There's No Quid Pro Quo’: Is US Expectin...,The US said on Thursday that it would send med...,2025-04-12T21:46:50,"newsfeed, china, joe biden, afghanistan, us","world, news, geo_China, tag_JoeBiden, covid-19..."
3,3,20210430,https://sputnikglobe.com/20210430/albania-appr...,Albania Approves Use of Russian COVID-19 Vacci...,MOSCOW (Sputnik) - Albania has approved the us...,2025-04-12T21:46:54,"newsfeed, russian direct investment fund (rdif...","news, world, tag_RussianDirectInvestmentFundRD..."
4,4,20210430,https://sputnikglobe.com/20210430/yogi-adityan...,Will Yogi Adityanath Fight Back After Supreme ...,"The Indian government, led by Prime Minister N...",2025-04-12T21:47:01,"newsfeed, indians, coronavirus, supreme court,...","news, world, keyword_Indians, organization_Ind..."


Notice that the first two rows have identical text, keywords, and analytics_keywords (from what is visible). 
Are there other weird duplicates?

In [91]:
# Count how often each keywords string occurs (missing keywords are kept as
# their own group), ordered from the smallest count to the largest.
keyword_sizes = df.groupby(['keywords'], as_index=False, dropna=False).size()
dup_counts = keyword_sizes.sort_values(by='size')
# Keep only the keyword strings that appear on more than one row.
dups = dup_counts[dup_counts['size'] > 1]
dups

Unnamed: 0,keywords,size
283,"us, newsfeed",2
246,"newsfeed, world health organization (who)",2
160,"newsfeed, middle east, iran",2
196,"newsfeed, russia, vector",2
73,"newsfeed, boris johnson, united kingdom (uk)",2
95,"newsfeed, coronavirus, united kingdom (uk)",2
110,"newsfeed, europe, european medicines agency, a...",2
168,"newsfeed, narendra modi",2
54,"military & intelligence, newsfeed, joe biden, ...",2
46,"middle east, newsfeed, benjamin netanyahu, israel",2


Use the following cell to walk through the duplicated keyword groups one by one (change the position passed to `dups.iloc`) to see if anything stands out as odd.

In [116]:
# Show every row whose keywords match the duplicate group at position 8 of `dups`.
dup_keywords = dups.iloc[8]['keywords']
df[df['keywords'] == dup_keywords]

Unnamed: 0.1,Unnamed: 0,date,url,title,text,pull_time,keywords,analytics_keywords
0,0,20210430,https://sputnikglobe.com/20210430/biden-us-mil...,Biden: US Military Brass Will Decide Whether t...,"Since the onset of the COVID-19 pandemic, the ...",2025-04-12T21:46:33,"military & intelligence, newsfeed, joe biden, ...","military, news, tag_JoeBiden, organization_US_..."
1,1,20210430,https://sputnikglobe.com/20210430/russian-vacc...,Russian Vaccines Protect From All Known Strain...,"Since the onset of the COVID-19 pandemic, the ...",2025-04-12T21:46:45,"military & intelligence, newsfeed, joe biden, ...","military, news, tag_JoeBiden, organization_US_..."


In [45]:
# Parse a saved example page. The parser is named explicitly so the result does
# not depend on which optional parsers are installed (and to avoid bs4's
# GuessedAtParserWarning).
with open('../data/sputnik_example_1.txt', mode='r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

Weird-looking urls: 
- [text in a box](https://sputnikglobe.com/20210825/live-updates-16-evacuees-from-afghanistan-test-positive-for-covid-19-in-india-as-delta-cases-rise-1083705747.html)
    - The boxes start at `<div class="online__item-text m-last">` and appear to be previous articles. It is unclear whether they should be included in `text` too; right now they are not included.
- [multiple texts in box](https://sputnikglobe.com/20210824/live-updates-india-covid-19-cases-rise-to-3247-million-1083697851.html)
    - same as text in a box
- [tweets](https://sputnikglobe.com/20210823/seven-members-of-dalit-family-in-india-severely-beaten-over-black-magic-allegation-1083692603.html)
    - The tweets are contained in `<blockquote class="twitter-tweet" align="center" data-link-color="#069">`. It is unclear whether they should be included in `text` too; right now they are not included.

Additional keywords to pull
- google_tags. E.g., `<script type="text/javascript" id="" charset="">ym(google_tag_manager["rm"]["11997873"](44),"params",{tags:{1082775132:["Military \x26 Intelligence, Newsfeed, Joe Biden, US military, COVID-19, Vaccines, US soldiers, mandate"]}});</script>`