In [19]:
import json
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
from pymongo import MongoClient

## Dumping NYT's articles with main context and their metadata

In [2]:
def removekey(article_metadata):
    keys_to_remove = ["multimedia", "lead_paragraph","_id"]
    for key in keys_to_remove:
        del article_metadata[key]

In [3]:
# get id and text for filtered articles 
with open('feb_dict.json','r') as json_file:
    feb_text_filtered = json.load(json_file)

In [4]:
# get ids and make it into int for indexing
feb_id_keys = list(feb_text_filtered.keys())
feb_article_ids = list(map(int, feb_id_keys)) 

In [5]:
# get metadata for articles
with open('NYT_feb_data.txt','r') as json_file:
    feb_medadata = json.load(json_file)

In [6]:
# filter metadata using filtered text's keys
feb_metadata_filtered = [feb_medadata[i] for i in feb_article_ids]

In [7]:
# adding article_indexing to each metadata dict
for article_meta in list(enumerate(feb_metadata_filtered)): 
    removekey(article_meta[1])
    pos = article_meta[0]
    article_meta[1].update({"article_index": feb_article_ids[pos]})

In [8]:
# fixing article's text dict, which will have article_index and main_content as keys
feb_text_list = []
for key, values in feb_text_filtered.items():
    raw_keys = ['article_index', 'main_content']
    raw_vals = [key, values]
    item_dict = {raw_keys[i]: raw_vals[i] for i in range(len(raw_keys))}
    feb_text_list.append(item_dict)

In [9]:
# get id and text for filtered articles 
with open('march_dict.json','r') as json_file:
    march_text_filtered = json.load(json_file)

In [10]:
# get ids and make it into int for indexing
march_id_keys = list(march_text_filtered.keys())
march_article_ids = list(map(int, march_id_keys)) 

In [11]:
# get metadata for articles
with open('NYT_march_data_2.txt','r') as json_file:
    march_medadata = json.load(json_file)

In [12]:
# filter metadata using filtered text's keys
march_metadata_filtered = [march_medadata[i] for i in march_article_ids]

In [14]:
# adding article_indexing to each metadata dict
for article_meta in list(enumerate(march_metadata_filtered)): 
    removekey(article_meta[1])
    pos = article_meta[0]
    article_meta[1].update({"article_index": march_article_ids[pos]})

In [15]:
# fixing article's text dict, which will have article_index and main_content as keys
march_text_list = []
for key, values in march_text_filtered.items():
    raw_keys = ['article_index', 'main_content']
    raw_vals = [key, values]
    item_dict = {raw_keys[i]: raw_vals[i] for i in range(len(raw_keys))}
    march_text_list.append(item_dict)

In [16]:
# combine lists of metadata and raw text for articles
metadata_all = feb_metadata_filtered+march_metadata_filtered
text_all = feb_text_list+march_text_list

In [17]:
# Connect to the MongoDB and create database "DDR-ML-Final"
# each article's main content goes to collection "article_main_content"
client = MongoClient('localhost', 27017)
db = client['DDR-ML-Final']
col_main = db['article_main_content']
col_main.insert_many(text_all)

<pymongo.results.InsertManyResult at 0x7ffb78826dc0>

In [18]:
# each article's metadata goes to collection "article_metadata"
col_meta = db['article_metadata']
col_meta.insert_many(metadata_all)

<pymongo.results.InsertManyResult at 0x7ffb6a4886e0>

## WebScraping Delegates data and Dump into DB

In [20]:
## Navigate to each url, & save article information
headers = {'user-agent':'Mozilla/5.0'}
url = 'https://www.nytimes.com/interactive/2020/us/elections/delegate-count-primary-results.html'
response = requests.get(url, headers)

# save the html file
with open('delegate_counts.htm', 'w') as file:
    file.write(response.text)
    file.close

In [21]:
#open the file and parse to soup object
with open('delegate_counts.htm','r') as file:
    soup = BeautifulSoup(file)

In [22]:
#get only the rows of the table, so you can extract data
table_rows = soup.find_all("tr", class_ ="g-event")

In [23]:
### this code block will result in an error, but it captures all the data we need so that's fine
states_list = []
biden_delegates_list = []
sanders_delegates_list = []
biden_wins_list = []
sanders_wins_list = []

for row in table_rows:
    soup = BeautifulSoup(str(row))
    
    #get the state and add to list
    state = soup.find("span", class_="g-full-name").string
    states_list.append(state)
    
    #set each value to null at the beginning of each run of the lopp
    biden_delegates_nonwinner = np.nan
    biden_delegates_winner = np.nan
    sanders_delegates_nonwinner = np.nan
    sanders_delegates_winner = np.nan
    
    #if Biden did not win, get delegate count from table row
    try:
        biden_delegates_nonwinner = int(soup.find("td", class_="g-cand-wide g-cand g-biden in").string)
    #if that table row is not present, it means Biden won and need to get value as sibling of checkmark image
    except AttributeError:
        biden_delegates_winner = int(soup.find("img", class_="g-checkmark").next_sibling)
    #if biden_delegates_winner is > 0 (aka not null), it means Biden won the state so use that delegate value
    if biden_delegates_winner > 0:
        biden_delegates = biden_delegates_winner
        biden_winner = 1
    #if Biden lost the state, use biden_delegates_nonwinner value
    else:
        biden_delegates = biden_delegates_nonwinner
        biden_winner = 0
    #add delegate count & whether Biden won to list
    biden_delegates_list.append(biden_delegates)
    biden_wins_list.append(biden_winner)
    
    #if Sanders did not win, get delegate count from table row
    try:
        sanders_delegates_nonwinner = int(soup.find("td", class_="g-cand-wide g-cand g-sanders in").string)
    #if that table row is not present, it means Sanders won and need to get value as sibling of checkmark image
    except AttributeError:
        sanders_delegates_winner = int(soup.find("img", class_="g-checkmark").next_sibling)
    #if sanders_delegates_winner is > 0 (aka not null), it means Sanders won the state so use that delegate value
    if sanders_delegates_winner > 0:
        sanders_delegates = sanders_delegates_winner
        sanders_winner = 1
    #if Sanders lost the state, use sanders_delegates_nonwinner value
    else:
        sanders_delegates = sanders_delegates_nonwinner
        sanders_winner = 0
    #add delegate count & whether Sanders won to list
    sanders_delegates_list.append(sanders_delegates)
    sanders_wins_list.append(sanders_winner)
    
    

AttributeError: 'NoneType' object has no attribute 'string'

In [24]:
dictionary = {'state':states_list, \
              'biden_delegates':biden_delegates_list, \
              'sanders_delegates':sanders_delegates_list, \
              'biden_win':biden_wins_list, \
              'sanders_win':sanders_wins_list}

In [25]:
df = pd.DataFrame(dictionary)

In [27]:
delegates_records = df.to_dict('records')

In [29]:
col_delegates = db['delegates']
col_delegates.insert_many(delegates_records)

<pymongo.results.InsertManyResult at 0x7ffb6b5e7eb0>

## Funding info for Bernie and Joe

In [30]:
# get funding info
with open('finances.json','r') as json_funding:
    funding = json.load(json_funding)

In [33]:
col_funding = db['fec Filings']
col_funding.insert_many(funding)

<pymongo.results.InsertManyResult at 0x7ffa0828bf00>