# Data Gathering

In [None]:
from config import app_id, api_key

from bs4 import BeautifulSoup as BS
import requests
import json
import time
import datetime

import pandas as pd
import numpy as np
import datetime

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from textblob import TextBlob

import matplotlib.pyplot as plt
import seaborn           as sns
sns.set_style("darkgrid")

## The New York Times: API Requests

In [None]:
def call_nyt_by_year_and_page(year, page):
    """Request one page of NYT Article Search results for a calendar year.

    Args:
        year: Year to search; the query spans January 1 to December 31.
        page: Zero-based page number passed to the API's ``page`` parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status (for
            example when the rate limit is exceeded).
        requests.RequestException: On timeouts and connection failures.
    """
    # NOTE(review): `q=s&n500` is parsed as q='s' plus a stray `n500`
    # parameter. Presumably a search for "S&P 500" was intended, which would
    # need URL-encoding -- confirm before changing, as it alters the results.
    root_url   = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?q=s&n500&sort=newest'
    begin_date = '&begin_date={}0101'.format(str(year))
    end_date   = '&end_date={}1231'.format(str(year))
    pagination = '&page={}'.format(str(page))
    doc_params = '&fl=web_url&fl=snippet&fl=pub_date&fl=_id&fl=lead_paragraph'

    url_without_key = root_url + begin_date + end_date + pagination + doc_params
    # Log progress without the credential so the API key never ends up in
    # saved notebook output.
    print(url_without_key)

    # requests has no default timeout; without one a stalled connection would
    # hang the whole download.
    response = requests.get(url_without_key + '&api-key=' + api_key, timeout=30)
    # Fail here with a clear HTTP error instead of returning an error payload
    # that lacks the 'response' key callers index into.
    response.raise_for_status()

    return response.json()

In [None]:
def call_nyt_by_year(year):
    """Fetch every page of article-search results for one calendar year.

    The first page is requested once and used both for the total hit count
    and for its documents; the remaining pages are then requested in order,
    with a 7-second pause after every request.

    Args:
        year: Year passed through to ``call_nyt_by_year_and_page``.

    Returns:
        A list with the ``docs`` entries of all pages, in page order.
    """
    first_page = call_nyt_by_year_and_page(year, 0)
    hits = first_page['response']['meta']['hits']

    # Pages are treated as holding 10 results each. Round up so a partially
    # filled last page is fetched too (hits=25 -> pages 0, 1 and 2).
    # NOTE(review): the API may cap how deep pagination can go -- confirm
    # against the Article Search documentation for years with many hits.
    page_count = (hits + 9) // 10

    annual_articles = list(first_page['response']['docs'])
    time.sleep(7)

    for page in range(1, page_count):
        query = call_nyt_by_year_and_page(year, page)
        annual_articles.extend(query['response']['docs'])
        # Keep the same spacing between consecutive API requests.
        time.sleep(7)

    return annual_articles

In [None]:
def call_nyt_by_all_years(years):
    """Request all pages of articles for all the given years.

    Calls ``call_nyt_by_year`` once per entry of ``years``, in order, and
    sleeps 7 seconds after each year.

    Returns:
        A single flat list of article dictionaries covering every year.
    """
    collected = []

    for current_year in years:
        collected.extend(call_nyt_by_year(current_year))
        time.sleep(7)

    return collected

In [None]:
# Years to download: range(2010, 2020) yields 2010 through 2019.
all_years    = list(range(2010,2020))
# One flat list holding the article records returned for all of those years.
all_articles = call_nyt_by_all_years(all_years)

In [None]:
# Persist the raw API results so the download does not have to be repeated.
with open("nyt_api.json", "w") as write_file:
    json.dump(all_articles, write_file)
    
# Load the records back from disk; every later cell works on `data`.
with open("nyt_api.json", "r") as read_file:
    data = json.load(read_file)

## The New York Times: Web Scraping

In [None]:
def get_nyt_text(url):
    """Download an article page and return the text of its body paragraphs.

    Only ``<p>`` elements selected by the class string
    ``'css-18icg9x evys1bk0'`` are used; their texts are concatenated with
    no separator.

    Args:
        url: Address of the article page.

    Returns:
        The concatenated paragraph text, or ``''`` when no paragraph matches.
        The HTTP status code is not checked: whatever body is returned gets
        parsed.

    Raises:
        requests.RequestException: On timeouts (5 seconds) and connection
            failures.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    page = requests.get(url, headers=headers, timeout=5)

    soup = BS(page.content, 'html.parser')
    # find_all is the current name of the deprecated findAll alias.
    paragraphs = soup.find_all('p', class_='css-18icg9x evys1bk0')

    return ''.join(paragraph.get_text() for paragraph in paragraphs)

In [None]:
# Scrape the article text for every record. A single timeout or connection
# error must not throw away the whole run, so a failed download is reported
# and stored as an empty string -- the value the later cells already treat as
# "no article text" (zero length, zero subjectivity).
for i, record in enumerate(data):
    print(i)
    print(record['web_url'])
    try:
        record['article'] = get_nyt_text(record['web_url'])
    except requests.RequestException as err:
        print('request failed: {}'.format(err))
        record['article'] = ''

## Natural Language Processing: VADER Sentiment Analysis

In [None]:
si = SentimentIntensityAnalyzer()

# Score each text once and copy the four VADER measures into the record.
# Keys are written in the order snippet neg/neu/pos/compound, then lead
# neg/neu/pos/compound, so the resulting column order is unchanged.
for record in data:
    for label, field in (('snippet', 'snippet'), ('lead', 'lead_paragraph')):
        scores = si.polarity_scores(record[field])
        for measure in ('neg', 'neu', 'pos', 'compound'):
            record['VADER {} {}'.format(label, measure)] = scores[measure]

## Natural Language Processing: TextBlob Sentiment Analysis

In [None]:
from textblob import TextBlob

# Analyse each text once and store both sentiment components. Keys are
# written in the order snippet, lead, article (polarity before subjectivity),
# so the resulting column order is unchanged.
for record in data:
    for label, field in (('snippet', 'snippet'),
                         ('lead', 'lead_paragraph'),
                         ('article', 'article')):
        polarity, subjectivity = TextBlob(record[field]).sentiment
        record['TextBlob {} polarity'.format(label)] = polarity
        record['TextBlob {} subjectivity'.format(label)] = subjectivity

In [None]:
# Save the records, now carrying article text and both sets of sentiment
# scores, so the scraping and scoring steps need not be rerun.
with open("nyt_api_and_articles_vader_textblob.json", "w") as write_file:
    json.dump(data, write_file)

## Cleaning, Counting and Aggregating DataFrame Records by Date

In [None]:
# Tally the records by whether scraping produced any article text.
null_articles = sum(1 for record in data if len(record['article']) == 0)
full_articles = len(data) - null_articles

In [None]:
# Number of records whose scraped article text is empty.
null_articles

In [None]:
# Number of records with non-empty scraped article text.
full_articles

In [None]:
# One row per article record, plus a `date` column holding the calendar date
# parsed from `pub_date`.
sentiment = pd.DataFrame(data).assign(
    date=lambda frame: pd.to_datetime(frame['pub_date']).dt.date
)
sentiment.head()

In [None]:
# Keep only the rows whose article-level TextBlob subjectivity is not zero.
has_subjectivity = sentiment['TextBlob article subjectivity'].ne(0)
sentiment_clean = sentiment.loc[has_subjectivity]
sentiment_clean.head()

In [None]:
# Average the sentiment scores per publication date. The frame also holds
# text columns (URL, snippet, article, ...); numeric_only=True skips them
# explicitly, since pandas >= 2.0 raises on them instead of dropping them.
sentiment_clean = sentiment_clean.groupby('date').mean(numeric_only=True)
sentiment_clean

In [None]:
# Articles per publication date. `sentiment_clean` has already been reduced
# to one row of means per date at this point, so counting its rows cannot
# give articles per day; count the record-level frame instead, with the same
# non-zero-subjectivity filter applied.
filtered_records = sentiment[sentiment['TextBlob article subjectivity'] != 0]
counts = filtered_records.groupby('date')['_id'].count().to_frame('article count')
counts

In [None]:
# Attach the per-date article count to the per-date sentiment values:
# 'date' on the left side is matched against the index of `counts`.
# NOTE(review): assumes `sentiment_clean` exposes 'date' as a column or as
# its index level name at this point -- confirm against the cell order.
sentiment_counts = sentiment_clean.merge(counts, left_on='date', right_index=True)
sentiment_counts

In [None]:
# Save the merged frame so the final section can start from the file.
sentiment_counts.to_pickle("sentiment_counts.pkl")

## Final DataFrame with Sentiment Analysis by Date

In [None]:
# Reload the frame written by the to_pickle cell.
# NOTE: unpickling can execute arbitrary code; only load a file that this
# notebook produced itself.
sentiment_counts = pd.read_pickle("sentiment_counts.pkl")
sentiment_counts.head()

In [None]:
import matplotlib.pyplot as plt
# Draw a histogram for each column of the per-date frame on a 14x14 figure.
axes = sentiment_counts.hist(figsize=(14,14))