In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
# Runs the Colab user authentication, then hands the application-default
# credentials to a fresh GoogleAuth object.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the module-level client that ListFolder uses to list and
# download files.
drive = GoogleDrive(gauth)

In [2]:
import os

# choose a local (colab) directory to store the data.
local_root_path = os.path.expanduser("~/data")
# exist_ok=True makes re-running this cell a no-op when the directory already
# exists, while genuine failures (e.g. missing permissions) are no longer
# silently swallowed by a bare `except`.
os.makedirs(local_root_path, exist_ok=True)

def ListFolder(google_drive_id, destination):
  """Recursively copy the contents of a Google Drive folder to local disk.

  Args:
    google_drive_id: ID of the Drive folder whose direct children are listed.
    destination: Local directory that receives the files. It is not created
      here; one sub-directory is created per Drive sub-folder.

  Uses the module-level PyDrive client `drive`. Prints one line per local
  directory handled and, per directory, the number of files fetched.
  """
  file_list = drive.ListFile({'q': "'%s' in parents and trashed=false" % google_drive_id}).GetList()
  counter = 0
  for f in file_list:
    # A Drive folder becomes a local directory that is filled recursively.
    if f['mimeType']=='application/vnd.google-apps.folder':
      folder_path = os.path.join(destination, f['title'])
      # exist_ok=True keeps a re-run of the notebook from failing on
      # directories that a previous run already created.
      os.makedirs(folder_path, exist_ok=True)
      print('creating directory {}'.format(folder_path))
      ListFolder(f['id'], folder_path)
    else:
      fname = os.path.join(destination, f['title'])
      f_ = drive.CreateFile({'id': f['id']})
      f_.GetContentFile(fname)
      counter += 1
  # NOTE: the message says "uploaded", but the files are fetched from Drive
  # into `destination`; the wording is kept so existing output stays comparable.
  print('{} files were uploaded in {}'.format(counter, destination))

# Fetch the Drive folder with this hard-coded ID into the local data directory.
ListFolder("1A2XBt4MTMmEmgMCmKEuJQCh1HaJGmkSa", local_root_path)

2 files were uploaded in /root/data


In [0]:
from requests import get
import json
import re 
from textblob import TextBlob
import datetime
import numpy as np
import pandas as pd

In [0]:
# perform request
def perform_request(year, month, api_key=None, timeout=30):
    """Request one month of articles from the NYT Archive API.

    Args:
        year: Year of the archive month to fetch.
        month: Month of the archive month to fetch.
        api_key: API key to send. When None, the NYT_API_KEY environment
            variable is used, falling back to the key that was previously
            embedded in this notebook.
        timeout: Seconds to wait for the server before the request is
            aborted, so a stalled connection cannot hang the notebook.

    Returns:
        The response object returned by `get`. The HTTP status is not
        checked here; callers inspect the JSON payload themselves.
    """
    import os  # local import keeps this cell runnable on its own

    if api_key is None:
        # SECURITY: credentials do not belong in source control. Prefer the
        # environment variable; the literal fallback only keeps existing runs
        # working and should be rotated and removed.
        api_key = os.environ.get('NYT_API_KEY', 's6J10zzMfa9Vm5q1AufUu4gTovcVJEmo')
    req = 'https://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={}'
    req = req.format(str(year), str(month), api_key)
    # return response
    return get(req, timeout=timeout)

In [0]:
# match keywords with article id
def match_id_to_keyword(response):
    """Map each article id in an archive response to its lower-cased keywords.

    Reads ``response.json()["response"]["docs"]``; for every article the
    ``"value"`` of each entry in ``article["keywords"]`` is lower-cased and
    collected under the article's ``"_id"``. A KeyError propagates when any
    of these keys is missing.
    """
    articles = response.json()["response"]["docs"]
    return {
        article["_id"]: [entry["value"].lower() for entry in article["keywords"]]
        for article in articles
    }

In [0]:
# find articles which contain keyword
def get_article_id(queue, id_to_keyword):
    """Return the ids of articles with at least one keyword containing `queue`.

    Args:
        queue: Search term; converted with str() and matched as a substring.
        id_to_keyword: Mapping of article id to a list of keyword strings.

    Returns:
        A list of matching ids in the mapping's iteration order. Each id
        appears at most once, even when several of its keywords match.
    """
    needle = str(queue)
    return [
        article_id
        for article_id, keywords in id_to_keyword.items()
        if any(needle in keyword for keyword in keywords)
    ]

In [0]:
# get article information for article_id
def get_article_information_for_queue(queue, response, id_to_keyword):
    """Collect date, snippet and headline of the articles matching `queue`.

    Args:
        queue: Search term handed to get_article_id (after str()).
        response: Object whose ``json()`` yields
            ``{"response": {"docs": [...]}}``.
        id_to_keyword: Mapping of article id to keyword list, used to select
            the matching ids.

    Returns:
        A tuple ``(id_to_date, id_to_snippet, id_to_headline)`` of dicts keyed
        by article id, filled from ``pub_date``, ``snippet`` and
        ``headline.main`` of each matching article.

    Raises:
        KeyError: when the payload lacks one of the keys read here.
    """
    ids = get_article_id(str(queue), id_to_keyword)
    # Index the wanted ids by their string form once, instead of scanning the
    # whole id list for every article. setdefault keeps the first id per
    # string form, which is the one the former linear scan would have hit.
    wanted = {}
    for article_id in ids:
        wanted.setdefault(str(article_id), article_id)

    id_to_date = {}
    id_to_snippet = {}
    id_to_headline = {}
    docs = response.json()["response"]["docs"]
    if not wanted:
        return (id_to_date, id_to_snippet, id_to_headline)
    for article in docs:
        key = str(article["_id"])
        if key in wanted:
            article_id = wanted[key]
            id_to_date[article_id] = article["pub_date"]
            id_to_snippet[article_id] = article["snippet"]
            id_to_headline[article_id] = article["headline"]["main"]
    return (id_to_date, id_to_snippet, id_to_headline)

In [0]:
def clean_text(text):
    """Strip mentions, URLs and punctuation from `text` and squeeze whitespace.

    Every ``@name`` mention, every ``scheme://...`` URL and every character
    that is not an ASCII letter, digit, space or tab is replaced by a space;
    the remaining words are then joined with single spaces.

    Returns an empty string for empty or None input.
    """
    if not text:
        return ''
    # Raw string: the pattern contains regex escapes (\w, \S) that are invalid
    # escape sequences in a normal string literal.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)"
    return ' '.join(re.sub(pattern, " ", text).split())

In [0]:
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of `text`.

    The text is first normalised with clean_text, then wrapped in a TextBlob
    whose ``sentiment.polarity`` is returned.
    """
    return TextBlob(clean_text(text)).sentiment.polarity

In [0]:
# perform sentiment analysis for headline snippets
def get_sentiments(id_to_snippet):
    """Return a dict mapping each article id to get_sentiment of its snippet."""
    return {
        article_id: get_sentiment(text)
        for article_id, text in id_to_snippet.items()
    }

In [14]:
import time
from tqdm import tqdm

# Seconds to wait before the single retry of a request whose payload lacked
# the expected keys (this notebook treats that as the API's request limit).
RETRY_DELAY_SECONDS = 60


def _fetch_day_sentiments(year, month, show_response=False):
    """Request one month and group the "dow jones" sentiments by day of month.

    Returns a dict mapping the day (int, taken from characters 8-9 of each
    article's publication date) to the list of sentiment values of the
    matching articles, in the order the articles were collected.
    Raises KeyError when the response payload lacks the expected keys.
    """
    response = perform_request(year, month)
    if show_response:
        print(response)
    id_to_keyword = match_id_to_keyword(response)
    (id_to_date, id_to_snippet, id_to_headline) = get_article_information_for_queue(
        "dow jones", response, id_to_keyword)
    id_to_sentiment = get_sentiments(id_to_snippet)
    print('did request')
    print("iteration year: " + str(year) + " month: " + str(month))

    day_to_sentiments = {}
    for article_id, date in id_to_date.items():
        try:
            day = int(date[8:10])
        except (TypeError, ValueError):
            # An article without a usable date cannot be assigned to a row.
            continue
        day_to_sentiments.setdefault(day, []).append(id_to_sentiment[article_id])
    return day_to_sentiments


def _load_month(year, month):
    """Fetch a month's sentiments, retrying once after a KeyError.

    Returns an empty dict when the month cannot be loaded, so rows of that
    month keep their default sentiment instead of inheriting the values of
    the previously loaded month.
    """
    try:
        return _fetch_day_sentiments(year, month)
    except KeyError:
        # wait before doing the request again due to request limit of api
        time.sleep(RETRY_DELAY_SECONDS)
        try:
            return _fetch_day_sentiments(year, month, show_response=True)
        except Exception as error:
            print("giving up on year: {} month: {} ({!r})".format(year, month, error))
    except Exception as error:
        print("request failed for year: {} month: {} ({!r})".format(year, month, error))
    return {}


# For each row add sentiment
df = pd.read_csv("/root/data/DJI.csv")
# initial sentiment to 0 (float, because polarities are written into it later)
df['Sentiment'] = 0.0

# Month currently held in `day_to_sentiments`; None until the first request.
year = None
month = None
day_to_sentiments = {}

# add sentiment to dates where news exist
for index, date_text in df['Date'].items():
    try:
        year_df = int(date_text[:4])
        month_df = int(date_text[5:7])
        day_df = int(date_text[8:10])
    except (TypeError, ValueError):
        print("skipping row {}: cannot parse date {!r}".format(index, date_text))
        continue

    # only perform request when params changed
    if (year_df, month_df) != (year, month):
        year = year_df
        month = month_df
        day_to_sentiments = _load_month(year, month)

    # add sentiment to df when news exist; with several articles on the same
    # day the last one wins.
    for sentiment in day_to_sentiments.get(day_df, []):
        df.at[index, 'Sentiment'] = sentiment
        print("sentiment changed")


<Response [200]>
did request
iteration year: 1985 month: 1
sentiment changed
sentiment changed
did request
iteration year: 1985 month: 2
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
did request
iteration year: 1985 month: 3
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
did request
iteration year: 1985 month: 4
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
sentiment changed
did request
iteration year: 1985 month: 5
sentiment changed
sentiment changed

In [0]:
# Write the frame, including the 'Sentiment' column, to a CSV file in the
# current working directory.
df.to_csv('./dow_jones_sentiments.csv')