In [1]:
#import libraries
import requests
from dotenv import load_dotenv
import pandas as pd
import os
import time
import numpy as np
#from bs4 import BeautifulSoup

#### Main API Query Code

In [2]:
#load the officer roster; each row supplies the name and year for one API query
officer_csv = "../data/officer_names.csv"
members = pd.read_csv(officer_csv)

In [3]:
#format year for api search: keep the first four characters and pin the date to January 1st (YYYY0101)
year_prefix = members["year"].astype(str).str.slice(0, 4)
members["year"] = year_prefix + "0101"

In [4]:
#load api key
# load_dotenv() makes variables from a local .env file available through os.environ
load_dotenv()
api_key = os.environ.get('georgetown_api_key')
if not api_key:
    # stop here: without a key every request in the query loop would fail,
    # and the loop still sleeps between requests for every officer
    raise RuntimeError("georgetown_api_key is not set; add it to .env or the environment")

In [5]:
# NYT Article Search endpoint that the query loop sends its requests to
base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

In [29]:
# responses gathered by the query loop for the current batch, in request order
storage = []

In [30]:
# position in members at which the query loop starts
# NOTE(review): presumably the row where the previous batch stopped - confirm before re-running
offset = 3647

In [31]:
# Query the Article Search API once per officer, starting at members row `offset`.
# storage has to stay position-aligned with members (storage[k] is the answer
# for members row offset + k) because the preprocessing cells pair them by position.
REQUEST_PAUSE_SECONDS = 22    # pause between calls so we don't have issues with API rate limits
REQUEST_TIMEOUT_SECONDS = 30  # give up on a stalled connection instead of hanging the kernel
ONE_YEAR = 10000              # adding this to a YYYYMMDD integer moves it by exactly one year

for row in range(offset, len(members)):
    #setting parameters
    # NOTE(review): the name is not wrapped in quotes, so a multi-word name may
    # not be matched as a phrase - confirm against the Article Search query syntax
    search = members["officer_names_lower"].iloc[row] + " AND (police OR officer OR NYPD) AND (misconduct OR force OR brutality OR violence)"
    # members["year"] holds "YYYY0101", so the window runs from January 1st of
    # the previous year to January 1st of the following year
    jan_first = int(members["year"].iloc[row])
    start = jan_first - ONE_YEAR
    end = jan_first + ONE_YEAR

    #request
    try:
        response = requests.get(
            base_url,
            params={
                "q": search,
                "api-key": api_key,
                "begin_date": str(start),
                "end_date": str(end)
            },
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.exceptions.RequestException as err:
        # network failure or timeout: stop like a 429 so nothing is recorded
        # for this row and the run can be resumed from it
        print("Request Failed:", err, "- resume with offset =", row)
        break

    #checking status
    if response.status_code == 200:
        storage.append(response.json()["response"])
    elif response.status_code == 429:
        # rate limited: stop and report where to pick up again
        print("Request Failed:", response.status_code, "- resume with offset =", row)
        break
    else:
        print("Request Failed:", response.status_code, "for row", row)
        # keep storage aligned with members: record an "unknown" result
        # (hits=None, no docs) instead of silently skipping this officer
        storage.append({"meta": {"hits": None}, "docs": []})

    #set sleep time so we don't have issues with API rate limits
    time.sleep(REQUEST_PAUSE_SECONDS)

In [32]:
len(storage) #number of responses stored for this batch

353

In [33]:
def processing_json(storage, members, i, offset=0):
    """
    Build the results frame for one stored API response.

    Parameters
    ----------
    storage : list of dict
        Stored responses; each entry is read as entry["meta"]["hits"]
        and entry["docs"].
    members : pandas.DataFrame
        Frame with "officer_names_lower" and "year" columns; "year" must
        hold strings whose first four characters are the year.
    i : int
        Position of the response in storage.
    offset : int, default 0
        Position in members of the officer that storage[0] belongs to.
        storage[i] is paired with members row i + offset, so pass the
        batch's starting row when storage was collected from the middle
        of members.

    Returns
    -------
    pandas.DataFrame
        Columns officer_names_lower, year, news_hits, info. One row per
        entry of "docs"; a single row with info=None when there are no
        hits or "docs" is empty.
    """

    #extracting info
    # positional lookup, so a non-default index on members cannot shift the pairing
    member_pos = i + offset
    name = members["officer_names_lower"].iloc[member_pos] #name of officer
    year = members["year"].iloc[member_pos][:4]
    num = storage[i]["meta"]["hits"] #gets number of articles that matched
    info = storage[i]["docs"]

    #fields shared by every row produced for this officer
    record = {
        "officer_names_lower": name,
        "year": year,
        "news_hits": num,
    }

    #nothing matched: keep a single placeholder row so the officer still appears
    if num == 0 or not info:
        return pd.DataFrame([{**record, "info": None}])

    #convert to pandas; the scalar fields repeat once per article in info
    news_results = pd.DataFrame({**record, "info": info})

    return news_results

In [34]:
#start from an empty frame that already carries the result columns
result_columns = ['officer_names_lower', 'year', 'news_hits', 'info']
nyt_results = pd.DataFrame(columns=result_columns)

In [35]:
#loop to preprocess
# The query loop starts at members row `offset`, so storage[i] is the response
# for members row offset + i. Re-base members onto this batch before pairing by
# position; otherwise a batch with offset > 0 is labelled with the wrong officers.
batch_members = members.iloc[offset:].reset_index(drop=True)

# build every per-response frame first and concatenate once, instead of
# re-copying nyt_results on each iteration
batch_frames = [processing_json(storage, batch_members, i) for i in range(len(storage))]
nyt_results = pd.concat([nyt_results] + batch_frames)
    

In [36]:
#checks count: number of rows in this batch for each news_hits value
nyt_results["news_hits"].value_counts()

news_hits
0    348
1      5
Name: count, dtype: int64

In [37]:
#write this batch's results to its own csv, without the index column
batch_csv = "../data/nyt_results_p7.csv"
nyt_results.to_csv(batch_csv, index=False)

#### Concat Searches

In [38]:
#read every saved batch file back in
batch_files = [
    "../data/nyt_results.csv",
    "../data/nyt_results_p2.csv",
    "../data/nyt_results_p3.csv",
    "../data/nyt_results_p4.csv",
    "../data/nyt_results_p5.csv",
    "../data/nyt_results_p6.csv",
    "../data/nyt_results_p7.csv",
]
search1, search2, search3, search4, search5, search6, search7 = (
    pd.read_csv(batch_file) for batch_file in batch_files
)

In [39]:
#clean files: discard the article payload column, then collapse identical rows
search1, search2, search3, search4, search5, search6, search7 = (
    batch.drop(columns = ["info"]).drop_duplicates()
    for batch in (search1, search2, search3, search4, search5, search6, search7)
)

In [40]:
#stack the cleaned batches top to bottom under one fresh 0..n-1 index
search_batches = [search1, search2, search3, search4, search5, search6, search7]
full_nyt_results = pd.concat(search_batches, ignore_index=True)

In [42]:
# (rows, columns) of the combined results
full_nyt_results.shape

(4000, 3)

In [43]:
#fix errors from preprocessing (didn't account for the batching)
# processing_json looked officers up by position in storage while the query loop
# started at `offset`, so names and years are rewritten here by position:
# row k of the combined results is taken to belong to members row k.
# NOTE(review): this assumes the de-duplicated batches hold exactly one row per
# queried officer, in query order - confirm before trusting the labels.
n_results = len(full_nyt_results)

# relabelling by position is only meaningful if members has a row for every result row
assert n_results <= len(members), "more result rows than members rows; cannot relabel by position"

# .to_numpy() makes the assignment purely positional rather than index-aligned,
# so a mismatched index cannot silently introduce NaN or shift the labels
full_nyt_results["officer_names_lower"] = members["officer_names_lower"].iloc[:n_results].to_numpy()
full_nyt_results["year"] = members["year"].iloc[:n_results].str[:4].to_numpy()

In [44]:
#checks count: number of rows in the combined results for each news_hits value
full_nyt_results["news_hits"].value_counts()

news_hits
0    3936
1      51
2       9
4       3
3       1
Name: count, dtype: int64

In [46]:
#write the combined results to disk, without the index column
combined_csv = "../data/nyt_api_results.csv"
full_nyt_results.to_csv(combined_csv, index=False)