In [1]:
import json
import os
import time

import pandas as pd
import requests

**Part I. Data Collection**



1.1 Collection of New York Times articles with the word "anxiety" or "depression" in the headline

In [2]:
def collect_articles_NYT(word, output_file_name, api_key=None,
                         start_year=2004, end_year=2024, pause_seconds=14):
    '''
    Interact with The New York Times Article Search API to collect articles'
    data. Restrict the search to those articles containing the given word in
    the headline. The search is run one calendar year at a time, paging
    through the results of each year. Lastly, generate a json file to store
    the data so it can be easily retrieved later without needing to make
    requests every time.

    Inputs:
        word (str): word of interest for the query
        output_file_name (str): desired name for the json file to be created
        api_key (str, optional): key for the NYT API. When omitted, the key
            is read from the NYT_API_KEY environment variable so that it
            never has to be written in the notebook.
        start_year (int): first year to query, inclusive (default 2004)
        end_year (int): last year to query, inclusive (default 2024)
        pause_seconds (int or float): time to wait after every request
            (default 14)

    Outputs:
        json file mapping each year to the list of article records ("docs")
        collected for that year. The function itself returns None.

    Raises:
        ValueError: if no API key is passed and NYT_API_KEY is not set
        requests.HTTPError: if the API answers with an HTTP error status
            (for example an invalid key or a rate-limit rejection)
    '''

    #Basic details needed to interact with The New York Times API. The key is
    #taken from the caller or the environment instead of being hardcoded.
    if api_key is None:
        api_key = os.environ.get("NYT_API_KEY")
    if not api_key:
        raise ValueError("No NYT API key available: pass api_key or set the "
                         "NYT_API_KEY environment variable")

    base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

    #Since the API only retrieves 10 results per query, we have to loop
    #through different pages. Pages 0-99 are requested, as in the original
    #collection run.
    max_pages = 100

    articles = {}

    for year in range(start_year, end_year + 1):

        year_data = []

        #Everything except the page number is the same for a whole year, so
        #the parameters are built once per year rather than once per page.
        nyt_params = {"q": word, "fq": f'headline:{word}',
                      "begin_date": "{}0101".format(year),
                      "end_date": "{}1231".format(year),
                      "api-key": api_key}

        for page in range(max_pages):

            nyt_params["page"] = page

            #The timeout keeps one stalled connection from hanging a run that
            #can last for hours.
            response = requests.get(base_url, params=nyt_params, timeout=30)

            #Fail with a clear HTTP error instead of an opaque KeyError when
            #the API rejects the request.
            response.raise_for_status()
            docs = response.json()["response"]["docs"]

            #Pause after every request, including the last one of a year,
            #because the next year's first request follows immediately
            #(presumably needed to respect the API rate limit).
            time.sleep(pause_seconds)

            #After collecting all available data, subsequent pages retrieve
            #empty results. For that reason, we can break the loop when pages
            #are no longer useful. It also avoids making unnecessary and
            #time-consuming requests.
            if not docs:
                break

            year_data.extend(docs)

        articles[year] = year_data

    #Export the dictionary to a JSON file to save it locally to avoid needing
    #to make requests every time we work on the project. The integer year
    #keys are written as strings by json.dump.
    with open(output_file_name, 'w', encoding='utf-8') as json_file:
        json.dump(articles, json_file, indent=4)

In [3]:
#Call function to retrieve NYT articles related to anxiety and depression

#collect_articles_NYT("anxiety", "anxiety_articles_NYT.json")
#collect_articles_NYT("depression", "depression_articles_NYT.json")

1.2 Collection of The Guardian articles with the word "anxiety" or "depression" in the headline

**Part II. Data Cleaning**