In [4]:
import spacy
import pandas as pd

import json
import spacy
import requests
import random
import time

import spacy
import tqdm
from collections import Counter

import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

import pyLDAvis
import pyLDAvis.gensim

# Research Topic

In the aftermath of Barack Obama's inauguration on January 20, 2009, the initial years of his presidential term were characterized by a prevailing positive sentiment in U.S.-China relations. During this period, media portrayals of China generally leaned toward the favorable, reflecting a climate of cooperation and mutual understanding between the two nations.

However, a palpable shift in diplomatic dynamics occurred with the transition to the Trump administration. The advent of the Trade War and the multifaceted challenges posed by the Covid-19 pandemic introduced a discernible transformation in how China was depicted in the media. This period marked an evolution in U.S.-China relations, portraying a landscape that was not only complex but at times contentious under President Trump's leadership.

Following the conclusion of Trump's presidency, Joe Biden assumed office with a diplomatic approach that aimed to soften relations with China. Despite these efforts, the present geopolitical landscape positions China and the U.S. on opposing sides across various critical issues. The intricate interplay of economic, political, and global challenges has contributed to a complex and multifaceted relationship between the two nations.

Against this backdrop, our research project seeks to undertake a comprehensive exploration of the nuanced evolution of media representations of China-U.S. relations. Our investigation spans from the early years of the Obama administration in 2009 to the present day. 

The choice of The New York Times as a primary source is deliberate, considering its rich history dating back to 1851 and its consistent adherence to high editorial standards, as evidenced by the numerous Pulitzer Prizes it has earned for reporting excellence. Moreover, its reputation as a reliable and influential source positions it as a noteworthy contributor to the discourse on global affairs. The newspaper's comprehensive coverage and global reach provide a broad and diverse perspective, enriching our understanding of the complex dynamics between China and the U.S. The New York Times' adaptability to the digital era, coupled with its significant online presence, ensures that its content is accessible to a vast and diverse global audience. This digital accessibility enhances our ability to capture a wide range of viewpoints and narratives that have shaped perceptions of the China-U.S. relationship over time. Furthermore, by recognizing The New York Times' center-left editorial stance, we affirm our commitment to acknowledge potential bias, while appreciating the newspaper's unique insights. 

In our research, our goal is to provide valuable insights into the reciprocal relationship between media portrayals, particularly from influential sources such as The New York Times, and the dynamic nature of U.S.-China relations. We have chosen to look for articles containing the keyword: "U.S.-China relations" due to various reasons. First, the NYT refers to the United States of America as "the U.S." in most of its articles, therefore, using this wording provides us with more opportunity to come across relevant articles. Second, as the NYT is an american journal, when refering to the relationship between the U.S. and China, the U.S. is always mentioned first to appeal to the american public. As such, using the keyword "U.S.-China relations" will allow us to gather all articles useful to our research. We downloaded 500 articles from each year, and created a data of 3000 articles to asses the evolution of the depiction of U.S.-China relations on American media.

Our aim is to capture the diverse narratives and perspectives that have molded our understanding of this pivotal geopolitical relationship over time. Through this exploration, we strive to contribute insights into how media portrayals, especially from influential sources like The New York Times, have not only mirrored but also influenced the evolving nature of China-U.S. relations. This endeavor offers a valuable lens through which to comprehend the intricate complexities of this vital global partnership. Our approach remains thoughtful and inclusive as we navigate the complexities of this global partnership, recognizing the multifaceted nature of media influence. Through this exploration, we aim to provide a thorough and valuable perspective on the evolving narratives surrounding China-U.S. relations, adding depth and context to our research endeavor.

In [5]:
my_api_key="q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH"

# 2009-2010

In [7]:
def number_of_articles(api_key, keyword, begin_date="20090120", end_date="20100120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relation'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relation&begin_date=20090120&end_date=20100120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
19689


In [8]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20090120", end_date="20100120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20090120&end_date=20100120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20090120&end_date=20100120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20090120&end_date=20100120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20090120&end_date=20100120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20090120&end_date=20100120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20090120&end_date=20100120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [12]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_1 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_1

corpus_1  = consolidate_corpus(articles)

corpus_1 ['date'] = pd.to_datetime(corpus_1['date'])

corpus_1 = corpus_1.sort_values(by='date')

corpus_1 .to_csv('corpus_1.csv', index=False)

# 2010 - 2011

In [14]:
def number_of_articles(api_key, keyword, begin_date="20100120", end_date="20110120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relation'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relation&begin_date=20100120&end_date=20110120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
12852


In [20]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20100120", end_date="20110120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20100120&end_date=20110120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20100120&end_date=20110120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20100120&end_date=20110120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20100120&end_date=20110120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20100120&end_date=20110120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20100120&end_date=20110120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [22]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_2 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_2

corpus_2 = consolidate_corpus(articles)

corpus_2 ['date'] = pd.to_datetime(corpus_2['date'])

corpus_2  = corpus_2.sort_values(by='date')

corpus_2.to_csv('corpus_2.csv', index=False)

# 2011 - 2012

In [6]:
def number_of_articles(api_key, keyword, begin_date="20110120", end_date="20120120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relation'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relation&begin_date=20110120&end_date=20120120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
11652


In [7]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20110120", end_date="20120120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20110120&end_date=20120120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20110120&end_date=20120120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20110120&end_date=20120120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20110120&end_date=20120120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20110120&end_date=20120120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20110120&end_date=20120120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [8]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_3 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_3

corpus_3  = consolidate_corpus(articles)

corpus_3['date'] = pd.to_datetime(corpus_3['date'])

corpus_3 = corpus_3.sort_values(by='date')

corpus_3.to_csv('corpus_3.csv', index=False)

# 2012 - 2013

In [9]:
def number_of_articles(api_key, keyword, begin_date="20120120", end_date="20130120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relations'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relations&begin_date=20120120&end_date=20130120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
14054


In [10]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20120120", end_date="20130120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20120120&end_date=20130120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20120120&end_date=20130120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20120120&end_date=20130120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20120120&end_date=20130120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20120120&end_date=20130120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20120120&end_date=20130120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [11]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_4 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_4

corpus_4 = consolidate_corpus(articles)

corpus_4 ['date'] = pd.to_datetime(corpus_4['date'])

corpus_4  = corpus_4.sort_values(by='date')

corpus_4.to_csv('corpus_4.csv', index=False)

# 2013 - 2014

In [12]:
def number_of_articles(api_key, keyword, begin_date="20130120", end_date="20140120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relations'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relations&begin_date=20130120&end_date=20140120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
12646


In [13]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20130120", end_date="20140120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20130120&end_date=20140120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20130120&end_date=20140120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20130120&end_date=20140120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20130120&end_date=20140120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20130120&end_date=20140120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20130120&end_date=20140120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [14]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_5 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_5

corpus_5 = consolidate_corpus(articles)

corpus_5 ['date'] = pd.to_datetime(corpus_5['date'])

corpus_5  = corpus_5.sort_values(by='date')

corpus_5.to_csv('corpus_5.csv', index=False)

# 2014 - 2015

In [15]:
def number_of_articles(api_key, keyword, begin_date="20140120", end_date="20150120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relations'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relations&begin_date=20140120&end_date=20150120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
14604


In [17]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20140120", end_date="20150120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20140120&end_date=20150120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20140120&end_date=20150120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20140120&end_date=20150120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20140120&end_date=20150120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20140120&end_date=20150120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20140120&end_date=20150120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [18]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_6 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_6

corpus_6 = consolidate_corpus(articles)

corpus_6 ['date'] = pd.to_datetime(corpus_6['date'])

corpus_6  = corpus_6.sort_values(by='date')

corpus_6.to_csv('corpus_6.csv', index=False)

# 2015 - 2016

In [19]:
def number_of_articles(api_key, keyword, begin_date="20150120", end_date="20160120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relations'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relations&begin_date=20150120&end_date=20160120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
13704


In [20]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20150120", end_date="20160120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20150120&end_date=20160120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20150120&end_date=20160120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20150120&end_date=20160120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20150120&end_date=20160120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20150120&end_date=20160120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20150120&end_date=20160120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [21]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_7 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_7

corpus_7 = consolidate_corpus(articles)

corpus_7 ['date'] = pd.to_datetime(corpus_7['date'])

corpus_7  = corpus_7.sort_values(by='date')

corpus_7.to_csv('corpus_7.csv', index=False)

# 2016 - 2017

In [32]:
def number_of_articles(api_key, keyword, begin_date="20160120", end_date="20170120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relations'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relations&begin_date=20160120&end_date=20170120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
10538


In [33]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20160120", end_date="20170120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20160120&end_date=20170120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20160120&end_date=20170120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20160120&end_date=20170120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20160120&end_date=20170120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20160120&end_date=20170120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20160120&end_date=20170120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [34]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_8 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_8

corpus_8 = consolidate_corpus(articles)

corpus_8 ['date'] = pd.to_datetime(corpus_8['date'])

corpus_8  = corpus_8.sort_values(by='date')

corpus_8.to_csv('corpus_8.csv', index=False)

# 2017 - 2018

In [35]:
def number_of_articles(api_key, keyword, begin_date="20170120", end_date="20180120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relations'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relations&begin_date=20170120&end_date=20180120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
11121


In [36]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20170120", end_date="20180120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20170120&end_date=20180120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20170120&end_date=20180120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20170120&end_date=20180120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20170120&end_date=20180120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20170120&end_date=20180120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20170120&end_date=20180120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [37]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_9 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_9

corpus_9 = consolidate_corpus(articles)

corpus_9 ['date'] = pd.to_datetime(corpus_9['date'])

corpus_9  = corpus_9.sort_values(by='date')

corpus_9.to_csv('corpus_9.csv', index=False)

# 2018 - 2019

In [38]:
def number_of_articles(api_key, keyword, begin_date="20180120", end_date="20190120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relations'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relations&begin_date=20180120&end_date=20190120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
12502


In [41]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20180120", end_date="20190120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20180120&end_date=20190120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20180120&end_date=20190120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20180120&end_date=20190120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20180120&end_date=20190120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20180120&end_date=20190120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20180120&end_date=20190120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [42]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_10 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_10

corpus_10 = consolidate_corpus(articles)

corpus_10 ['date'] = pd.to_datetime(corpus_10['date'])

corpus_10  = corpus_10.sort_values(by='date')

corpus_10.to_csv('corpus_10.csv', index=False)

# 2019 - 2020

In [43]:
def number_of_articles(api_key, keyword, begin_date="20190120", end_date="20200120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relations'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relations&begin_date=20190120&end_date=20200120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
11802


In [44]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20190120", end_date="20200120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20190120&end_date=20200120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20190120&end_date=20200120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20190120&end_date=20200120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20190120&end_date=20200120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20190120&end_date=20200120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20190120&end_date=20200120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [45]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_11 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_11

corpus_11 = consolidate_corpus(articles)

corpus_11 ['date'] = pd.to_datetime(corpus_11['date'])

corpus_11  = corpus_11.sort_values(by='date')

corpus_11.to_csv('corpus_11.csv', index=False)

# 2020 -2021

In [46]:
def number_of_articles(api_key, keyword, begin_date="20200120", end_date="20210120"):
    url_query = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?fq={keyword}&begin_date={begin_date}&end_date={end_date}&api-key={api_key}"
    print(url_query)
    res = requests.get(url_query)
    json_res = res.json()
    hits = json_res["response"]["meta"]["hits"]
    return hits

print(number_of_articles(api_key=my_api_key, keyword='U.S.-China relations'))

https://api.nytimes.com/svc/search/v2/articlesearch.json?fq=U.S.-China relations&begin_date=20200120&end_date=20210120&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
16212


In [47]:
def search_nyt_articles(my_api_key, keyword, max_pages, begin_date="20200120", end_date="20210120"):
    docs = []

    for i in range(max_pages):
        api = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={keyword}&begin_date={begin_date}&end_date={end_date}&page={i+1}&api-key={my_api_key}"
        print(api)
        response = requests.get(api)
        data = response.json()
        try:
            docs.extend(data['response']['docs'])
        except KeyError:
            return docs
        time.sleep(12)

    return docs

articles = search_nyt_articles(my_api_key, keyword='U.S.-China relations', max_pages=50)

https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20200120&end_date=20210120&page=1&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20200120&end_date=20210120&page=2&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20200120&end_date=20210120&page=3&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20200120&end_date=20210120&page=4&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20200120&end_date=20210120&page=5&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=U.S.-China relations&begin_date=20200120&end_date=20210120&page=6&api-key=q9G4oWuPHcFvBfj0rkOMyGfqgL0AOEyH
http

In [48]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        corpus_12 = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return corpus_12

corpus_12 = consolidate_corpus(articles)

corpus_12 ['date'] = pd.to_datetime(corpus_12['date'])

corpus_12  = corpus_12.sort_values(by='date')

corpus_12.to_csv('corpus_12.csv', index=False)

# Create final dataframe df

In [49]:
df1 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_1.csv")

In [50]:
df2 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_2.csv")

In [51]:
df3 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_3.csv")

In [52]:
df4 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_4.csv")

In [53]:
df5 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_5.csv")

In [54]:
df6 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_6.csv")

In [55]:
df7 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_7.csv")

In [56]:
df8 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_8.csv")

In [57]:
df9 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_9.csv")

In [58]:
df10 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_10.csv")

In [59]:
df11 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_11.csv")

In [60]:
df12 = pd.read_csv("/Users/defneulusoy/Desktop/Python Code/final_project/corpus_12.csv")

In [61]:
df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12], ignore_index=True)

In [62]:
len(df)

4790

In [63]:
df

Unnamed: 0,headline,lead_paragraph,date
0,"China hardens rhetoric on Tibet, Taiwan and U.S.",BEIJING — The Chinese government announced Tue...,2009-01-20 05:00:00+00:00
1,China sees threats from separatists and U.S. a...,BEIJING — The Chinese government announced Tue...,2009-01-20 05:00:00+00:00
2,China Sees Separatist Threats,BEIJING  China said Tuesday that it faces thr...,2009-01-20 11:33:57+00:00
3,Europe's divisions show through as it welcomes...,BRUSSELS — Since well before the inauguration ...,2009-01-21 05:00:00+00:00
4,Obama clings to his BlackBerry,WASHINGTON — President Barack Obama has overru...,2009-01-22 05:00:00+00:00
...,...,...,...
4785,The U.S. calls China’s repression of the Uighu...,The State Department declared on Tuesday that ...,2021-01-19 17:35:49+00:00
4786,Trump Bequeaths Biden an Upended World,PARIS — Most countries lost patience long ago....,2021-01-19 17:36:58+00:00
4787,"In Confirmation Hearings, Biden Aides Indicate...",WASHINGTON — President-elect Joseph R. Biden J...,2021-01-20 00:30:36+00:00
4788,"China’s Oppression of Muslims in Xinjiang, Exp...","On the final full day of the Trump presidency,...",2021-01-20 09:52:02+00:00


In [64]:
def consolidate_corpus(articles):
    docs = []
    for article in articles:
        lead_paragraph = article.get('lead_paragraph')
        docs.append([article['headline']['main'], lead_paragraph, article['pub_date']])
        df = pd.DataFrame(docs, columns=['headline', 'lead_paragraph', 'date'])
    return df

df = consolidate_corpus(articles)

df ['date'] = pd.to_datetime(df['date'])

df  = df.sort_values(by='date')

df.to_csv('df.csv', index=False)