### Use NewsAPI to collect news articles about the trade war and store them in CSV files

In [1]:
import requests

import pandas as pd
import numpy as np
import pprint
import matplotlib.pyplot as plt

# Raise pandas' display column-width limit to 10000 characters so that
# long text cells are not truncated when a DataFrame is shown.
pd.set_option('max_colwidth',10000)

%matplotlib inline

In [None]:
def generate_url(page,start,end):
    """Return the NewsAPI ``/v2/everything`` URL for one page of results.

    The search conditions are fixed: the phrase "trade war", three named
    sources, relevancy ordering and 100 results per page. Only the date
    window and the page number vary.

    Parameters
    ----------
    page : value interpolated as the ``page`` query parameter.
    start : value interpolated as the ``from`` query parameter.
    end : value interpolated as the ``to`` query parameter.

    Returns
    -------
    str
        The full request URL, API key included.
    """
    # NOTE(review): the API key is hard-coded in the URL; consider loading it
    # from configuration instead of committing it to the notebook.
    # NOTE(review): the values are not percent-encoded and `sources` is sent
    # as a bracketed, quoted list -- confirm the API filters as intended.
    endpoint = 'https://newsapi.org/v2/everything?'
    query_parts = [
        'q="trade war"',
        'sources=["the-new-york-times","cnn","xinhua-net"]',
        f'from={start}',
        f'to={end}',
        'sortBy=relevancy',
        'pageSize=100',
        f'page={page}',
        'apiKey=fc8921bcf53e482ba461714a254c0d7c',
    ]
    return endpoint + '&'.join(query_parts)

In [None]:
def write_result(start,end):
    """Search for articles in a date window and save the results as a CSV.

    Requests pages 1 through 10 of the URL built by ``generate_url`` (at most
    1000 articles with its 100-per-page setting), keeps a fixed set of fields
    for every article, writes them to
    ``data/article_info(<MMDD>-<MMDD>).csv`` and returns the same table.

    Parameters
    ----------
    start : str
        First date of the window, dash-separated (e.g. ``'2018-11-04'``).
        Everything after the first dash-separated part is used in the
        output file name.
    end : str
        Last date of the window, same format as ``start``.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``title``, ``description``,
        ``content``, ``url``, ``publishedAt`` and ``source``. The columns are
        present even when no article was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    KeyError
        If a successful response has no ``'articles'`` entry.
    """
    fields = ['title','description','content','url','publishedAt']

    # Collect the raw article records page by page.
    articles = []
    for page in range(1,11):
        response = requests.get(generate_url(page,start,end))
        # Surface API failures (bad key, rate limit, ...) as an HTTP error
        # rather than as an opaque KeyError on the missing 'articles' field.
        response.raise_for_status()
        page_articles = response.json()['articles']
        if not page_articles:
            # Assumes an empty page means the results are exhausted, so the
            # remaining pages would only cost additional requests.
            break
        articles.extend(page_articles)

    # Keep only the fields of interest; absent fields become NaN.
    rows = []
    for article in articles:
        row = {field: article.get(field, np.nan) for field in fields}
        # 'source' is a nested record; tolerate it being missing or null.
        source = article.get('source') or {}
        row['source'] = source.get('name', np.nan)
        rows.append(row)

    # Build a new list for the column order: list.append() returns None, so
    # passing its result as `columns` would silently drop the specification.
    df = pd.DataFrame(rows,columns=fields + ['source'])

    # File name carries the month and day of both ends, e.g. (1104-1108).
    st = ''.join(start.split('-')[1:])
    en = ''.join(end.split('-')[1:])
    df.to_csv(f'data/article_info({st}-{en}).csv')

    return df

In [None]:
# Six consecutive 5-day windows, newest first, covering 2018-10-10 through
# 2018-11-08. Each call also writes its own CSV file.
df1 = write_result(start='2018-11-04', end='2018-11-08')
df2 = write_result(start='2018-10-30', end='2018-11-03')
df3 = write_result(start='2018-10-25', end='2018-10-29')
df4 = write_result(start='2018-10-20', end='2018-10-24')
df5 = write_result(start='2018-10-15', end='2018-10-19')
df6 = write_result(start='2018-10-10', end='2018-10-14')

In [None]:
# Stack the six per-window tables into one, renumbering the rows from 0.
df_list = [df1, df2, df3, df4, df5, df6]
df = pd.concat(objs=df_list, axis=0, ignore_index=True)

In [None]:
# Show the (rows, columns) size of the combined table.
df.shape

In [None]:
# Preview the first five rows of the combined table.
df.head(n=5)

In [None]:
# Replace missing values with empty strings, modifying df itself.
df.fillna(value='', inplace=True)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Save the combined table (row index included) as a single CSV file.
df.to_csv(path_or_buf='data/article_info_6000.csv')