### Use newsapi to collect all pieces of news about trade war and store them in jsons

In [107]:
import requests

import pandas as pd
import numpy as np
import pprint
import matplotlib.pyplot as plt

pd.set_option('max_colwidth',10000)

%matplotlib inline

In [34]:
# A function specifies search conditions and generates an API url
def generate_url(page,start,end):
    url = ('https://newsapi.org/v2/everything?'
       'q="trade war"&'
       'sources=["the-new-york-times","cnn","xinhua-net"]&'
       f'from={start}&'
       f'to={end}&'
       'sortBy=relevancy&'
       'pageSize=100&'
       f'page={page}&'
       'apiKey=fc8921bcf53e482ba461714a254c0d7c'
       )
    return url

In [37]:
# A function takes in start and end dates of a time period,
# and first searchs for articles within that period 
# based on the condition specified in the API url,
# then writes the first 1000 articles of the searching result
# into a csv file.
def write_result(start,end):
    
    # Collect information about each article returned by the search
    # Store the information in a list of jsons
    result = []
    for i in range(1,11):
        response = requests.get(generate_url(i,start,end))
        result.extend(response.json()['articles'])       

    # Transport critical information of the article to a list of dictionaries
    col = ['title','description','content','url','publishedAt']
    df_dict = []
    for article in result:
        temp = {}
        for c in col:
            try: 
                temp[c] = article[c]
            except:
                temp[c] = np.nan
        temp['source'] = article['source']['name']
        df_dict.append(temp)
    
    # Convert the list of dicts into a pandas DataFrame
    df = pd.DataFrame(df_dict,columns=col.append('source'))
    
    # Write the DataFrame into a csv file
    st = ''.join(start.split('-')[1:])
    en = ''.join(end.split('-')[1:])
    df.to_csv(f'data/article_info({st}-{en}).csv')
    
    return df

In [38]:
df1 = write_result('2018-11-04','2018-11-08')
df2 = write_result('2018-10-30','2018-11-03')
df3 = write_result('2018-10-25','2018-10-29')
df4 = write_result('2018-10-20','2018-10-24')
df5 = write_result('2018-10-15','2018-10-19')
df6 = write_result('2018-10-10','2018-10-14')

In [44]:
df_list = [df1,df2,df3,df4,df5,df6]
df=pd.concat(df_list,ignore_index=True)

In [45]:
df.shape

(6000, 6)

In [48]:
df.head()

Unnamed: 0,content,description,publishedAt,source,title,url
0,"SHANGHAI/BEIJING (Reuters) - Trade frictions with the United States and accusations of industrial espionage are set to cast a cloud over China’s largest aerospace meeting this week, as suppliers consider what the country’s slowing economy could mean for boomi… [+4558 chars]","Trade frictions with the United States and accusations of industrial espionage are set to cast a cloud over China's largest aerospace meeting this week, as suppliers consider what the country's slowing economy could mean for booming jet demand.",2018-11-04T03:58:49Z,Reuters,"Trade war, spy claims cloud horizon for China airshow",https://www.reuters.com/article/us-china-airshow-preview/trade-war-spy-claims-cloud-horizon-for-china-airshow-idUSKCN1N9026
1,"ZHUHAI, China (Reuters) - Europe’s Airbus SE indicated on Tuesday that it did not expect a sales windfall from trade tensions between China and the United States, with the manufacturer’s China head saying there would be “no winner” from a prolonged economic c… [+519 chars]","Europe's Airbus SE indicated on Tuesday that it did not expect a sales windfall from trade tensions between China and the United States, with the manufacturer's China head saying there would be ""no winner"" from a prolonged economic conflict.",2018-11-06T06:06:33Z,Reuters,"Airbus China CEO says ""no winner"" from U.S.-China trade war",https://www.reuters.com/article/us-china-airshow-airbus/airbus-china-ceo-says-no-winner-from-u-s-china-trade-war-idUSKCN1NB0GP
2,"SYDNEY (Reuters) - Asian stocks sank on Monday as fears of faster rate hikes in the United States and uncertainty around the Sino-U.S. trade war deterred investment in riskier assets, while sterling jumped to a two-week high on hopes of an orderly Brexit. Mar… [+4073 chars]","Asian stocks sank on Monday as fears of faster rate hikes in the United States and uncertainty around the Sino-U.S. trade war deterred investment in riskier assets, while sterling jumped to a two-week high on hopes of an orderly Brexit.",2018-11-05T04:51:58Z,Reuters,"Asian shares sink as risk sentiment sours, pound hits 2-week top",https://www.reuters.com/article/us-global-markets/asian-shares-sink-as-risk-sentiment-sours-pound-hits-2-week-top-idUSKCN1N90PQ
3,"(Bloomberg) -- U.S. equity futures jumped, the dollar dropped and Treasuries climbed as investors seemed to cheer on an outlook for political gridlock in the wake of the American midterm elections. Risky assets were in favor after results showed Democrats win… [+4044 chars]",The biggest macro theme remains the trade war after recent warnings from major names including the IMF’s Christine Lagarde and former U.S. Treasury Secretary Hank Paulson. Coming UpThe Fed’s next rate decision is Thursday.,2018-11-07T13:42:30Z,Yahoo.com,Stocks Climb as Investors Cheer On U.S. Gridlock: Markets Wrap,https://www.yahoo.com/news/asia-stocks-start-mixed-u-220816159.html
4,"A US trade war, Brexit, Italy — you name it, Germany's economy is suffering from it. Europe's economic powerhouse is in one of the longest boom phases of the postwar period, but it's being pounded by a cocktail of international events that ING Economics says … [+2262 chars]","Trade war fears, ongoing Brexit negotiations, and the crisis surrounding Italy’s budget are all dragging on the German economy. Traditionally strong exports have disappointed in four of last six months. New rules on emissions for automakers aren't helping. Eu…",2018-11-08T11:16:43Z,Business Insider,Germany's economy is getting hammered by the rest of the world's problems,https://www.businessinsider.com/germanys-economy-is-getting-hit-by-the-rest-of-the-worlds-problems-2018-11


In [54]:
df.fillna('',inplace=True)

In [141]:
df.isnull().sum()

content        0
description    0
publishedAt    0
source         0
title          0
url            0
dtype: int64

In [55]:
df.to_csv('data/article_info_6000.csv')

### Now, try some NLP stuff

In [56]:
import nltk
from nltk.util import ngrams

from textblob import TextBlob

from collections import Counter
from operator import itemgetter

from nltk.corpus import stopwords

In [108]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
import re
import string

#### Documents Cleanup

In [93]:
def cleanup_doc(doc):
    # Replace punctuations with a white space
    doc = re.sub('[%s]' % re.escape(string.punctuation), ' ', doc)
    # Remove all words containing digits
    doc = re.sub('\w*\d\w*', ' ', doc)
    # Keep US as a special term
    doc = re.sub('U.S.', 'US', doc)
    # Change all words into lowercases
    doc = doc.lower()
    # Remove all non English characters
    doc = ' '.join(w for w in nltk.word_tokenize(doc) if w.isalpha())
    # Remove stopwords
    stop = stopwords.words('english')
    stop = set(stop)
    doc = ' '.join(w for w in nltk.word_tokenize(doc) if w not in stop)
    
    return doc

In [94]:
def super_clean(collection):
    clean = []
    for c in collection:
        clean.append(cleanup_doc(c))
    return clean

In [95]:
clean_title = super_clean(df.title)
clean_des = super_clean(df.description)
clean_content = super_clean(df.content)

In [96]:
clean_title[:5]

['trade war spy claims cloud horizon china airshow',
 'airbus china ceo says winner us china trade war',
 'asian shares sink risk sentiment sours pound hits week top',
 'stocks climb investors cheer us gridlock markets wrap',
 'germany economy getting hammered rest world problems']

In [103]:
def find_top_bigrams(collection, num):
    counter = Counter()

    n = 2
    for c in collection:
        words = TextBlob(c).words
        bigrams = ngrams(words, n)
        counter += Counter(bigrams)

    res = []
    for phrase, count in counter.most_common(num):
        res_tuple = (" ".join(phrase), count)
        res.append(res_tuple)
    
    return res

In [104]:
find_top_bigrams(clean_title, 30)

[('trade war', 1052),
 ('us china', 298),
 ('china trade', 289),
 ('wall street', 147),
 ('trump trade', 138),
 ('oil prices', 105),
 ('global markets', 77),
 ('donald trump', 75),
 ('stock market', 75),
 ('asian shares', 69),
 ('war china', 66),
 ('amid trade', 66),
 ('wall st', 62),
 ('trump xi', 56),
 ('us trade', 55),
 ('third quarter', 52),
 ('xi jinping', 50),
 ('iran sanctions', 48),
 ('china us', 48),
 ('trade tensions', 47),
 ('us stocks', 46),
 ('hong kong', 45),
 ('growth slows', 44),
 ('midterm elections', 42),
 ('trade deal', 42),
 ('china says', 42),
 ('global stocks', 42),
 ('trump says', 40),
 ('china xi', 39),
 ('gdp growth', 38)]

In [105]:
find_top_bigrams(clean_des, 30)

[('trade war', 1246),
 ('united states', 468),
 ('donald trump', 466),
 ('us china', 399),
 ('president donald', 360),
 ('china trade', 328),
 ('wall street', 289),
 ('oct reuters', 185),
 ('xi jinping', 178),
 ('third quarter', 169),
 ('us president', 147),
 ('us trade', 133),
 ('oil prices', 133),
 ('president xi', 125),
 ('economic growth', 122),
 ('war united', 118),
 ('war china', 117),
 ('stock market', 113),
 ('trump administration', 108),
 ('new york', 107),
 ('hong kong', 106),
 ('stock markets', 103),
 ('interest rates', 102),
 ('trade tensions', 100),
 ('president trump', 97),
 ('per cent', 97),
 ('war us', 96),
 ('saudi arabia', 91),
 ('trump trade', 90),
 ('states china', 89)]

In [106]:
find_top_bigrams(clean_content, 30)

[('trade war', 1289),
 ('donald trump', 614),
 ('united states', 573),
 ('president donald', 527),
 ('us china', 399),
 ('wall street', 346),
 ('china trade', 311),
 ('new york', 255),
 ('oct reuters', 252),
 ('xi jinping', 224),
 ('us president', 223),
 ('third quarter', 208),
 ('oil prices', 185),
 ('president xi', 184),
 ('hong kong', 178),
 ('us trade', 170),
 ('economic growth', 157),
 ('stock market', 148),
 ('amp p', 146),
 ('war united', 145),
 ('stock markets', 144),
 ('war china', 138),
 ('reuters china', 137),
 ('interest rates', 133),
 ('trump administration', 131),
 ('trade tensions', 129),
 ('midterm elections', 129),
 ('per cent', 114),
 ('saudi arabia', 113),
 ('reuters us', 112)]

In [115]:
def fit_stuff(doc,cv,ngram,max_df,min_df):
    cv.ngram_range=(1,ngram)
    cv.max_df = max_df
    cv.min_df = min_df
    x = cv.fit_transform(doc).toarray()
    x = pd.DataFrame(X, columns=cv.get_feature_names())
    return x

In [118]:
X_content = fit_stuff(clean_content,TfidfVectorizer(),2,0.9,2)
X_des = fit_stuff(clean_des,TfidfVectorizer(),2,0.9,2)
X_title = fit_stuff(clean_title,TfidfVectorizer(),2,0.9,2)
print([X_content.shape,X_des.shape,X_title.shape])

[(6000, 27451), (6000, 20466), (6000, 9171)]


In [123]:
X_content = fit_stuff(clean_content,CountVectorizer(),2,0.9,2)
X_des = fit_stuff(clean_des,CountVectorizer(),2,0.9,2)
X_title = fit_stuff(clean_title,CountVectorizer(),2,0.9,2)
print([X_content.shape,X_des.shape,X_title.shape])

[(6000, 27451), (6000, 20466), (6000, 9171)]


In [146]:
X_des.fillna(0,inplace=True)
X_title.fillna(0,inplace=True)

In [147]:
sum(X_des.isnull().sum() > 0)

0

In [148]:
sum(X_title.isnull().sum() > 0)

0

In [124]:
from sklearn.model_selection import train_test_split

In [149]:
X_train_content, X_test_content = train_test_split(X_content, test_size=0.3, random_state=42)

In [150]:
X_train_des, X_test_des = train_test_split(X_des, test_size=0.3, random_state=42)

In [151]:
X_train_title, X_test_title = train_test_split(X_title, test_size=0.3, random_state=42)

In [152]:
print(X_train_content.shape,X_test_content.shape)

(4200, 27451) (1800, 27451)


### LDA Dimension Reduction

In [154]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [157]:
n_components = 10
lda = LatentDirichletAllocation(n_components=n_components, 
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(X_train_des)

vocab = X_train_des.columns

In [160]:
lda.transform(X_train_des[:5])

array([[ 0.85231244,  0.01640959,  0.01640959,  0.01641082,  0.01640959,
         0.01640961,  0.01640959,  0.01640959,  0.01640959,  0.0164096 ],
       [ 0.81729774,  0.02029853,  0.02029769,  0.02030056,  0.02029769,
         0.020317  ,  0.02029769,  0.02029769,  0.02029769,  0.0202977 ],
       [ 0.01341605,  0.87929619,  0.01340868,  0.01342602,  0.01340868,
         0.01340879,  0.01340868,  0.01340868,  0.01340868,  0.01340957],
       [ 0.49726903,  0.01386723,  0.01386656,  0.01386809,  0.39179618,
         0.01386656,  0.01386656,  0.01386664,  0.01386656,  0.01386656],
       [ 0.63075872,  0.01213444,  0.01213444,  0.01213661,  0.01213447,
         0.01213444,  0.01213444,  0.27216342,  0.01213444,  0.0121346 ]])

In [159]:
for topic in range(n_components):
    print(f"TOPIC {topic}")
    for j in np.argsort(-lda.components_,1)[topic,:10]:
        print(vocab[j])
    print()

TOPIC 0
us
china
trade
war
trade war
global
reuters
markets
stocks
market

TOPIC 1
spies
uses
york times
friends
chinese spies
calls
times
spies often
often
eavesdropping

TOPIC 2
business environment
state chinese
gloom state
amid gathering
gathering gloom
peddling wares
wares china
companies peddling
christmas tree
economy exporters

TOPIC 3
china
president
trade
trump
us
chinese
war
xi
states
united

TOPIC 4
stomach
stomach churning
churning
adored
suddenly soured
smartphones wall
street suddenly
seemingly adored
people love
valley rest

TOPIC 5
box
robot
let
click
know
please
continue
us
durable goods
durable

TOPIC 6
rapeseed
meal
rapeseed meal
di
resume
jakarta
jokowi
ekonomi
dan
boeing

TOPIC 7
fink
fledged trade
fledged
full fledged
offset geopolitical
session helped
brazil
helped offset
war though
inc holiday

TOPIC 8
economy
market
earnings blue
impact
nearly
bear
stock prices
stock
many
little

TOPIC 9
state
pompeo
propaganda
mike pompeo
investment banking
made
leland
part
s

### Sentiment Analysis

### Recommedation System for Readers

### Visualization

In [None]:
plt.bar(phrase_count.keys(),phrase_count.values())