# Part One: Data Harvesting
## White House Website News
### https://www.whitehouse.gov/news/

In [None]:
import requests 
import bs4 as bs 

from time import sleep
from time import time
from random import randint

import pandas as pd

In [None]:
# Harvest the White House website's news listing: one date, title and URL per article.
titles = []
dates = []
urls = []
# When we were doing this project, there were 487 pages of news.
# range() excludes its stop value, so stop at 488 to fetch page 487 as well.
pages = [str(i) for i in range(1, 488)]
headers = {"Accept-Language": "en-US, en;q=0.5"}
for page in pages:
    source = requests.get('https://www.whitehouse.gov/news/page/' + page, headers=headers)
    sleep(randint(1, 2))  # wait 1-2 seconds between requests
    page_html = bs.BeautifulSoup(source.content, 'html.parser')
    news_containers = page_html.find_all('article')
    for container in news_containers:
        # Each <h2> carries the headline text and, inside its <a>, the article link.
        for heading in container.find_all('h2'):
            titles.append(heading.text.strip())
            for anchor in heading.find_all('a'):
                urls.append(anchor.get("href"))
        for date in container.find_all('p', class_="meta__date"):
            dates.append(date.text.strip())

In [None]:
# Harvest all full-texts from the urls acquired in the first step.
# Fragments that are removed from the extracted article text, in this order.
unwanted_fragments = (
    'Share:',
    'share-this-page-on-facebook',
    'share-this-page-on-twitter',
    'copy-url-to-your-clipboard',
    'All News',
)
texts = []
for url in urls:
    source = requests.get(url)
    sleep(randint(1, 2))  # wait 1-2 seconds between requests
    page_html = bs.BeautifulSoup(source.content, 'html.parser')
    news_containers = page_html.find('div', class_="page-content__content editor")
    if news_containers is None:
        # No article container on this page: store a placeholder instead of
        # crashing, so that texts keeps one entry per url.
        texts.append('no text')
        continue
    clean = news_containers.text
    for fragment in unwanted_fragments:
        clean = clean.replace(fragment, '')
    clean = clean.strip()
    # Drop line breaks and non-breaking spaces (replace is a no-op when absent).
    clean = clean.replace("\n", '').replace('\xa0', '')
    texts.append(clean)

In [None]:
# Bundle the harvested lists by column name, then build the dataframe from that mapping.
pd_us_gov_total = dict(dates=dates, titles=titles, urls=urls, texts=texts)
# orient='index' yields one row per list; .T turns those rows into columns.
pd_us_gov_news_total = pd.DataFrame.from_dict(pd_us_gov_total, orient='index').T

## State Council of China Website News
### http://english.gov.cn/news/topnews/

In [None]:
import requests 
import bs4 as bs 

from time import sleep
from time import time
from random import randint

import pandas as pd

In [None]:
# Harvest the State Council of China website's top news with their dates, titles and urls.
from urllib.parse import urljoin

titles = []
dates = []
urls = []
texts = []

headers = {"Accept-Language": "en-US, en;q=0.5"}
listing_url = "http://english.gov.cn/news/topnews/"

source = requests.get(listing_url, headers=headers)
page_html = bs.BeautifulSoup(source.content, 'html.parser')

# News items sit in <div> elements styled either display:block or display:none;
# both groups are harvested, the display:block ones first.
news_containers = page_html.find_all('div', style='display:block')
news_containers2 = page_html.find_all('div', style='display:none')
for container in list(news_containers) + list(news_containers2):
    for anchor in container.find_all('a'):
        titles.append(anchor.text.strip())
        href = anchor.get('href')
        # urljoin resolves site-relative links like plain concatenation did, but
        # leaves absolute links intact; an anchor without href yields None so
        # that urls stays aligned with titles.
        urls.append(urljoin(listing_url, href) if href else None)
    for date in container.find_all('span'):
        dates.append(date.text.strip())

In [None]:
# Harvest all full-texts from the urls acquired in the first step.
# The results go into `texts`, the list the dataframe below is built from
# (the previous `new_urls` / `new_texts` names are not defined in this notebook).
for url in urls:
    try:
        source = requests.get(url)
        page_html = bs.BeautifulSoup(source.content, 'html.parser')
        news_containers = page_html.find('content')
        texts.append(news_containers.text)
    except (requests.exceptions.RequestException, AttributeError):
        # Invalid/unreachable url, or a page without a <content> element
        # (find() returned None): keep a placeholder so texts stays aligned with urls.
        texts.append('no text')

In [None]:
# Bundle the harvested lists by column name, then build the dataframe from that mapping.
cn_gov_news = dict(dates=dates, titles=titles, urls=urls, texts=texts)
# orient='index' yields one row per list; .T turns those rows into columns.
pd_cn_gov_news = pd.DataFrame.from_dict(cn_gov_news, orient='index').T

In [None]:
# Write the State Council news dataframe to a CSV file in the working directory.
pd_cn_gov_news.to_csv(path_or_buf='cn_gwy_total.csv')

# Part Two: Data Cleaning
### White House
### Titles

In [None]:
import pandas as pd

In [None]:
# Load the harvested White House news and keep the titles in their own one-column dataframe.
df1 = pd.read_csv('white_house_news.csv')
df1_titles = df1['titles'].to_frame(name='titles')
In [3]:
# Normalise the titles: fill missing values, drop newlines/tabs, turn hyphens into
# spaces, lower-case and collapse whitespace, then strip punctuation.
# regex= is passed explicitly because the default of Series.str.replace differs
# between pandas versions; without regex=True the punctuation pattern would be
# treated as a literal string and match nothing.
cleaned_titles = (
    df1_titles['titles']
    .fillna('')
    .str.replace('\n', '', regex=False)
    .str.replace('\t', '', regex=False)
    .str.replace('-', ' ', regex=False)
    .apply(lambda title: " ".join(word.lower() for word in title.split()))
    .str.replace(r'[^\w\s]', '', regex=True)
)
df1_titles['titles'] = cleaned_titles
In [4]:
import numpy as np
# Remove every run of digits (explicit regex=True: the pattern is a regular expression).
df1_titles['titles'] = df1_titles['titles'].str.replace(r'\d+', '', regex=True)
# Mark titles that are exactly one space as missing. The result is assigned back
# instead of using inplace=True on the selected column, which may act on a
# temporary copy and leave the dataframe unchanged.
df1_titles['titles'] = df1_titles['titles'].replace(' ', np.nan)
In [5]:
from textblob import Word
# Lemmatize every word of every title twice: first as a verb ('v'), then as a noun ('n').
for pos_tag in ('v', 'n'):
    df1_titles['titles'] = df1_titles['titles'].apply(
        lambda title: " ".join(Word(token).lemmatize(pos_tag) for token in title.split())
    )
In [6]:
import nltk
from nltk.corpus import stopwords
# NLTK's English stop-word list; words found in it are removed from each title.
stop = stopwords.words('english')
df1_titles['titles'] = df1_titles['titles'].apply(
    lambda title: " ".join([token for token in title.split() if token not in stop])
)
In [7]:
# Keep only runs of two or more word characters, i.e. drop one-character tokens.
# Raw string: '\w' is not a valid escape sequence in a normal string literal.
df1_titles['titles'] = df1_titles['titles'].str.findall(r'\w{2,}').str.join(' ')
In [8]:
df1_titles.head(n=5)  # preview the first rows of the cleaned titles

### Full Texts

In [None]:
# Keep the article full texts in their own one-column dataframe.
df1_texts = df1['texts'].to_frame(name='texts')
In [10]:
# Normalise the full texts: fill missing values, turn newlines, tabs and
# punctuation into spaces, then collapse runs of spaces into one.
# regex= is passed explicitly because the default of Series.str.replace differs
# between pandas versions; without regex=True the two patterns would be treated
# as literal strings and match nothing.
cleaned_texts = (
    df1_texts['texts']
    .fillna("")
    .str.replace('\n', ' ', regex=False)
    .str.replace('\t', ' ', regex=False)
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'[ ]{2,}', ' ', regex=True)
)
df1_texts['texts'] = cleaned_texts
In [11]:
# Lower-case every word and re-join the words with single spaces.
df1_texts['texts'] = df1_texts['texts'].apply(
    lambda text: " ".join([token.lower() for token in text.split()])
)
In [12]:
import numpy as np
# Remove every run of digits (explicit regex=True: the pattern is a regular expression).
df1_texts['texts'] = df1_texts['texts'].str.replace(r'\d+', '', regex=True)
# Mark texts that are exactly one space as missing. The result is assigned back
# instead of using inplace=True on the selected column, which may act on a
# temporary copy and leave the dataframe unchanged.
df1_texts['texts'] = df1_texts['texts'].replace(' ', np.nan)
In [13]:
import nltk
from nltk.corpus import stopwords
# NLTK's English stop-word list; words found in it are removed from each text.
stop = stopwords.words('english')
df1_texts['texts'] = df1_texts['texts'].apply(
    lambda text: " ".join([token for token in text.split() if token not in stop])
)
In [14]:
from textblob import Word
# Lemmatize every word of every text twice: first as a verb ('v'), then as a noun ('n').
for pos_tag in ('v', 'n'):
    df1_texts['texts'] = df1_texts['texts'].apply(
        lambda text: " ".join(Word(token).lemmatize(pos_tag) for token in text.split())
    )
In [15]:
# Keep only runs of two or more word characters, i.e. drop one-character tokens.
# Raw string: '\w' is not a valid escape sequence in a normal string literal.
df1_texts['texts'] = df1_texts['texts'].str.findall(r'\w{2,}').str.join(' ')
In [16]:
df1_texts.head(n=5)  # preview the first rows of the cleaned texts

In [None]:
### Dates

In [None]:
# Pull the raw date column out of the White House dataframe.
df1_dates = df1['dates']
In [18]:
# Parse the raw date values into pandas datetimes.
date_series1 = pd.to_datetime(arg=df1_dates)
In [19]:
# Keep only the calendar-date part of each parsed datetime.
date_series1 = date_series1.dt.date
In [20]:
date_series1.head(n=5)  # preview the first cleaned dates

In [None]:
# Convert the cleaned date, title and text columns to plain lists and bundle them by column name.
e, f, g = (
    column.tolist()
    for column in (date_series2, df2_titles['titles'], df2_texts['texts'])
)
pd_cn_news_total = dict(dates=e, titles=f, texts=g)
In [46]:
# One row per list via orient='index', then .T so each list becomes a column.
CHN = pd.DataFrame.from_dict(pd_cn_news_total, orient='index').T
In [47]:
# Write the cleaned dataframe to a UTF-8 encoded CSV file.
CHN.to_csv(path_or_buf='CHN_Cleaned.csv', encoding='utf-8')
In [48]:
CHN.head(n=5)  # preview the first rows of the cleaned dataframe

# Data Exploration

In [None]:
import numpy as np
import pandas as pd
from IPython.display import display
from tqdm import tqdm
from collections import Counter
import ast

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sb

from collections import Counter

%matplotlib inline
In [2]:
# Load the cleaned data, parsing the column at position 1 as dates.
# The deprecated infer_datetime_format argument is no longer passed.
datafile1 = 'USA_Cleaned.csv'
raw_data1 = pd.read_csv(datafile1, parse_dates=[1])
In [3]:
# Columns that are not needed for the title exploration; removed from raw_data1 in place.
to_drop1 = ['Unnamed: 0', 'texts']
raw_data1.drop(labels=to_drop1, axis=1, inplace=True)
In [4]:
# Title series indexed by the 'dates' column, with missing titles as empty strings.
reindexed_data1 = raw_data1.set_index('dates')['titles'].fillna('')
In [5]:
def get_top_n_words(n_top_words, count_vectorizer, text_data):
    """Return the most frequent words in ``text_data`` and their total counts.

    Parameters:
        n_top_words: number of words to return; if the fitted vocabulary is
            smaller, every word of the vocabulary is returned.
        count_vectorizer: object with a ``fit_transform`` method returning a
            documents-by-words count matrix and, once fitted, a ``vocabulary_``
            mapping from word to column index (e.g. scikit-learn's
            CountVectorizer). It is (re)fitted on ``text_data`` by this call.
        text_data: sequence of documents (e.g. a pandas Series of strings).

    Returns:
        A tuple ``(words, counts)`` of two equally long lists, ordered from the
        most to the least frequent word.
    """
    # np.asarray replaces Series.as_matrix(), which newer pandas no longer provides.
    vectorized_headlines = count_vectorizer.fit_transform(np.asarray(text_data))

    # Total occurrences per vocabulary column, flattened to a plain 1-D array
    # (summing a sparse matrix yields a 2-D np.matrix).
    totals = np.asarray(vectorized_headlines.sum(axis=0)).ravel()

    # Column indices sorted by descending count, limited to the requested number.
    top_indices = np.argsort(totals)[::-1][:n_top_words]

    # Invert the word -> column mapping to look the words up by column index;
    # words are returned as-is, so non-ASCII vocabulary is supported.
    index_to_word = {index: word for word, index in count_vectorizer.vocabulary_.items()}
    words = [index_to_word[index] for index in top_indices]

    return (words, totals[top_indices].tolist())
In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Count the 20 most frequent title words, ignoring scikit-learn's English stop words.
count_vectorizer = CountVectorizer(stop_words='english')
words, word_values = get_top_n_words(
    n_top_words=20,
    count_vectorizer=count_vectorizer,
    text_data=reindexed_data1,
)

# Bar chart: one bar per word, labelled with the word itself.
fig, ax = plt.subplots(figsize=(24, 8))
bar_positions = range(len(words))
ax.bar(bar_positions, word_values)
ax.set_xticks(bar_positions)
ax.set_xticklabels(words)
ax.set_title('Top 20 Words in Titles: White House')


In [None]:
# Number of titles per day ('D') and per month ('M'), based on the date index.
daily_counts = reindexed_data1.resample('D').count()
monthly_counts = reindexed_data1.resample('M').count()

# Two stacked plots: daily counts on top, monthly counts below.
fig, ax = plt.subplots(2, figsize=(24, 16))
daily_axis, monthly_axis = ax
daily_axis.plot(daily_counts)
daily_axis.set_title('White House News Daily Counts')
monthly_axis.plot(monthly_counts)
monthly_axis.set_title('White House News Monthly Counts')


In [None]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
In [12]:
# Load the cleaned data and keep the titles (missing ones as '') in a one-column dataframe.
df_usa = pd.read_csv('USA_Cleaned.csv')
df_usa_titles = df_usa['titles'].fillna('').to_frame(name='titles')
In [13]:
# Concatenate all titles into one space-separated string for the word cloud.
text_usa = " ".join(df_usa_titles['titles'])
In [14]:
# Word cloud settings, passed to WordCloud as keyword arguments.
cloud_options = dict(
    width=1600,
    height=800,
    max_words=100,
    background_color="white",
    collocations=False,
)
wordcloud = WordCloud(**cloud_options).generate(text_usa)

# Render the cloud without axes or padding.
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
## LDA topic modelling: imports and one-off setup
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)  # seed NumPy's global random number generator
import nltk
nltk.download('wordnet')  # download the NLTK 'wordnet' resource

In [None]:
# Load the cleaned data, read as UTF-8.
df_usa = pd.read_csv(filepath_or_buffer='USA_Cleaned.csv', encoding='utf-8')
In [17]:
# Full-text column with missing entries replaced by empty strings.
df_usa_texts = df_usa['texts'].fillna(value='')
In [18]:
from nltk.corpus import stopwords
# NLTK's English stop-word list.
# NOTE(review): stop_words is not referenced by the following visible cells — confirm it is still needed.
stop_words = stopwords.words('english')
In [19]:
# Split every text on whitespace into a list of tokens.
tokenized_usa_texts = df_usa_texts.apply(str.split)
In [20]:
# Build the gensim dictionary from the tokenized texts.
dictionary_usa = gensim.corpora.Dictionary(documents=tokenized_usa_texts)
In [21]:
# Bag-of-words representation of every tokenized text.
bow_corpus_usa = list(map(dictionary_usa.doc2bow, tokenized_usa_texts))
In [22]:
from gensim import corpora, models
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same corpus.
tfidf_usa = models.TfidfModel(corpus=bow_corpus_usa)
corpus_tfidf_usa = tfidf_usa[bow_corpus_usa]
In [23]:
# LDA on the bag-of-words corpus: 6 topics, 5 passes, 2 worker processes.
lda_model_usa = gensim.models.LdaMulticore(
    corpus=bow_corpus_usa,
    num_topics=6,
    id2word=dictionary_usa,
    passes=5,
    workers=2,
)
In [24]:
# Print every topic of the bag-of-words model (-1 requests all topics).
for topic_id, topic_terms in lda_model_usa.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

In [None]:
# LDA on the TF-IDF weighted corpus: 6 topics, 5 passes, 2 worker processes.
lda_model_tfidf_usa = gensim.models.LdaMulticore(
    corpus=corpus_tfidf_usa,
    num_topics=6,
    id2word=dictionary_usa,
    passes=5,
    workers=2,
)
In [26]:
# Print every topic of the TF-IDF model (-1 requests all topics).
for topic_id, topic_terms in lda_model_tfidf_usa.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))