In [1]:
# import
import pandas as pd
import numpy as np
import re
import string
import pickle
import nltk
from time import time
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
from datetime import datetime
import datetime as dt

In [4]:
# Load the pickled corpus that is vectorised with tf-idf further down.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a trusted, locally produced pickle.
print("Loading dataset...")
t0 = time()
with open('03_tokentext_en.pickle', 'rb') as handle:
    # both names refer to the same unpickled object
    dataset = trade_war_tokentext = pickle.load(handle)
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 0.346s.


In [5]:
# parameters
# Vocabulary cap: passed as max_features to the TfidfVectorizer.
n_features = 2000
# Number of NMF components (topics) to fit.
n_components = 5
# Number of highest-weighted words printed per topic by print_top_words.
n_top_words = 10

In [6]:
# define stopwords
# NLTK's English list plus an apostrophe-free variant of every entry,
# de-duplicated through a set (so the resulting order is arbitrary).
nltk_english = stopwords.words('english')
depun_gen_stop = [word.replace("'","") for word in nltk_english]
general_stopwords = list(set(nltk_english + depun_gen_stop))

# Hand-curated additions, grouped by theme.
web_stopwords = [
    'html','via','youtube','rt','twitter','tweet','tweets',
]
domain_stopwords = [
    'donald','trade','war','tradewar',
    'realdonaldtrump','trump','trumps','dtj','djt',
    'president','america','american',
]
media_stopwords = [
    'nyt','reuters','video','news','bloomberg','wsj','cnn','medium',
    'newspaper','insider',
]
# NOTE(review): "gabage" is presumably "garbage"; the name is kept because it
# is a notebook-level global that other cells may reference.
gabage_stopwords = [
    'g','doesnt','e','he','youre','dont','thats','could',
    'really','would','may','much','many','everything','any',
    'get','everyone','going','one','nobody','cant','nothing','lot',
    'think','know',
    'make','go','warhtml','say','day','week','look','said','want',
    'plan','set','time',
    'need','see','way','year','thing','people','move','state','take',
]

# Final list handed to the vectorizer: general list first, then each themed group.
stop_words = [
    *general_stopwords,
    *web_stopwords,
    *domain_stopwords,
    *media_stopwords,
    *gabage_stopwords,
]

In [7]:
# functions
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model
        Object exposing ``components_``; it is iterated row by row and each
        row must support ``argsort()`` (one row per topic).
    feature_names
        Indexable that maps a column index of ``components_`` to its word.
    n_top_words
        How many words to list per topic.

    Prints one line ``Topic #<idx>: w1 w2 ...`` per topic, with words ordered
    from largest to smallest weight, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it to walk from heaviest to lightest
        descending = weights.argsort()[::-1]
        top_words = [feature_names[col] for col in descending[:n_top_words]]
        print("Topic #%d: " % topic_idx + " ".join(top_words))
    print()

In [8]:
# tokenization and modeling

# Build tf-idf features for NMF: terms present in more than 40% of the
# documents are dropped (max_df), the custom stop words are removed and the
# vocabulary is capped at n_features terms.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    max_features=n_features,
    max_df=0.4,
)
t0 = time()  # only the fit/transform step is timed
tfidf = tfidf_vectorizer.fit_transform(dataset)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)
print('--------------------------------------')

Extracting tf-idf features for NMF...
done in 11.903s.
--------------------------------------


In [9]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_components=%d..."
      % n_components)
t0 = time()
try:
    # Legacy API: a single `alpha` regularises both factor matrices.
    nmf = NMF(n_components=n_components, random_state=1,
              alpha=.1, l1_ratio=.5)
except TypeError:
    # Newer scikit-learn releases removed `alpha` in favour of
    # alpha_W / alpha_H, whose penalty terms are additionally multiplied by
    # n_features and n_samples respectively (see the documented objective).
    # Dividing by those sizes keeps the penalty on the legacy scale, and
    # init='nndsvd' pins the initialisation older releases picked by default.
    # NOTE(review): results should be close to, but are not guaranteed to be
    # identical with, the topics recorded from the legacy run.
    n_docs, n_terms = tfidf.shape
    nmf = NMF(n_components=n_components, random_state=1, init='nndsvd',
              alpha_W=.1 / n_terms, alpha_H=.1 / n_docs, l1_ratio=.5)
nmf = nmf.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
# get_feature_names() was removed from newer scikit-learn releases; prefer its
# replacement when available and keep a plain list either way.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)
print('--------------------------------------')

Fitting the NMF model (Frobenius norm) with tf-idf features, n_components=5...
done in 34.357s.

Topics in NMF model (Frobenius norm):
Topic #0: china hold talk lose fight winning beijing business escalates history
Topic #1: tariff steel good eu import hit escalates aluminum canada threatens
Topic #2: fear stock market dow point street fall share investor ease
Topic #3: win pompeo canada weapon eu country help lose imf reason
Topic #4: farmer economy hurt started world start job country business soybean

--------------------------------------


In [10]:
# get the two factor matrices of the fitted NMF model

# topic-by-term weights (one row per topic)
topic_word = nmf.components_
# per-document topic weights (one row per document in tfidf)
doc_topic = nmf.transform(tfidf)

# DataFrame views of the same matrices
df_topic_word = pd.DataFrame(topic_word)
df_doc_topic = pd.DataFrame(doc_topic)

In [11]:
# load df
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a trusted, locally produced pickle.
with open('02_df_en.pickle', 'rb') as handle:
    # fresh 0..n-1 index: rows are later looked up by position in doc_topic
    df = pickle.load(handle).reset_index(drop=True)

In [12]:
# figure out each document's dominant topic; documents with no topic weight at all are flagged as garbage (-1)

def assign_topic(indx):
    """Return the dominant topic of document ``indx``.

    Reads row ``indx`` of the global ``doc_topic`` matrix. If the row sums to
    zero the document carries no topic weight and -1 is returned; otherwise
    the position of the largest weight is returned.
    """
    weights = doc_topic[indx]
    return -1 if weights.sum() == 0 else np.argmax(weights)

In [13]:
# assign topics and filter out garbage

# Label every row with its dominant topic (-1 marks rows with no topic weight).
df['topic'] = [assign_topic(i) for i in df.index]
# Keep only rows that received a topic. The explicit .copy() makes the result
# an independent frame, so adding or overwriting columns on it afterwards does
# not trigger pandas' SettingWithCopyWarning. The original row labels are kept.
df = df[df['topic']!=-1].copy()

In [14]:
# assign topic3 to topic1, topic4 to topic3

def change_topic(i):
    """Fold topic 3 into topic 1; any other label is returned unchanged."""
    return 1 if i == 3 else i

def change_topic2(i):
    """Renumber topic 4 as topic 3; any other label is returned unchanged."""
    return 3 if i == 4 else i


In [15]:
# Order matters: topic 3 is merged into topic 1 first, and only then is
# topic 4 renumbered to the freed-up label 3.
df['topic'] = df['topic'].apply(change_topic).apply(change_topic2)

In [16]:
# map topic names
# Display names keyed by topic id 0..3.
# NOTE(review): 'Stock Maket' looks like a typo for 'Stock Market'; it is left
# untouched because the label is written into the saved frame and may be
# matched by downstream code.
mapp = dict(enumerate([
    'China & Fight',
    'Tariff & EU, Canada',
    'Stock Maket',
    'Farmer & Economy & Job',
]))

def map_name(num):
    """Return the display name for topic id ``num`` (KeyError if it is unknown)."""
    return mapp[num]

# add the human-readable label next to the numeric topic id
df['topic_name'] = df['topic'].apply(map_name)

In [17]:
# Preview the first rows, including the new topic and topic_name columns.
df.head()

Unnamed: 0,username,date,retweets,favorites,text,mentions,hashtags,id,permalink,topic,topic_name
0,WineTasteAddict,2018-03-30 19:57,0,0,trumps trade war with china could hurt califor...,,,979870258382110721,https://twitter.com/WineTasteAddict/status/979...,0,China & Fight
1,WineTasteAddict,2018-03-30 19:57,0,0,trumps trade war with china could hurt califor...,,,979870182263918592,https://twitter.com/WineTasteAddict/status/979...,0,China & Fight
2,chrischandler,2018-03-30 19:51,0,0,how to win a trade war,,,979868626240012288,https://twitter.com/chrischandler/status/97986...,1,"Tariff & EU, Canada"
3,BcabaNetwork,2018-03-30 19:47,0,0,asias small open economies may suffer in ameri...,,,979867757767426048,https://twitter.com/BcabaNetwork/status/979867...,3,Farmer & Economy & Job
4,CharmCity2052,2018-03-30 19:44,0,0,china owns more of our debt than any country i...,,,979867078860689410,https://twitter.com/CharmCity2052/status/97986...,0,China & Fight


In [18]:
# pickling
# Persist the topic-labelled frame with the highest pickle protocol available.
with open('04_df_en.pickle', 'wb') as out_file:
    pickle.dump(df, out_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# pca plot
# Project the tf-idf vector of every kept document onto its first two
# principal components and colour the points by assigned topic.
from sklearn.decomposition import PCA

# Joining on the index pairs each row of df with the tf-idf row carrying the
# same label, so rows filtered out of df are left out of the plot as well.
# NOTE(review): tfidf.toarray() densifies the whole matrix, which can be
# memory hungry on a large corpus.
temp = df[['topic']].join(pd.DataFrame(tfidf.toarray()))
X = temp.drop('topic',axis=1)

pca = PCA(n_components = 2)
PCAxy = pca.fit_transform(X)

# plot data with seaborn
data = pd.DataFrame({'x': PCAxy[:, 0],
                     'y': PCAxy[:, 1],
                     'labels': list(temp['topic'])})

# `height` replaces lmplot's former `size` argument, which current seaborn no
# longer accepts. `legend_out` is omitted because True is already the default.
facet = sns.lmplot(data=data, x='x', y='y', hue='labels',
                   fit_reg=False, legend=True, height=12, aspect=1)

In [None]:
# get unique user names for location scraping
users = df['username'].unique()

# Persist the array of distinct usernames with the highest pickle protocol.
with open('users.pickle', 'wb') as out_file:
    pickle.dump(users, out_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# # get the most retweeted users top 1000

# top_re_users = (df.groupby('username',as_index=False).agg({'retweets':sum})
#                                 .sort_values(by='retweets',ascending=False)
#                                 .head(1000)['username'].tolist())
# # pickling

# with open('top_re_users.pickle', 'wb') as handle:
#     pickle.dump(top_re_users, handle, protocol=pickle.HIGHEST_PROTOCOL)