In [1]:
# import
import pandas as pd
import numpy as np
import re
import string
import pickle
import nltk
from time import time
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
# Penn Treebank POS tags of the only word classes that are kept: nouns and verbs.
_KEPT_POS_TAGS = frozenset(['NN', 'NNS', 'NNP', 'NNPS',
                            'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])

# Built lazily on the first call to custom_tokenizer and reused afterwards, so
# the stop-word corpus is not re-read and re-concatenated for every document.
# Re-running this cell resets both caches.
_STOP_WORDS = None
_LEMMATIZER = None


def _build_stop_words():
    """Return the combined stop-word set (general, web, domain, media, filler)."""
    general_stopwords = stopwords.words('english')
    # Also match apostrophe-free spellings ("don't" -> "dont").
    # NOTE(review): presumably punctuation was stripped upstream -- confirm.
    depunctuated_stopwords = [word.replace("'", "") for word in general_stopwords]

    web_stopwords = ['html', 'via', 'youtube', 'rt', 'twitter', 'tweet', 'tweets']
    domain_stopwords = ['donald', 'trade', 'war', 'tradewar',
                        'realdonaldtrump', 'trump', 'trumps', 'dtj', 'djt']
    media_stopwords = ['nyt', 'reuters', 'video', 'news', 'bloomberg', 'wsj', 'cnn',
                       'medium', 'ft', 'businessinsider', 'nytimes']
    garbage_stopwords = ['g', 'doesnt', 'e', 'he', 'youre', 'dont', 'thats', 'could',
                         'really', 'would', 'may', 'much', 'many', 'everything', 'any',
                         'get', 'everyone']

    return frozenset(general_stopwords + depunctuated_stopwords + web_stopwords
                     + domain_stopwords + media_stopwords + garbage_stopwords)


def custom_tokenizer(text):
    """Reduce a document to a space-joined string of noun/verb lemmas.

    Steps: tokenize with ``word_tokenize``, POS-tag with ``nltk.pos_tag``,
    keep only noun and verb tokens, drop stop words, lemmatize with
    ``WordNetLemmatizer`` and drop stop words once more.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ("" if nothing survives).
    """
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = _build_stop_words()
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    # tokenize and tag; pos_tag yields (token, tag) pairs in token order
    tagged_tokens = nltk.pos_tag(word_tokenize(text))

    # keep nouns and verbs only
    content_tokens = [token for token, tag in tagged_tokens if tag in _KEPT_POS_TAGS]

    # remove stop words (matching is case-sensitive, as the lists are lowercase)
    content_tokens = [token for token in content_tokens if token not in _STOP_WORDS]

    # lemmatize
    lemmas = [_LEMMATIZER.lemmatize(token) for token in content_tokens]

    # Lemmatizing can turn a non-stop word into a stop word ("wars" -> "war"),
    # so filter a second time to keep those from leaking into the output.
    lemmas = [lemma for lemma in lemmas if lemma not in _STOP_WORDS]

    return " ".join(lemmas)

In [3]:
# Load the pickled documents from disk and report the elapsed wall-clock time.
# NOTE: unpickling can execute arbitrary code -- only load a file you trust.
print("Loading dataset...")
t0 = time()
with open('02_documents_en.pickle', 'rb') as pickle_file:
    trade_war_documents = pickle.load(pickle_file)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

Loading dataset...
done in 0.597s.


In [4]:
# Run custom_tokenizer over the loaded documents via .apply and time it.
# The result replaces trade_war_documents, so re-running this cell without
# reloading would tokenize the already-tokenized text again.
print("Tokenizing and joining...")
t0 = time()
trade_war_documents = trade_war_documents.apply(custom_tokenizer)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

Tokenizing and joining...
done in 1909.197s.


In [5]:
# Persist the tokenized documents to disk (highest pickle protocol) and
# report the elapsed wall-clock time.
print("Pickling")
t0 = time()
with open('03_tokentext_en.pickle', 'wb') as pickle_file:
    pickle.dump(trade_war_documents, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

Pickling
done in 1.638s.
