# Load tweets

In [1]:
import pandas as pd
import json
import warnings
warnings.filterwarnings('ignore')
import spacy
nlp = spacy.load('en_core_web_lg')

In [2]:
input_file="tweets.json"
with open(input_file, encoding='utf8') as f:
    f.readline()
    tweets=list()
    # read the file
    
    for (i, line) in enumerate(f):
        try:
            if line.strip()[-1] == ',':
                data = json.loads(line.strip()[:-1])
            else:
                data = json.loads(line.strip())
                
            tweets.append(data['doc']['text'])
        except:
            continue
    
            

In [3]:
output_file="tweets.txt"
with open(output_file, 'w') as f:
    for line in tweets:
        f.write(line+"\n")
    

In [4]:
len(tweets)

5000

In [5]:
# entities test

doc=nlp(tweets[8])
for ent in doc.ents:
    print(ent.text, ent.label_)

ALP ORG
Greens NORP
USA GPE
US$20m MONEY
NRA ORG


# Preprocessing

In [6]:
import nltk
from nltk import word_tokenize
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
import re

# pre process documents removing stop words, non alpha and lemmatizing
stopwords = list(nltk.corpus.stopwords.words('english')) # extend stop words if necessary
stopwords.extend(['country','money','deal','pay','think','want','like','even','any','election','member','back','people','case','sure','would','could','tell','ad','hey','must','yes','say','much','go','vote','ca','get','many','win','wow','rt','http','billshortenmp','labor','campaign','one','senator','prime''labor','green','party','libs','palmer','anning','capricornia','candidate','landry','michelle','preference','lnp','alp','tweet','liberal','clive','australia','australian','jimmolan','clivefpalmer','fraser','vic','nsw','qld'])


# lemmatize by pos verb, noun or adv
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
        
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'a')
        
    return lemma

def doc_preprocess(corpus, remove_ht_mentions=True,remove_entities=True):
    new_corpus=[]
    for doc in corpus:
        new_doc=[]
        # remove hashtags and mentions
        if (remove_ht_mentions):
            doc=re.sub("(^|\s)(@|#)(\w+)","",doc).strip()
        
        if (remove_entities):
            nlp_doc=nlp(doc)
            for ent in nlp_doc.ents:
                doc=doc.replace(ent.text,"")
        
        for word in word_tokenize(doc):
            new_word=word.lower()
            if word.isalpha() and new_word not in stopwords:
                new_word = lemmatize(new_word)
                if new_word not in stopwords:
                    new_doc.append(new_word)
        new_corpus.append(new_doc)
    return new_corpus




In [7]:
preprocessed_docs=doc_preprocess(tweets)
words=list()
for tweet in preprocessed_docs:
    for word in tweet:
        words.append(word)

# Get frequencies

In [8]:
text=nltk.Text(words)
fdist=nltk.FreqDist(text)
fdist.plot(20)

<Figure size 640x480 with 1 Axes>

In [9]:
text.collocations()

press conference; whole game; game beat; beat form; afraid anywhere;
feel afraid; carbon disaster; conference feel; astound commit; pretty
certain; behind pretty; defend gross; health implication; severe
health; disaster basin; big carbon; implication young; commit big;
speak severe; barbaric speak


In [10]:
text.concordance('ca')

no matches


# Topic model

In [11]:
import gensim as gs
import numpy as np
nltk.download('wordnet')

guess_num_clusters=10

# create a dictionary from the documents
docs_dict=gs.corpora.dictionary.Dictionary(preprocessed_docs)

# generate bow representation for every document
bow_docs = [docs_dict.doc2bow(doc) for doc in preprocessed_docs]

# LDA Topic model
ldamodel = gs.models.ldamodel.LdaModel(bow_docs, num_topics=guess_num_clusters, id2word=docs_dict, passes=15, random_state=np.random.RandomState(42))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danielgil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
def pprint_topics(ldamodel, num_words=20):
    topics = ldamodel.print_topics(num_words=num_words)
    word_lists = [(t[0], t[1]) for t in topics]
    word_lists = [(t[0], [w.split('*')[1] for w in t[1].split(' + ')]) for t in word_lists]
    topic_ids = [t[0] for t in word_lists]
    word_lists = [' '.join([w[1:-1] for w in t[1]]) for t in word_lists]
    for t_id, w_list in zip(topic_ids, word_lists):
        print('%d:\t%s' % (t_id, w_list))

pprint_topics(ldamodel, num_words=20)

0:	whole form politician look labor mine man work death via new make everyone cost versus time project billshortenmp report cut
1:	fail try make without lose really fact happen hope complete present remember shorten fight school murder independent destroy find right
2:	pretty water certain take senator condemnation international human town still toxic supporter coalition let tax group bottle help hard sight
3:	game amp speak follow great young health defend stand comment barbaric ask gross implication severe face good oh man vile
4:	give bank behind interest agree threat include boast therefore heck wonder stuff provide solution open receive else primary work another
5:	time job show senator please seek last coalition seat amp plan glad good owe know right abolish unfair activity step
6:	need policy yet see know public run change stop welcome ever ban die climate rude keep commitment well power u
7:	beat support press conference feel afraid indicate anywhere taxpayer delete save appear

## Topic visualization

In [13]:
import pyLDAvis
from pyLDAvis import gensim

# plot our first guess
vis_guess = pyLDAvis.gensim.prepare(ldamodel, bow_docs, docs_dict)
pyLDAvis.display(vis_guess)