# Load tweets

In [18]:
import pandas as pd
import json
import warnings
warnings.filterwarnings('ignore')

In [2]:
input_file="tweets.json"
tweets=list()
with open(input_file, encoding='utf8') as f:
    # The first line is read and discarded rather than parsed
    # (presumably a wrapper/header line of the dump -- confirm against the file).
    f.readline()

    # Every remaining line is expected to hold one JSON record, optionally
    # followed by a trailing comma that has to be removed before parsing.
    for line in f:
        record = line.strip()
        if record.endswith(','):
            record = record[:-1]

        try:
            data = json.loads(record)
            tweets.append(data['doc']['text'])
        except (json.JSONDecodeError, KeyError, TypeError):
            # Best-effort load: skip blank/non-JSON lines and records that
            # lack a doc.text field, but let real failures (and Ctrl-C)
            # propagate instead of being silently swallowed.
            continue
    
            

In [3]:
output_file="tweets.txt"
# Write with the same explicit UTF-8 encoding that was used to read the input;
# the platform default encoding may be unable to represent emoji and other
# non-ASCII characters and would then raise UnicodeEncodeError.
with open(output_file, 'w', encoding='utf8') as f:
    # One tweet per line
    for line in tweets:
        f.write(line+"\n")
    

In [5]:
# Display how many tweet texts are held in the `tweets` list.
len(tweets)

5000

# Preprocessing

In [6]:
import nltk
from nltk import word_tokenize
# Fetch the NLTK resources used below *before* their first use, so the notebook
# also runs on a fresh kernel / fresh machine: the stop-word list, the WordNet
# data behind the lemmatizer, and the tokenizer model used by word_tokenize.
# NOTE(review): recent NLTK releases may require 'punkt_tab' instead of 'punkt'
# -- confirm against the installed version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# Stop words removed during preprocessing: NLTK's English list plus the
# Twitter/URL artefacts 'rt' and 'http'. Extend this list if necessary.
stopwords = list(nltk.corpus.stopwords.words('english'))
stopwords.extend(['rt','http'])

# Lemmatize by part of speech: try verb, then noun, then adjective.
def lemmatize(word):
    """Return a lemma for `word`, trying several parts of speech in turn.

    The word is lemmatized as a verb ('v'), then as a noun ('n'), then as an
    adjective ('a'). The first result that differs from `word` is returned;
    if none differs, the (unchanged) result of the last attempt is returned.
    """
    for pos in ('v', 'n', 'a'):
        lemma = lemmatizer.lemmatize(word, pos)
        if lemma != word:
            # This part of speech produced a different form; stop here.
            break
    return lemma

def doc_preprocess(corpus):
    """Tokenize, filter and lemmatize every document in `corpus`.

    For each document the text is split with `word_tokenize`; tokens that are
    not purely alphabetic are dropped, the rest are lower-cased, stop words are
    removed, the survivors are lemmatized, and any lemma that is itself a stop
    word is removed as well.

    Parameters
    ----------
    corpus : iterable of str
        The raw documents.

    Returns
    -------
    list of list of str
        One token list per input document, in the original order. A document
        with no surviving tokens yields an empty list.
    """
    # Snapshot the stop words into a set once per call: membership is tested
    # twice per token, and scanning the list each time is needlessly linear.
    stop_set = set(stopwords)

    new_corpus = []
    for doc in corpus:
        new_doc = []
        for word in word_tokenize(doc):
            if not word.isalpha():
                continue
            new_word = word.lower()
            if new_word in stop_set:
                continue
            new_word = lemmatize(new_word)
            # Lemmatizing can turn a word into a stop word, so check again.
            if new_word not in stop_set:
                new_doc.append(new_word)
        new_corpus.append(new_doc)
    return new_corpus


In [10]:
# Clean every tweet into a list of tokens ...
preprocessed_docs=doc_preprocess(tweets)

# ... then flatten the per-tweet token lists into one list of all tokens.
words=[]
for tweet in preprocessed_docs:
    words.extend(tweet)

# Get frequencies

In [11]:
# Wrap the flattened token list in an nltk.Text and build a frequency
# distribution over it.
text=nltk.Text(words)
fdist=nltk.FreqDist(text)
# Last expression of the cell: the 10 most common tokens with their counts.
fdist.most_common(10)

[('clivefpalmer', 1492),
 ('labor', 818),
 ('abcnews', 792),
 ('jessvanvonderen', 779),
 ('abcbrisbane', 767),
 ('green', 746),
 ('auspol', 432),
 ('preference', 424),
 ('party', 389),
 ('money', 348)]

# Topic model

In [12]:
import gensim as gs
import numpy as np
# Make sure the WordNet data is available locally.
nltk.download('wordnet')

# Initial guess for the number of topics to fit.
guess_num_clusters=15

# Dictionary built from the preprocessed (tokenized) documents.
docs_dict=gs.corpora.dictionary.Dictionary(preprocessed_docs)

# Bag-of-words representation of every document.
bow_docs = list(map(docs_dict.doc2bow, preprocessed_docs))

# Fit the LDA topic model; the random state is seeded with 42.
ldamodel = gs.models.ldamodel.LdaModel(
    corpus=bow_docs,
    num_topics=guess_num_clusters,
    id2word=docs_dict,
    passes=15,
    random_state=np.random.RandomState(42),
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/danielgil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
def pprint_topics(ldamodel, num_words=20, num_topics=20):
    """Print one line per topic: the topic id, a tab, then its words.

    The words are taken from the model's structured
    ``show_topics(..., formatted=False)`` output instead of being parsed back
    out of the formatted ``'weight*"word" + ...'`` strings, so tokens that
    contain ``*`` or ``' + '`` are printed intact.

    Parameters
    ----------
    ldamodel : object
        A fitted topic model exposing a gensim-style ``show_topics`` method.
    num_words : int, optional
        How many words to print for each topic.
    num_topics : int, optional
        Maximum number of topics to print. Defaults to 20, the limit the
        previous ``print_topics``-based implementation applied implicitly.

    Returns
    -------
    None
        The topics are written to standard output.
    """
    topics = ldamodel.show_topics(
        num_topics=num_topics, num_words=num_words, formatted=False
    )
    for t_id, word_weights in topics:
        # Each entry is a (word, weight) pair; only the word is printed.
        w_list = ' '.join(word for word, _ in word_weights)
        print('%d:\t%s' % (t_id, w_list))

# Print ten words per topic for the fitted model.
pprint_topics(ldamodel, num_words=10)

0:	alp kieragorden could clivefpalmer certain morrison last stand scott green
1:	amp lnp labor anning candidate member fraser preference landry michelle
2:	pay worker labor say mine clivefpalmer adani job senator qld
3:	clivefpalmer abcnews jessvanvonderen abcbrisbane auspol money sure much pretty think
4:	case would even barriecassidy labor big back julianburnside commit libs
5:	deal palmer preference clive scottmorrisonmp game onenationaus clivefpalmer party billshortenmp
6:	clivefpalmer give want take town taxpayer stuff back get think
7:	party onenationaus paulinehansonoz green attack liberalaus liberal anyone go votesustainable
8:	green labor skynewsaust jimmolan beat whole form politician behind krystaldenapoli
9:	australian go last clivefpalmer show time voter auspol country press
10:	sussanley rorts know right seat step plan pay public auspol
11:	green labor one vote look say work billshortenmp sciencepartyaus good
12:	election man australia get vote ban great people direction 

## Topic visualization

In [19]:
import pyLDAvis
from pyLDAvis import gensim

# Plot our first guess: prepare the pyLDAvis data from the fitted model, the
# bag-of-words corpus and the dictionary, then display it in the notebook.
# NOTE(review): newer pyLDAvis releases expose this module as
# `pyLDAvis.gensim_models` -- confirm against the installed version.
vis_guess = pyLDAvis.gensim.prepare(ldamodel, bow_docs, docs_dict)
pyLDAvis.display(vis_guess)