In [1]:
import pandas as pd
import pickle

tweets = pd.read_pickle('./tweets_tract.pkl')

In [24]:
# Preparing Documents

import re

URL_PATTERN = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

doc_complete = map(lambda text: re.sub(URL_PATTERN, '', text), tweets['text'].tolist())

In [25]:
# Cleaning and Preprocessing

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = ' '.join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]

In [26]:
# Preparing Document-Term Matrix

import gensim
from gensim import corpora

dictionary = corpora.Dictionary(doc_clean)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [27]:
# Running LDA model

import time
Lda = gensim.models.ldamodel.LdaModel
start = time.time()
ldamodel = Lda(doc_term_matrix, num_topics=25, id2word = dictionary, passes=20)
print time.time() - start

5143.28146315


In [28]:
print(ldamodel.print_topics(num_topics=5, num_words=10))

[(3, u'0.052*"city" + 0.049*"salt" + 0.036*"lake" + 0.035*"come" + 0.026*"mango" + 0.024*"it" + 0.015*"try" + 0.015*"year" + 0.014*"lamb" + 0.013*"shrimp"'), (12, u'0.081*"today" + 0.072*"2016" + 0.057*"pressure" + 0.057*"rain" + 0.054*"trump" + 0.054*"orchard" + 0.054*"forecast" + 0.053*"tempcrab" + 0.036*"000in" + 0.030*"fine"'), (23, u'0.042*"best" + 0.042*"u" + 0.034*"need" + 0.020*"cold" + 0.018*"coffee" + 0.017*"it" + 0.016*"ever" + 0.010*"boy" + 0.010*"ive" + 0.009*"hand"'), (19, u'0.116*"orange" + 0.049*"ca" + 0.039*"great" + 0.035*"job" + 0.024*"anyone" + 0.021*"walnut" + 0.021*"you" + 0.020*"im" + 0.020*"recommend" + 0.019*"hill"'), (16, u'0.155*"coffee" + 0.050*"taco" + 0.026*"morning" + 0.020*"go" + 0.019*"bell" + 0.017*"shop" + 0.015*"love" + 0.014*"drink" + 0.013*"time" + 0.013*"way"')]


In [29]:
from gensim import corpora, models
import pyLDAvis.gensim

In [30]:
tweets_data =  pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
pyLDAvis.display(tweets_data)

In [32]:
ldamodel.save('ldamodel_25topic.lda')

In [37]:
help(ldamodel.get_document_topics)

Help on method get_document_topics in module gensim.models.ldamodel:

get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False) method of gensim.models.ldamodel.LdaModel instance
    Return topic distribution for the given document `bow`, as a list of
    (topic_id, topic_probability) 2-tuples.
    
    Ignore topics with very low probability (below `minimum_probability`).
    
    If per_word_topics is True, it also returns a list of topics, sorted in descending order of most likely topics for that word.
    It also returns a list of word_ids and each words corresponding topics' phi_values, multiplied by feature length (i.e, word count)



In [67]:
print tweets.loc[1, 'text']
print doc_term_matrix[1]
print ldamodel.get_document_topics(doc_term_matrix[1])

Aesthetics @ Quills Coffee https://t.co/rp1FZKMgem
[(1, 1), (4, 1), (5, 1)]
[(0, 0.01), (1, 0.01), (2, 0.01), (3, 0.01), (4, 0.01), (5, 0.01), (6, 0.01), (7, 0.010000000000106136), (8, 0.01), (9, 0.01), (10, 0.01), (11, 0.01), (12, 0.01), (13, 0.01), (14, 0.01), (15, 0.01), (16, 0.75999999999959056), (17, 0.01), (18, 0.01), (19, 0.01), (20, 0.010000000000099292), (21, 0.01), (22, 0.010000000000108355), (23, 0.010000000000095596), (24, 0.01)]


In [94]:
tweets_lda = tweets.loc[:, ['created_at', 'text', 'lat', 'lon', 'source', 'block']]

In [82]:
topic_dict = (dict(ldamodel.get_document_topics(bow)) for bow in doc_term_matrix)

In [83]:
topic_prob_df = pd.DataFrame.from_dict(topic_dict, dtype='float64')

In [85]:
topic_prob_df.fillna(0, inplace=True)

In [88]:
topic_prob_df.shape

(109386, 25)

In [90]:
tweets_lda.shape

(109386, 6)

In [102]:
tweets_prob = pd.concat([tweets_lda, topic_prob_df], axis=1)

In [103]:
tweets_prob

Unnamed: 0,created_at,text,lat,lon,source,block,0,1,2,3,...,15,16,17,18,19,20,21,22,23,24
0,2016-12-10 15:57:45,Cozy with coffee @ Central Park https://t.co/...,40.772743,-73.972216,Instagram,360610143001034,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.208000,0.138152,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.277848
1,2016-12-10 15:57:50,Aesthetics @ Quills Coffee https://t.co/rp1FZK...,39.779118,-86.163775,Instagram,180973910001024,0.010000,0.010000,0.010000,0.010000,...,0.010000,0.760000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000,0.010000
2,2016-12-10 15:57:53,Sipping coffee thinking about warmer weather.....,30.822630,-88.058797,Instagram,010970056002082,0.000000,0.000000,0.130000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.165571,0.000000,0.000000,0.344429,0.000000
3,2016-12-10 15:58:09,Peach Brandy Prime Pork Chops. #tiffanystapand...,41.493620,-75.577470,Instagram,420691111005019,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.350578,0.000000,0.175022,0.000000,0.086667,0.000000,0.086667,0.000000,0.000000
4,2016-12-10 15:58:30,Can you recommend anyone for this #job? Server...,37.926002,-122.017174,TweetMyJOBS,060133383011000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.519083,0.000000,0.000000,0.000000,0.000000,0.000000
5,2016-12-10 15:58:40,"Is it obvious we are in the Christmas mood, ye...",41.907500,-87.676900,Instagram,170312414001010,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,2016-12-10 15:58:41,This!!!!! Coastal Lobster Roll....one pound Ma...,38.917843,-77.218870,Instagram,510594802022023,0.000000,0.000000,0.000000,0.000000,...,0.420000,0.000000,0.000000,0.086667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,2016-12-10 15:58:43,We're #hiring! Click to apply: Pastry Cook I -...,36.106152,-115.174460,TweetMyJOBS,320030067001023,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.503333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,2016-12-10 15:59:04,Ut oh someone needs his coffee this morning! #...,45.348200,-118.153890,Instagram,410619705003127,0.000000,0.000000,0.000000,0.151727,...,0.310091,0.174587,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.196323,0.000000
9,2016-12-10 15:59:13,Abita Vanilla Dog now available on tap. https...,41.311696,-72.933381,Beer Menus,090093614013005,0.115556,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.337778,0.000000,0.000000,0.000000,0.000000


In [104]:
tweets_prob.to_csv('./tweets_prob.csv',
                   index=False,
                   index_label=False)