# 1.0 Topic Modelling

In [2]:
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
import pandas as pd
from timeit import default_timer as timer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

  def _formatters_default(self):
  def _deferred_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):


In [3]:
dataset = pd.read_csv('event_mentions_text_en.csv', encoding='latin-1').sort_values('EventTimeDate', ascending=1)
dataset.columns

  def _ipython_display_formatter_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _deferred_printers_default(self):


Index(['GLOBALEVENTID', 'EventTimeDate', 'MentionTimeDate', 'MentionType',
       'MentionSourceName', 'MentionIdentifier', 'SentenceID',
       'Actor1CharOffset', 'Actor2CharOffset', 'ActionCharOffset', 'InRawText',
       'Confidence', 'MentionDocLen', 'MentionDocTone',
       'MentionDocTranslationInfo', 'Extras', 'Authors', 'Publised', 'Text',
       'Keywords', 'Summaries', 'Images', 'Movies'],
      dtype='object')

# 2.0 English Model

In [4]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 1)
s = timer()
dtm_tf = tf_vectorizer.fit_transform(dataset.Text.dropna())
e = timer()
print(dtm_tf.shape)
print('%s seconds' %(e-s))

(24704, 48262)
30.073984 seconds


In [5]:
s = timer()
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(dataset.Text.dropna())
e = timer()
print(dtm_tfidf.shape)
print('%s seconds' %(e-s))

(24704, 48262)
33.8596937 seconds


In [6]:
# for TF DTM
#lda_tf = LatentDirichletAllocation(n_topics=20, random_state=0)
#lda_tf.fit(dtm_tf)

# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_topics=20, random_state=0)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=20, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [7]:
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

# 3.0 Spanish Model

In [8]:
from nltk.corpus import stopwords
stop = stopwords.words('spanish')

dataset = pd.read_csv('event_mentions_text_es.csv', encoding='latin-1').sort_values('EventTimeDate', ascending=1)

tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = stop,
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 1)
s = timer()
dtm_tf = tf_vectorizer.fit_transform(dataset.Text.dropna())
e = timer()
print(dtm_tf.shape)
print('%s seconds' %(e-s))

(47899, 87022)
54.61970340000005 seconds


In [9]:
s = timer()
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(dataset.Text.dropna())
e = timer()
print(dtm_tfidf.shape)
print('%s seconds' %(e-s))

(47899, 87022)
53.76178159999995 seconds


In [10]:
# for TF DTM
#lda_tf = LatentDirichletAllocation(n_topics=20, random_state=0)
#lda_tf.fit(dtm_tf)

# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_topics=20, random_state=0)
lda_tfidf.fit(dtm_tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=20, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [11]:
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)