<a href="https://colab.research.google.com/github/hadizh/tweet_topic_modeling/blob/master/tweet_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic Modeling

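## Setting up Colab environment

The code cells below read and write files under `PROJECT_PATH` (for example `PROJECT_PATH + '/tweets/'`, `companies.txt`, `models.txt`, and `data.db`), so define `PROJECT_PATH` as the path to your project folder before running them.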

## Scraping Data

In [0]:
!pip install retry
!pip install GetOldTweets3

In [0]:
import csv
import datetime

import GetOldTweets3 as got
from retry import retry

In [0]:
# Define function to scrape tweets
@retry(SystemExit, delay=60, backoff=2, max_delay=8*60)
def scrape_tweets(company, *, start_date, end_date):
    """Fetch the English-language tweets of one account in a date window.

    The call is retried whenever ``SystemExit`` is raised, waiting 60 seconds
    first and doubling the wait up to a cap of 8 minutes.
    NOTE(review): presumably GetOldTweets3 exits the process when Twitter
    throttles the scraper -- confirm against the library.

    Args:
        company: Twitter username passed to ``setUsername``.
        start_date: Value passed to ``setSince``; the calling cell supplies
            a "YYYY-MM-DD" string.
        end_date: Value passed to ``setUntil``; the calling cell supplies
            a "YYYY-MM-DD" string.

    Returns:
        Whatever ``TweetManager.getTweets`` returns for the criteria.
    """
    criteria = got.manager.TweetCriteria()
    criteria = criteria.setUsername(company)
    criteria = criteria.setSince(start_date)
    criteria = criteria.setUntil(end_date)
    criteria = criteria.setLang("en")
    return got.manager.TweetManager.getTweets(criteria)

# Define function to save company tweets to separate csv files
def save_tweets_to_csv(company, *, start_date, end_date):
    """Scrape one company's tweets and write them to ``tweets/<company>.csv``.

    Nothing is done when the CSV already exists, so finished companies are
    not scraped again.  The scrape runs BEFORE the file is created: creating
    the file first meant that a failed scrape left an empty CSV behind, and
    every later run then skipped that company silently.

    Args:
        company: Twitter username; also used as the CSV file name.
        start_date: Start of the window, forwarded to ``scrape_tweets``.
        end_date: End of the window, forwarded to ``scrape_tweets``.
    """
    # Local import: `os` is only imported in a later notebook cell.
    import os

    output_file = PROJECT_PATH + '/tweets/{}.csv'.format(company)
    if os.path.exists(output_file):
        return

    tweets = scrape_tweets(company, start_date=start_date, end_date=end_date)
    header = ['date', 'username', 'to', 'replies', 'retweets',
              'favorites', 'text', 'geo', 'mentions', 'hashtags',
              'id', 'permalink']
    try:
        # "x" still guards against another run creating the file meanwhile;
        # newline='' is what the csv module expects for files it writes.
        with open(output_file, "x", encoding="utf8", newline='') as out:
            csv_file = csv.writer(out, lineterminator='\n')
            csv_file.writerow(header)
            for t in tweets:
                data = [t.date.strftime("%Y-%m-%d %H:%M:%S"),
                        t.username,
                        t.to or '',  # empty string when the tweet has no recipient
                        t.replies,
                        t.retweets,
                        t.favorites,
                        t.text,
                        t.geo,
                        t.mentions,
                        t.hashtags,
                        t.id,
                        t.permalink]
                csv_file.writerow(map(str, data))
    except FileExistsError:
        # Someone else created the file in the meantime; keep their copy.
        pass

def delete_tweets_to_people(company):
    """Rewrite ``tweets/<company>.csv`` keeping only rows with an empty 'to'.

    The file is read and written as UTF-8 with ``newline=''`` and '\\n' row
    terminators so it matches what ``save_tweets_to_csv`` produces (the
    platform default encoding and the csv default '\\r\\n' were used before).

    Args:
        company: Twitter username whose CSV file is filtered in place.
    """
    path = PROJECT_PATH + '/tweets/{}.csv'.format(company)
    header = ['date', 'username', 'to', 'replies', 'retweets', 'favorites',
              'text', 'geo', 'mentions', 'hashtags', 'id', 'permalink']

    # Read everything first: the same path is truncated for writing below.
    with open(path, "r", encoding="utf8", newline='') as input_file:
        reader = csv.DictReader(input_file)
        output_rows = [row for row in reader if not row['to']]

    with open(path, "w", encoding="utf8", newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=header,
                                lineterminator='\n')
        writer.writeheader()
        writer.writerows(output_rows)

In [0]:
# Open file containing all companies we are interested in (one name per line).
# Blank lines are skipped so that they do not turn into an empty company name.
with open(PROJECT_PATH + '/companies.txt') as cf:
    companies = sorted(name for name in (line.strip() for line in cf) if name)
companies

In [0]:
# Save tweets for each company from the past three years (3 * 365 days).
# The clock is read once so both ends of the window come from the same instant.
now = datetime.datetime.now()
start_date = (now - datetime.timedelta(days=365*3)).strftime("%Y-%m-%d")
end_date = now.strftime("%Y-%m-%d")
for company in companies:
    save_tweets_to_csv(company, start_date=start_date, end_date=end_date)
    delete_tweets_to_people(company)

# Show tweets per company.  Rows are counted with the csv module (minus the
# header) so a quoted field containing a line break is not counted twice, and
# the file is decoded as UTF-8 because that is how it was written.
for company in companies:
    file = PROJECT_PATH + '/tweets/{}.csv'.format(company)
    with open(file, "r", encoding="utf8", newline='') as f:
        num_tweets = sum(1 for _ in csv.reader(f)) - 1
    print("{} : {}".format(company, num_tweets))


## Cleaning Data

Before we begin topic modeling, we must clean and transform our data. The cleaning process uses open-source code from this guide: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [0]:
!pip install --upgrade gensim
!pip install --upgrade pyLDAvis

In [0]:
import sqlite3
import pickle
import re
import numpy as np
import pandas as pd

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [0]:
# Download NLTK's stopword corpus and keep the English list in `stop_words`.
# `stop_words` is read by remove_stopwords() further down, which drops these
# words from every tokenized tweet.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [0]:
# Load tweet data for all companies
def load_tweets(company):
    """Return the 'text' column of ``tweets/<company>.csv`` as a list.

    Empty text cells are read by pandas as NaN; they are replaced with ''
    here because the cleaning step calls ``str()`` on every entry, which
    would otherwise turn each missing tweet into the word "nan".

    Args:
        company: Twitter username whose CSV file is loaded.

    Returns:
        One entry per CSV row, in file order.
    """
    df = pd.read_csv(PROJECT_PATH + '/tweets/{}.csv'.format(company))
    return df['text'].fillna('').tolist()

We first must clean our data by formatting, removing stop words, and lemmatizing.

In [0]:
# Define functions for formatting, stopwords, bigrams, and lemmatization
def format_texts(texts):
    """Normalize raw tweet texts before tokenization.

    Every entry is converted with ``str()`` and the following substitutions
    are applied in order:

    1. runs of whitespace (including newlines) become a single space;
    2. single quotes are dropped (so possessives lose their apostrophe);
    3. anything starting with ``http`` up to the next whitespace is removed;
    4. any whitespace-delimited token containing ``@`` followed by at least
       one character is removed.  This covers ``@user`` mentions as well as
       e-mail addresses; the previous pattern ``\\S+@\\S+`` required a
       character before the ``@`` and therefore missed ordinary mentions;
    5. ``#`` characters are dropped (the hashtag word itself is kept);
    6. the HTML entity ``&amp;`` is removed;
    7. a space is inserted at camel-case boundaries ("MachineLearning" ->
       "Machine Learning").

    Args:
        texts: Iterable of tweet texts (non-strings are stringified).

    Returns:
        A new list with one cleaned string per input entry.
    """
    # (pattern, replacement) pairs; the order matters, e.g. '#' must be
    # stripped before the camel-case split looks at the hashtag word.
    substitutions = (
        (re.compile(r'\s+'), ' '),
        (re.compile(r"\'"), ''),
        (re.compile(r'http\S+'), ''),
        (re.compile(r'\S*@\S+'), ''),
        (re.compile(r'#'), ''),
        (re.compile(r'&amp;'), ''),
        (re.compile(r'((?<=[a-z])[A-Z]|(?<=\S)[A-Z](?=[a-z]))'), r' \1'),
    )

    formatted = []
    for text in texts:
        text = str(text)
        for pattern, replacement in substitutions:
            text = pattern.sub(replacement, text)
        formatted.append(text)
    return formatted

def remove_stopwords(texts):
    """Tokenize each text and drop the tokens listed in ``stop_words``.

    Tokenization is done by gensim's ``simple_preprocess`` with accents
    removed (``deacc=True``) and tokens of up to 30 characters kept.

    Args:
        texts: Iterable of documents (each is passed through ``str()``).

    Returns:
        A list with one list of kept tokens per document.
    """
    # A set makes each membership test O(1); `stop_words` itself is a list.
    stop_set = set(stop_words)
    return [
        [word
         for word in simple_preprocess(str(doc), deacc=True, max_len=30)
         if word not in stop_set]
        for doc in texts
    ]

def make_ngrams(texts):
    """Merge frequently co-occurring tokens of ``texts`` into phrase tokens.

    A gensim ``Phrases`` model is fitted on ``texts`` itself (``min_count=4``,
    ``threshold=0.5``, NPMI scoring), frozen into a ``Phraser`` and then
    applied to every document.

    Args:
        texts: Tokenized documents; iterated twice (once to fit, once to
            transform).

    Returns:
        A list with one transformed document per input document.
    """
    phrase_model = gensim.models.Phrases(
        texts,
        min_count=4,
        threshold=0.5,
        scoring=gensim.models.phrases.npmi_scorer,
    )
    phraser = gensim.models.phrases.Phraser(phrase_model)

    merged_docs = []
    for tokens in texts:
        merged_docs.append(phraser[tokens])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with spaces, run through the spaCy
    'en' pipeline (parser and NER disabled for efficiency) and replaced by
    the lemmas of the tokens whose coarse POS tag is in ``allowed_postags``.

    Args:
        texts: Iterable of token lists.
        allowed_postags: Collection of spaCy ``pos_`` values to keep.  The
            default is a tuple rather than a list so that the default object
            cannot be mutated between calls.

    Returns:
        A list with one list of lemmas per input document.
    """
    nlp = spacy.load('en', disable=['parser', 'ner'])
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a batched stream and yields one Doc
    # per input string, in order, instead of one nlp() call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [0]:
# Define function for cleaning tweets
def clean_tweets(company):
    """Run the full cleaning pipeline on one company's stored tweets.

    Steps, in order: load the tweet texts, normalize them, tokenize and drop
    stop words, merge n-grams, and lemmatize keeping only nouns, adjectives,
    verbs and adverbs.

    Args:
        company: Twitter username whose CSV file is loaded.

    Returns:
        A list with one list of lemmas per tweet.
    """
    raw_texts = load_tweets(company)
    formatted = format_texts(raw_texts)
    tokens = remove_stopwords(formatted)
    with_ngrams = make_ngrams(tokens)
    return lemmatization(with_ngrams,
                         allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

## Modeling and Visualization

To perform the topic modeling, we apply LDA to extract topics. The following analysis makes use of the open source code from this guide: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/. We also visualize our findings, using the open source code from this guide: https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/

In [0]:
# Define functions for combining texts and creating a corpus and id2word from the texts
def combine_texts(*args):
    """Concatenate several collections of documents into one new list.

    Args:
        *args: Any number of iterables; their items are taken in order.

    Returns:
        A fresh list holding the items of every argument, left to right.
    """
    return [doc for text in args for doc in text]

def create_corpus_and_id2word(texts):
    """Build a gensim dictionary and bag-of-words corpus from ``texts``.

    The dictionary is pruned with ``filter_extremes(no_below=2,
    no_above=0.9)`` before the documents are converted, so pruned tokens do
    not appear in the corpus.

    Args:
        texts: Tokenized documents (list of token lists).

    Returns:
        A ``(corpus, id2word)`` tuple: the ``doc2bow`` result for every
        document, and the pruned ``Dictionary``.
    """
    id2word = corpora.Dictionary(texts)
    id2word.filter_extremes(no_below=2, no_above=0.9)
    corpus = list(map(id2word.doc2bow, texts))
    return corpus, id2word

### Modeling

We now can build an LDA model. To do this, we use the LDA model from MALLET, and iterate through different models to find a model that has a suitable coherence score.

In [0]:
# Download the MALLET 2.0.8 archive and unpack it next to the notebook
# (-o overwrites files left over from an earlier run without prompting).
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip -o mallet-2.0.8.zip
import os
# Point MALLET_HOME at the unpacked distribution.
# NOTE(review): this absolute path assumes the archive was unpacked in
# /content, while compute_coherence_values uses the relative path
# 'mallet-2.0.8/bin/mallet' -- confirm the working directory is /content.
os.environ["MALLET_HOME"] = "/content/mallet-2.0.8"


In [0]:
# Prepare a sqlite table to store our results.
# `conn` and `cur` are module-level and are reused by the functions defined in
# later cells (update_lda_models, compute_coherence_values, get_latest_rows,
# get_ldamodel_corpus_id2word).
conn = sqlite3.connect(PROJECT_PATH + "/data.db")
cur = conn.cursor()
# One row per trained model: `company` holds the '+'-joined company names,
# `corpus`, `id2word` and `model` hold pickled objects, and `created_at` is
# filled in by SQLite so the newest row per (company, num_topics) can be found.
create_lda_models_table_sql = """
                                CREATE TABLE IF NOT EXISTS lda_models (
                                  id INTEGER PRIMARY KEY,
                                  company TEXT NOT NULL,
                                  num_topics INTEGER NOT NULL,
                                  corpus BLOB,
                                  id2word BLOB,
                                  optimize_interval INTEGER DEFAULT 20,
                                  model BLOB,
                                  coherence REAL NOT NULL,
                                  created_at DATETIME DEFAULT CURRENT_TIMESTAMP
                                );
                              """
# IF NOT EXISTS makes re-running this cell harmless.
cur.execute(create_lda_models_table_sql)
conn.commit()

# Define a function for updating lda models in our sqlite table
def update_lda_models(company, num_topics, corpus, id2word, optimize_interval, model, coherence):
    """Insert one trained model as a new row of ``lda_models`` and commit.

    Despite the name, existing rows are never modified: every call adds a
    row, and ``created_at`` is left to the table's default.

    Args:
        company: Key stored in the ``company`` column.
        num_topics: Number of topics the model was trained with.
        corpus: Value for the ``corpus`` BLOB column.
        id2word: Value for the ``id2word`` BLOB column.
        optimize_interval: Value for the ``optimize_interval`` column.
        model: Value for the ``model`` BLOB column.
        coherence: Coherence score of the model.
    """
    insert_sql = """
                            INSERT INTO lda_models
                             (company, num_topics, corpus, id2word, optimize_interval, model, coherence) 
                            VALUES (:company,:num_topics,:corpus,:id2word,:optimize_interval,:model,:coherence);
                          """
    # Named parameters: the keys must match the :placeholders above.
    params = dict(
        company=company,
        num_topics=num_topics,
        corpus=corpus,
        id2word=id2word,
        optimize_interval=optimize_interval,
        model=model,
        coherence=coherence,
    )
    cur.execute(insert_sql, params)
    conn.commit()

In [0]:
# Build LDA model for companies
def compute_coherence_values(*companies, start=2, stop=12, step=2, force=False):
    """Train MALLET LDA models for a group of companies and store them.

    One model is trained for every ``num_topics`` in
    ``range(start, stop, step)``.  Each model is stored in ``lda_models``
    together with its 'c_v' coherence score, the corpus and the dictionary.
    The group is keyed by its sorted names joined with '+'.

    Args:
        *companies: Company names whose cleaned tweets are combined.
        start: First number of topics to try.
        stop: Exclusive upper bound for the number of topics.
        step: Increment between the numbers of topics.
        force: Train even if the table already has rows for this group.

    Returns:
        None.  Returns early, doing nothing, when the group is already
        present in the table and ``force`` is not set.
    """
    company_name = '+'.join(sorted(companies))
    trained = pd.read_sql_query("SELECT DISTINCT company FROM lda_models;", conn).company.tolist()
    if company_name in trained and not force:
        return

    # The message used to have a single placeholder, so the range arguments
    # passed to format() were silently dropped.
    print("Training for {} (num_topics in range({}, {}, {}))".format(
        company_name, start, stop, step))

    # Get text, corpus, and id2word for companies
    texts = combine_texts(*[clean_tweets(company) for company in companies])
    corpus, id2word = create_corpus_and_id2word(texts)

    # The same corpus and dictionary are stored with every model, so they are
    # pickled once instead of once per iteration.
    corpus_blob = sqlite3.Binary(pickle.dumps(corpus))
    id2word_blob = sqlite3.Binary(pickle.dumps(id2word))

    # Iterate through the range of num_topics and create mallet models
    mallet_path = 'mallet-2.0.8/bin/mallet'
    optimize_interval = 20
    for num_topics in range(start, stop, step):
        mallet_model = gensim.models.wrappers.LdaMallet(
            mallet_path,
            corpus=corpus,
            num_topics=num_topics,
            id2word=id2word,
            optimize_interval=optimize_interval)
        coherence = CoherenceModel(model=mallet_model,
                                   texts=texts,
                                   dictionary=id2word,
                                   coherence='c_v').get_coherence()
        # Add to lda_models db; record the interval that was actually used
        # rather than a separate literal.
        model_blob = sqlite3.Binary(pickle.dumps(mallet_model))
        update_lda_models(company_name, num_topics, corpus_blob, id2word_blob,
                          optimize_interval, model_blob, coherence)

In [0]:
# Create all the needed models
start = 2
stop = 20
step = 1
# Each non-blank line of models.txt is one comma-separated group of companies
# that is modeled together.  Names are stripped and empty entries dropped so
# that blank lines, trailing commas or spaces after commas do not produce
# bogus company names.
models = []
with open(PROJECT_PATH + "/models.txt", "r") as f:
    for line in f:
        group = [name.strip() for name in line.split(',') if name.strip()]
        if group:
            models.append(group)
for model in models:
    compute_coherence_values(*model, start=start, stop=stop, step=step)

In [0]:
# Show all distinct company keys currently in the table.  A key is the sorted
# company names joined with '+', as built in compute_coherence_values.
pd.read_sql_query("SELECT DISTINCT company FROM lda_models;", conn)

### Visualization

In [0]:
# Define a function to get latest models from sqlite table as a dataframe
def get_latest_rows(*companies):
    """Return the newest stored row per ``num_topics`` for a company group.

    Args:
        *companies: Company names; they are sorted and joined with '+' to
            form the key stored in the ``company`` column.

    Returns:
        A DataFrame with every column of ``lda_models``, holding for each
        ``num_topics`` the row(s) with the latest ``created_at``, ordered by
        ``num_topics`` ascending.
    """
    company_name = '+'.join(sorted(companies))
    # The subquery only needs the grouping keys and the newest timestamp;
    # the BLOB columns it used to select were never read by the outer query.
    query = """SELECT a.*
               FROM lda_models AS a
               INNER JOIN (
                  SELECT company, num_topics, MAX(created_at) AS max_created_at
                  FROM lda_models
                  WHERE company = :company
                  GROUP BY company, num_topics
               ) AS b
               ON a.company = b.company
               AND a.num_topics = b.num_topics
               AND a.created_at = b.max_created_at
               ORDER BY a.num_topics ASC
            """
    return pd.read_sql_query(query, conn, params={"company": company_name})

# Define a function to get a specific model for a company
def get_ldamodel_corpus_id2word(*companies, num_topics):
    """Load the newest stored model for a company group and topic count.

    Args:
        *companies: Company names; they are sorted and joined with '+' to
            form the key stored in the ``company`` column.
        num_topics: Number of topics of the wanted model.

    Returns:
        A ``(ldamodel, corpus, id2word)`` tuple: the stored MALLET model
        converted with ``malletmodel2ldamodel``, plus the unpickled corpus
        and dictionary stored alongside it.

    Raises:
        IndexError: If no model is stored for this group and topic count
            (the same exception type as before, now with a clear message).
    """
    company_name = '+'.join(sorted(companies))
    # Only the grouping keys and the newest timestamp are needed from the
    # subquery; the outer query fetches the full row.
    query = """SELECT a.*
               FROM lda_models AS a
               INNER JOIN (
                  SELECT company, num_topics, MAX(created_at) AS max_created_at
                  FROM lda_models
                  WHERE company = :company AND num_topics = :num_topics
                  GROUP BY company, num_topics
               ) AS b
               ON a.company = b.company
               AND a.num_topics = b.num_topics
               AND a.created_at = b.max_created_at
               ORDER BY a.num_topics ASC
            """
    rows = pd.read_sql_query(query, conn, params={"company": company_name,
                                                  "num_topics": num_topics})
    if rows.empty:
        raise IndexError(
            "No stored LDA model for company '{}' with num_topics={}".format(
                company_name, num_topics))

    latest = rows.iloc[0]
    # NOTE: pickle.loads executes arbitrary code from the blob; this is only
    # safe because data.db is written by this notebook itself.
    mallet_model = pickle.loads(latest['model'])
    corpus = pickle.loads(latest['corpus'])
    id2word = pickle.loads(latest['id2word'])
    return (gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_model),
            corpus,
            id2word)

In [0]:
def show_topic_contributions(ldamodel, corpus):
    """Summarize which share of the documents each topic dominates.

    For every document the topic with the highest weight is taken as its
    dominant topic (the first one wins on ties).  Documents for which the
    model returns no topics are ignored.

    The per-document DataFrame that used to be grown with
    ``DataFrame.append`` (quadratic, and removed in pandas 2.0) is replaced
    by a plain tally, and an empty corpus now yields an empty frame instead
    of failing.

    Args:
        ldamodel: Model supporting ``ldamodel[corpus]``, ``per_word_topics``
            and ``show_topic(topic_id)``.
        corpus: Bag-of-words corpus to score.

    Returns:
        A DataFrame indexed by ``Dominant_Topic`` (ascending) with the
        columns ``Fraction_of_Documents`` and ``Topic_Keywords`` (the topic's
        words joined with ", ").  Topics that dominate no document are
        absent.
    """
    # Tally the dominant topic of each document.
    counts = {}
    total_count = 0
    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first element
        # is the document's topic distribution.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            continue
        topic_num, _ = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        counts[topic_num] = counts.get(topic_num, 0) + 1
        total_count += 1

    topic_ids = sorted(counts)
    grouped_df = pd.DataFrame(
        {
            'Fraction_of_Documents': [counts[t] / total_count for t in topic_ids],
            'Topic_Keywords': [
                ", ".join([word for word, _ in ldamodel.show_topic(t)])
                for t in topic_ids
            ],
        },
        index=pd.Index(topic_ids, dtype='int64', name='Dominant_Topic'),
    )
    return grouped_df

In [0]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
import os

# Define a function to generate a word cloud
def generate_word_cloud(ldamodel, *companies, num_topics):
    """Show a word cloud per topic and save it under the company directory.

    Images are written to ``PROJECT_PATH/<company key>/`` as
    ``topic_<id>_word_cloud.png``; an existing file is never overwritten.

    Args:
        ldamodel: Model whose ``show_topics`` is queried.
        *companies: Company names; sorted and joined with '+' to form the
            output directory name.
        num_topics: Number of topics requested from ``show_topics``.
    """
    company_name = '+'.join(sorted(companies))
    topics = ldamodel.show_topics(num_topics=num_topics, formatted=False)
    directory = PROJECT_PATH + '/{}'.format(company_name)
    os.makedirs(directory, exist_ok=True)
    # Each entry is (topic_id, [(word, weight), ...]).  The id returned by the
    # model is used for the title and file name rather than the list position,
    # which matches how generate_word_counts labels topics.
    # NOTE(review): the two differ only if show_topics returns a subset of the
    # topics -- confirm against the gensim documentation.
    for topic_id, word_weights in topics:
        plt.figure()
        cloud = WordCloud(prefer_horizontal=1.0,
                          background_color="white",
                          relative_scaling=0.5,
                          colormap="Dark2",
                          max_words=10)
        cloud = cloud.generate_from_frequencies(dict(word_weights))
        axes = plt.gca()
        axes.imshow(cloud)
        axes.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))
        axes.axis('off')
        plt.tight_layout()
        plt.show()
        # Save to file
        save_file = directory + '/topic_{}_word_cloud.png'.format(topic_id)
        if not os.path.isfile(save_file):
            cloud.to_file(save_file)

In [0]:
from collections import Counter

# Define a function generate word counts/word weights within a topic
def generate_word_counts(ldamodel, *companies, num_topics):
    """Tabulate each topic's words with their weight and corpus frequency.

    The companies' tweets are cleaned again with ``clean_tweets`` to count
    how often every word occurs across all documents.

    Args:
        ldamodel: Model whose ``show_topics`` is queried.
        *companies: Company names whose tweets are counted.
        num_topics: Number of topics requested from ``show_topics``.

    Returns:
        A DataFrame with the columns ``word``, ``topic_id``, ``importance``
        (the word's weight in the topic) and ``word_count`` (occurrences in
        the cleaned tweets), one row per (topic, word) pair.
    """
    topics = ldamodel.show_topics(num_topics=num_topics, formatted=False)
    texts = combine_texts(*[clean_tweets(company) for company in companies])
    counter = Counter(w for w_list in texts for w in w_list)

    out = [[word, topic_id, weight, counter[word]]
           for topic_id, topic in topics
           for word, weight in topic]
    return pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Define a function to generate a bar graph from word counts
def generate_word_count_bar_graph(df, *companies, num_topics):
    """Plot word count against topic weight for the words of each topic.

    For every topic id in ``range(num_topics)`` a figure is drawn with the
    word counts (wide, translucent bars, left axis) and the topic weights
    (narrow bars, right axis).  Figures are saved to
    ``PROJECT_PATH/<company key>/word_distributions/`` unless the file already
    exists.  Topic ids without rows in ``df`` are skipped, since their axis
    limits would be NaN.

    Args:
        df: DataFrame with the columns ``word``, ``topic_id``, ``importance``
            and ``word_count`` (as built by ``generate_word_counts``).
        *companies: Company names; sorted and joined with '+' to form the
            output directory name.
        num_topics: Number of topic ids to plot, starting at 0.
    """
    company_name = '+'.join(sorted(companies))
    directory = PROJECT_PATH + '/{}/word_distributions'.format(company_name)
    os.makedirs(directory, exist_ok=True)
    cols = list(mcolors.TABLEAU_COLORS.values())
    for i in range(num_topics):  # TODO len topics
        topic_df = df.loc[df.topic_id == i, :]
        if topic_df.empty:
            continue
        color = cols[i % len(cols)]  # cycle through the palette

        plt.figure()
        ax = plt.gca()
        ax.bar(x='word', height="word_count", data=topic_df, color=color,
               width=0.5, alpha=0.3, label='Word Count')
        ax_twin = ax.twinx()
        ax_twin.bar(x='word', height="importance", data=topic_df, color=color,
                    width=0.2, label='Weights')
        ax.set_ylabel('Word Count', color=color)

        # Set scaled ylims: 20% headroom above the tallest bar on each axis
        ax_twin.set_ylim(0, 1.2 * topic_df.importance.max())
        ax.set_ylim(0, 1.2 * topic_df.word_count.max())

        ax.set_title('Topic: ' + str(i), color=color, fontsize=16)
        ax.tick_params(axis='y', left=False)
        ax.set_xticklabels(topic_df['word'], rotation=30, horizontalalignment='right')
        ax.legend(loc='upper left')
        ax_twin.legend(loc='upper right')
        # Save to file (before plt.show(), which may close the figure)
        save_file = directory + '/topic_{}_word_distribution.png'.format(i)
        if not os.path.isfile(save_file):
            plt.savefig(save_file, bbox_inches='tight')
        plt.show()

In [0]:
pyLDAvis.enable_notebook()

# Define a function for running pyLDAvis and saving the internal svg
def run_pyLDAvis(*companies, num_topics):
    """Build the pyLDAvis visualization for a stored model and save it.

    The model, corpus and dictionary are loaded with
    ``get_ldamodel_corpus_id2word``.  The HTML export is written to
    ``PROJECT_PATH/<company key>/pyLDAvis/<company key>.html`` unless that
    file already exists.

    Args:
        *companies: Company names; sorted and joined with '+' to form the
            directory and file name.
        num_topics: Number of topics of the stored model to visualize.

    Returns:
        The prepared pyLDAvis object (so the notebook can display it).
    """
    company_name = '+'.join(sorted(companies))
    directory = PROJECT_PATH + '/{}/pyLDAvis'.format(company_name)
    os.makedirs(directory, exist_ok=True)

    ldamodel, corpus, id2word = get_ldamodel_corpus_id2word(
        *companies, num_topics=num_topics)
    # sort_topics=False is passed so pyLDAvis does not reorder the topics.
    vis = pyLDAvis.gensim.prepare(ldamodel, corpus, id2word, sort_topics=False)
    # Save to file
    save_file = directory + '/{}.html'.format(company_name)
    if not os.path.isfile(save_file):
        pyLDAvis.save_html(vis, save_file)

    # TODO: Save svg inside to a separate pdf file
    return vis