In [None]:
import glob
import pandas as pd
import os
import matplotlib.pyplot as plt
import datetime
import re
from textblob import TextBlob


# Topic Clustering

In [None]:
def read_df_timestamps(csv_path):
    """Load a tweet CSV and parse its ``Timestamp`` column into datetimes.

    Rows whose ``Timestamp`` value does not end in ``"UTC"`` (missing values
    included) are dropped before parsing.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file. It must contain a ``Timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        The rows with a valid timestamp, with ``Timestamp`` parsed using the
        format ``'%Y-%m-%d %H:%M:%S UTC'``. Original index labels are kept.
    """
    df = pd.read_csv(csv_path)
    # Cast to str first: if the column was not read as text (e.g. it is
    # entirely empty and parsed as float), the .str accessor would raise.
    valid_ts = df['Timestamp'].astype(str).str.endswith("UTC")
    # Work on an explicit copy so the assignment below never targets a view
    # of the frame returned by read_csv.
    df = df.loc[valid_ts].copy()
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S UTC')
    return df

def remove_url(txt):
    """Strip URLs and non-alphanumeric characters from a text string.

    Anything of the form ``scheme://...`` is removed, as is every character
    that is not an ASCII letter, a digit, a space or a tab. Runs of
    whitespace are then collapsed to single spaces and the ends trimmed.

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove urls from.
        Non-string values (e.g. ``None`` or a pandas NaN) are treated as
        empty text.

    Returns
    -------
    string
        The cleaned text; ``""`` when ``txt`` is not a string.
    """
    # Missing tweet text reaches this function as a float NaN when called via
    # Series.apply; re.sub would raise TypeError on it.
    if not isinstance(txt, str):
        return ""

    # Raw string: the pattern contains \w, \/ and \S, which are invalid
    # escape sequences in a normal string literal.
    stripped = re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt)
    return " ".join(stripped.split())

def remove_special_characters(df):
    """Remove retweet markers and ``#``/``@`` symbols from ``clean_text``.

    The ``clean_text`` column of ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``clean_text``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``clean_text`` updated.
    """
    # Remove standalone "RT" tokens (any case). regex=True is required:
    # without it, recent pandas treats the pattern as a literal string and
    # nothing is replaced.
    df['clean_text'] = df['clean_text'].str.replace(r'\b[Rr][Tt]\b', '', regex=True)
    # Remove the hashtag / mention symbols themselves; the tag text is kept.
    df['clean_text'] = df['clean_text'].str.replace(r'(#|@)', '', regex=True)
    return df


def clean_tweet_text(df, lower_case=True):
    """Derive a ``clean_text`` column from the ``Text`` column of ``df``.

    The text is optionally lowercased, passed through ``remove_url`` and
    then through ``remove_special_characters``. The new column is written
    onto ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Text`` column.
    lower_case : bool, default True
        Lowercase the text before cleaning.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``remove_special_characters``.
    """
    raw_text = df['Text'].str.lower() if lower_case else df['Text']
    df['clean_text'] = raw_text.apply(remove_url)
    return remove_special_characters(df)

In [None]:
# Read the tweet CSV, parse its timestamps and add the cleaned text column.
test_df = clean_tweet_text(
    read_df_timestamps(
        '/home/tweets/megafires/clean_csvs/2013-yarnell-hill-az.csv'
    )
)

# One cleaned tweet string per document.
documents = list(
    test_df['clean_text'].values
)

# Gensim prep
From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Corpora_and_Vector_Spaces.ipynb

In [None]:
import logging

# Show INFO-level log records with a timestamp and level prefix.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
import os
import tempfile

# Directory that holds the saved dictionary and corpus files.
TEMP_FOLDER = tempfile.gettempdir()
print(
    'Folder "{}" will be used to save temporary dictionary and corpus.'.format(
        TEMP_FOLDER
    )
)

In [None]:
from gensim import corpora

In [None]:
from collections import defaultdict

# Split each document on whitespace, discarding a few very common words.
stoplist = set('for a of the and to in'.split())
texts = [
    [w for w in doc.lower().split() if w not in stoplist]
    for doc in documents
]

# Count how many times each token occurs across the whole collection.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only the tokens that occur more than once overall.
texts = [
    [tok for tok in doc_tokens if frequency[tok] > 1]
    for doc_tokens in texts
]


In [None]:
# Build the gensim dictionary from the filtered token lists.
dictionary = corpora.Dictionary(texts)

# Store the dictionary on disk for future reference.
dictionary.save(
    os.path.join(TEMP_FOLDER, 'test.dict')
)
print(dictionary)

In [None]:
# Convert every token list to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

# Write the corpus to disk in Matrix Market format for later use.
corpora.MmCorpus.serialize(
    os.path.join(TEMP_FOLDER, 'test.mm'),
    corpus,
)

# Analysis

In [None]:
# The dictionary built earlier is reused as the id -> word mapping.
id2word = dictionary

# Open the corpus that was serialized to disk.
mm = corpora.MmCorpus(
    os.path.join(TEMP_FOLDER, 'test.mm')
)
print(mm)

In [None]:
import gensim

# Fit a 10-topic LDA model on the serialized corpus.
# random_state is fixed so that the topics are the same on every
# Restart & Run All; without it each run gives a different model.
lda = gensim.models.ldamodel.LdaModel(
    corpus=mm,
    id2word=id2word,
    num_topics=10,
    update_every=1,
    chunksize=10000,
    passes=5,
    random_state=42,
)

In [None]:
# Display the 10 fitted topics.
lda.show_topics(num_topics=10)