In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

unable to import 'smart_open.gcs', disabling that module


In [2]:
# NLTK Stop words
# English stop-word list from NLTK; it is consulted when stop words are
# filtered out of the tokenised narratives.
# (Needs the NLTK 'stopwords' corpus to be available locally.)
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Optional corpus-specific additions, currently disabled:
#stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [3]:
# Import Dataset
# Earlier data source (public newsgroups JSON), left disabled:
#df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
#print(df.target_names.unique())
#df.head()

# quoting = 3 is csv.QUOTE_NONE: quote characters are read as ordinary text
# instead of field delimiters.
# Rows with missing values are dropped and the index is renumbered from 0.
df = pd.read_csv('./generated/maligan.csv', quoting = 3).dropna().reset_index(drop=True)

In [4]:
# Inspect the column names of the loaded frame.
df.columns

Index(['NARRATIVE'], dtype='object')

In [5]:
# Abbreviation / typo -> expansion table for the narratives.
# Every key and value is wrapped in single spaces so that only whole,
# space-delimited tokens are matched (' vic ' cannot match inside 'victim').
# The entries are applied in insertion order by the replacement loops below.
word_replace_dic = {
        ' vic ':' victim ',
        ' vict ':' victim ',
        ' sus ': ' suspect ',
        ' s1 ': ' suspect ',
        ' v1 ': ' victim ',
        ' loc ': ' location ',
        ' veh ':' vehicle ',
        ' prop ': ' property ',
        ' unk ': ' unknown ',
        ' v ' : ' victim ',
        ' s ': ' suspect ',
        ' ss ': ' suspect ',
        ' susp ': ' suspect ',
        ' remvd ': ' removed ',
        ' victs ': ' victims ',
        ' susps ': ' suspects ',
        ' stillinside ': ' still inside ',
        ' donttell ': ' dont tell ',
        ' veerbal ': ' verbal ',
        ' thransaction ': ' transaction ',
        ' usedcredit ': ' used credit ',
        ' beerbottle ' : ' beer bottle ',
        ' neg ': ' negative ',
        ' cointinuously ': ' continuously ',
        ' lemmon ': ' lemon ',
        ' att ': ' attack '
            }
# Lower-case each narrative and pad it with one space on both sides, so that
# tokens at the very start or end can also match the space-wrapped keys above.
df['content'] = " " + df['NARRATIVE'].str.lower()+ " "
# Disabled alternative (one-shot replace on a 'cleaned' column):
#df['content'] = df.cleaned.replace(word_replace_dic)#, regex=True)

In [6]:

# Expand every abbreviation in a single pass over each narrative.
#
# The table's keys and values are space-wrapped; here they are stripped and the
# bare token is matched with look-arounds, so the surrounding spaces are still
# required but are not consumed by the match. That lets back-to-back repeats
# such as ' s s ' both be rewritten in one pass, and removes the cap of 10
# replacements per key that the positional `n` argument used to impose.
abbrev_expansions = {key.strip(): value.strip() for key, value in word_replace_dic.items()}
if abbrev_expansions:
    abbrev_pattern = re.compile(
        r'(?<= )(?:' + '|'.join(re.escape(token) for token in abbrev_expansions) + r')(?= )'
    )
    # A callable replacement looks each matched token up in the table.
    df['content'] = df['content'].str.replace(
        abbrev_pattern, lambda match: abbrev_expansions[match.group(0)], regex=True
    )


In [7]:
# Spot check: the untouched NARRATIVE text of one sample row.
df['NARRATIVE'][2673]

'elm speed kitty hunters '

In [8]:
# Spot check: the same row in the lower-cased, space-padded 'content' column.
df['content'][2673]

' elm speed kitty hunters  '

In [9]:

# Applies the space-delimited replacements to 'content' once more.
# Each key consumes the space on both sides of a match, so when the same
# abbreviation occurs twice in a row (e.g. ' s s ') a single str.replace pass
# cannot rewrite both occurrences; presumably this repeat is here to pick up
# the ones left over.
# NOTE(review): the positional 10 is str.replace's `n` argument, i.e. at most
# 10 replacements per narrative for each key in this pass.
for key, value in word_replace_dic.items():
    df['content'] = df['content'].str.replace(key, value, 10)

In [10]:
# Sanity check: list any narratives that still contain the raw ' vict ' token.
still_abbreviated = df['content'].str.contains(" vict ")
df.loc[still_abbreviated, 'content']

Series([], Name: content, dtype: object)

In [11]:
# Convert to list
data = df.content.values.tolist()

# The patterns below are raw strings: '\S' and '\s' are not valid escapes in an
# ordinary string literal and trigger an invalid-escape warning there.

# Remove Emails (any whitespace-free run containing '@', plus one trailing space)
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters (every whitespace run collapses to a single space)
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub(r"'", "", sent) for sent in data]

print(data[:1])

[' elm gas tank cell charger from suspect suspect neglected of conclusion for items ']


### Tokenize words and Clean-up text

In [12]:

# deacc=True strips accent marks from the tokens; punctuation is already discarded by simple_preprocess's tokenizer.

def sent_to_words(sentences):
    """Lazily tokenise each sentence with gensim's ``simple_preprocess``.

    Every item of ``sentences`` is converted with ``str()`` first. Yields one
    list of tokens per sentence, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    # deacc=True removes accent marks from the tokens
    yield from (tokenize(str(sentence), deacc=True) for sentence in sentences)

# Materialise the generator: one token list per cleaned narrative.
data_words = list(sent_to_words(data))

# Show the tokens of the first narrative.
print(data_words[:1])

[['elm', 'gas', 'tank', 'cell', 'charger', 'from', 'suspect', 'suspect', 'neglected', 'of', 'conclusion', 'for', 'items']]


In [13]:
# Build the bigram and trigram models
# The trigram model is trained on the bigram-transformed corpus, so it can join
# an already-merged bigram with a neighbouring token.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
# (Phraser wraps the trained Phrases models; these are what the make_bigrams /
# make_trigrams helpers apply.)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['elm', 'gas_tank', 'cell', 'charger', 'from', 'suspect', 'suspect', 'neglected', 'of', 'conclusion', 'for', 'items']


In [14]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document.

    Each document is converted with ``str()`` and tokenised by
    ``simple_preprocess``; tokens present in the module-level ``stop_words``
    are then filtered out.

    Parameters
    ----------
    texts : iterable
        Documents to filter (token lists or strings).

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # Build the lookup set once: membership is O(1) per token instead of a
    # scan over the whole stop-word list.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the trained bigram phraser to each tokenised document in ``texts``."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to each document."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenised documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``. Tag names are documented at
    https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
        Tokenised documents.
    allowed_postags : iterable of str
        Coarse POS tags (``token.pos_``) whose tokens are kept.

    Returns
    -------
    list of list of str
        For every input document, the lemmas of the tokens whose POS tag is
        in ``allowed_postags``.
    """
    allowed = set(allowed_postags)
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the texts through the pipeline in batches and yields the
    # parsed docs in input order, avoiding one separate nlp() call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_docs)
    ]

In [15]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize the spaCy English model, keeping only the tagger component (for efficiency)
# Install with: python3 -m spacy download en_core_web_sm
# The bare 'en' name is only a spaCy v2 shortcut link, so the full package name
# is tried first and 'en' is kept as a fallback for older setups.
try:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
except OSError:  # spaCy raises OSError when the requested model cannot be found
    nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['cell', 'charger', 'suspect', 'suspect', 'neglect', 'conclusion', 'item']]


### Create the Dictionary and Corpus needed for Topic Modeling

In [16]:
#The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus. Let’s create them.

# Vocabulary: maps every distinct lemma to an integer id
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents serve as the corpus texts
texts = data_lemmatized

# Bag-of-words encoding: one list of (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first encoded document
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)]]


In [17]:
# Look up the token stored under id 0 in the dictionary.
id2word[0]

'cell'

In [18]:
# Human readable format of corpus (term-frequency): swap each token id for its word
[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

[[('cell', 1),
  ('charger', 1),
  ('conclusion', 1),
  ('item', 1),
  ('neglect', 1),
  ('suspect', 2)]]

In [19]:
# Build LDA model
# 12 topics are trained on the bag-of-words corpus with 10 passes over it.
# random_state is fixed so repeated runs start from the same seed.
# per_word_topics=True makes lda_model[...] return word-level topic details in
# addition to the document-topic list (see the gensim LdaModel docs).
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=12, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [20]:
# Print the Keyword in the topics
pprint(lda_model.print_topics())
# Topic assignments for every document of the corpus
# (not referenced again in the cells visible here).
doc_lda = lda_model[corpus]

[(0,
  '0.112*"anzac" + 0.074*"area" + 0.053*"register" + 0.053*"smash" + '
  '0.036*"yell" + 0.032*"building" + 0.032*"run" + 0.029*"miss" + '
  '0.028*"service" + 0.025*"damage"'),
 (1,
  '0.292*"victim" + 0.275*"suspect" + 0.031*"leave" + 0.020*"state" + '
  '0.017*"punch" + 0.016*"approach" + 0.014*"hit" + 0.012*"car" + '
  '0.012*"return" + 0.011*"time"'),
 (2,
  '0.141*"location" + 0.138*"vehicle" + 0.112*"open" + 0.074*"front" + '
  '0.062*"rear" + 0.033*"cut" + 0.025*"acct" + 0.019*"come" + 0.018*"pick" + '
  '0.018*"unlock"'),
 (3,
  '0.063*"room" + 0.053*"phone" + 0.043*"purchase" + 0.043*"attempt" + '
  '0.034*"several" + 0.032*"push" + 0.032*"grab" + 0.029*"lock" + 0.023*"ball" '
  '+ 0.022*"multiple"'),
 (4,
  '0.174*"soccer" + 0.099*"check" + 0.074*"wit" + 0.058*"person" + '
  '0.035*"discover" + 0.029*"closet" + 0.025*"rent" + 0.017*"tv" + '
  '0.016*"proceeding" + 0.016*"night"'),
 (5,
  '0.112*"flee" + 0.097*"property" + 0.093*"elm" + 0.077*"door" + '
  '0.070*"unknown

In [21]:
# Compute Perplexity
# NOTE: log_perplexity() returns gensim's per-word likelihood bound (a log-scale
# value), not the perplexity itself; per the gensim docs the perplexity is
# 2 ** (-bound).
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # per-word likelihood bound: a higher (less negative) value means a better fit, i.e. a lower perplexity.

# Compute Coherence Score
# c_v coherence is computed from the lemmatized texts and the dictionary.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.036059394187875

Coherence Score:  0.39582160249426307


In [22]:
# Visualize the topics
# enable_notebook() lets the prepared visualisation render inline; the bare
# `vis` on the last line is what displays it.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [23]:

# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
import os

# Location of the MALLET launcher script. Set the MALLET_PATH environment
# variable to point at another installation; otherwise the original
# hard-coded Windows location is used.
mallet_path = os.environ.get('MALLET_PATH', "F:\\mallet-2.0.8\\bin\\mallet")

# Train a 12-topic model through gensim's MALLET wrapper on the same corpus and dictionary.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=12, id2word=id2word)

In [24]:
# Show Topics
# formatted=False returns (word, weight) tuples instead of formatted strings.
pprint(ldamallet.show_topics(num_topics=12, formatted=False))

[(0,
  [('suspect', 0.18433704453441296),
   ('leave', 0.11576417004048584),
   ('item', 0.06427125506072874),
   ('store', 0.04200404858299595),
   ('location', 0.03909412955465587),
   ('attempt', 0.03833502024291498),
   ('bag', 0.028972672064777327),
   ('place', 0.02618927125506073),
   ('register', 0.02543016194331984),
   ('walk', 0.023785425101214574)]),
 (1,
  [('elm', 0.15793538421933326),
   ('suspect', 0.07388338267473292),
   ('room', 0.03912987514480628),
   ('school', 0.0382288582829193),
   ('date', 0.03462479083537135),
   ('area', 0.03269404041704209),
   ('top', 0.03192174024971039),
   ('complex', 0.0214956879907324),
   ('residence', 0.016604453597631613),
   ('apartment', 0.015703436735744626)]),
 (2,
  [('suspect', 0.1934060228452752),
   ('victim', 0.1043613707165109),
   ('dispute', 0.04036863966770509),
   ('strike', 0.03050363447559709),
   ('business', 0.02699896157840083),
   ('violation', 0.0264797507788162),
   ('verbal', 0.02518172377985462),
   ('elm', 

In [25]:
# Compute Coherence Score
# Same c_v coherence setup as for the gensim LDA model, applied to the MALLET model.
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
# Compute Perplexity
# (left disabled for the MALLET wrapper model)
#print('\nPerplexity: ', ldamallet.log_perplexity(corpus))  # a measure of how good the model is. lower the better.



Coherence Score:  0.37153656728004486


In [26]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LdaMallet model is trained for every topic count in
    ``range(start, limit, step)``, using the module-level ``mallet_path``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the sweep)
    start : First number of topics to try
    step : Increment between successive topic counts

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train with the dictionary that was passed in rather than the
        # notebook-level id2word, so the function honours its own arguments.
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
# Trains one MALLET model per topic count in range(2, 40, 6), i.e. 2, 8, 14, ..., 38.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=6)

In [None]:
# Show graph
# These three values must match the arguments used for the coherence sweep so
# that the x axis lines up with coherence_values.
limit = 40
start = 2
step = 6
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The trailing comma makes this a one-element tuple; a bare parenthesised string
# is iterated character by character and would label the line just "c".
plt.legend(("coherence_values",), loc='best')
plt.show()



In [None]:
# Report the coherence obtained for each topic count in the sweep
for topic_count, score in zip(x, coherence_values):
    print("Num Topics =", topic_count, " has Coherence Value of", round(score, 4))

If the coherence score seems to keep increasing, it may make better sense to pick the model that gave the highest CV before flattening out. This is exactly the case here.

So, for the following steps, I will use the model with 12 topics.


In [None]:
# Select the model and print the topics
# NOTE(review): the sweep was run with start=2, step=6, so model_list[2] is the
# model trained with 14 topics (2, 8, 14, ...) - confirm this is the intended pick.
optimal_model = model_list[2]
# Top words per topic as (word, weight) tuples rather than formatted strings.
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
# Convert the MALLET wrapper model (ldamallet) into a gensim LdaModel so that
# it can be handed to pyLDAvis.
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, id2word)
vis

In [None]:
# Compute Perplexity
# NOTE: log_perplexity() returns gensim's per-word likelihood bound (a log-scale
# value), not the perplexity itself; per the gensim docs the perplexity is
# 2 ** (-bound).
print('\nPerplexity: ', model.log_perplexity(corpus))  # per-word likelihood bound: a higher (less negative) value means a better fit, i.e. a lower perplexity.


## Finding the dominant topic in each sentence

One of the practical applications of topic modeling is to determine which topic a given document is about.

To find that, we find the topic number that has the highest percentage contribution in that document.

The format_topics_sentences() function below nicely aggregates this information in a presentable table.

In [None]:
def format_two_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    """Tabulate the two highest-weighted topics of every document.

    Parameters
    ----------
    ldamodel : topic model
        Must support ``ldamodel[corpus]`` and ``ldamodel.show_topic(topic_id)``.
    corpus : bag-of-words corpus
        Documents to score.
    texts : sequence
        Original texts; appended as the last column, aligned by position.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``Dominant_Topic0``,
        ``Perc_Contribution0``, ``Topic_Keywords0`` (strongest topic) and
        ``Dominant_Topic1``, ``Perc_Contribution1``, ``Topic_Keywords1``
        (second strongest), followed by the unnamed text column. Slots for
        which a document has no topic are left empty (None).
    """
    columns = ['Dominant_Topic0', 'Perc_Contribution0', 'Topic_Keywords0',
               'Dominant_Topic1', 'Perc_Contribution1', 'Topic_Keywords1']

    # A topic's keyword string is the same for every document, so build it once.
    keyword_cache = {}

    def keywords_for(topic_num):
        if topic_num not in keyword_cache:
            keyword_cache[topic_num] = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        return keyword_cache[topic_num]

    # A gensim LdaModel built with per_word_topics=True yields a tuple per
    # document whose first element is the (topic, weight) list; models without
    # that attribute yield the list directly.
    has_word_topics = getattr(ldamodel, 'per_word_topics', False)

    records = []
    for row in ldamodel[corpus]:
        topics = row[0] if has_word_topics else row
        # Strongest two topics, highest weight first (stable for ties).
        ranked = sorted(topics, key=lambda x: x[1], reverse=True)[:2]
        record = []
        for topic_num, prop_topic in ranked:
            record.extend([int(topic_num), round(prop_topic, 4), keywords_for(topic_num)])
        # Pad so every document yields exactly six values; this keeps the rows
        # aligned with `texts` and never reuses values from a previous document.
        record.extend([None] * (len(columns) - len(record)))
        records.append(record)

    # Build the frame once instead of appending row by row (quadratic, and
    # DataFrame.append is no longer available in pandas 2).
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Variant using the model picked from the coherence sweep, left disabled:
#df_two_topic_sents_keywords = format_two_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)
# Active call: scores the documents with the ldamallet model.
df_two_topic_sents_keywords = format_two_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=data)

# Format
# reset_index() turns the row index into the first column, named Document_No below.
df_two_dominant_topic = df_two_topic_sents_keywords.reset_index()
df_two_dominant_topic.columns = ['Document_No', 'Dominant_Topic0', 'Topic_Perc_Contrib0', 'Keywords0', 'Dominant_Topic1', 'Topic_Perc_Contrib1', 'Keywords1', 'Text']

# Show
df_two_dominant_topic.head(10)

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    """Tabulate the dominant topic of every document.

    Parameters
    ----------
    ldamodel : topic model
        Must support ``ldamodel[corpus]`` and ``ldamodel.show_topic(topic_id)``.
    corpus : bag-of-words corpus
        Documents to score.
    texts : sequence
        Original texts; appended as the last column, aligned by position.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``Dominant_Topic``,
        ``Perc_Contribution`` and ``Topic_Keywords``, followed by the unnamed
        text column. A document with no topics gets empty (None) values.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # A topic's keyword string is the same for every document, so build it once.
    keyword_cache = {}

    # A gensim LdaModel built with per_word_topics=True yields a tuple per
    # document whose first element is the (topic, weight) list; models without
    # that attribute yield the list directly.
    has_word_topics = getattr(ldamodel, 'per_word_topics', False)

    records = []
    for row in ldamodel[corpus]:
        topics = row[0] if has_word_topics else row
        if not topics:
            # Keep a placeholder so the rows stay aligned with `texts`.
            records.append([None, None, None])
            continue
        # First topic with the highest weight => dominant topic
        topic_num, prop_topic = max(topics, key=lambda x: x[1])
        if topic_num not in keyword_cache:
            keyword_cache[topic_num] = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append([int(topic_num), round(prop_topic, 4), keyword_cache[topic_num]])

    # Build the frame once instead of appending row by row (quadratic, and
    # DataFrame.append is no longer available in pandas 2).
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Variant using the model picked from the coherence sweep, left disabled:
#df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)
# Active call: scores the documents with the ldamallet model.
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=data)

# Format
# reset_index() turns the row index into the first column, named Document_No below.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

## Find the most representative document for each topic

Sometimes just the topic keywords may not be enough to make sense of what a topic is about. So, to help with understanding the topic, you can find the documents a given topic has contributed to the most and infer the topic by reading that document. 

In [None]:
# Most representative document per topic: within every Dominant_Topic group,
# keep the single row with the highest Perc_Contribution.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# Collect the per-topic winners first and concatenate once, rather than
# re-concatenating a growing frame on every iteration.
top_rows = [
    grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
    for _, grp in sent_topics_outdf_grpd
]

# Stack the winners and renumber the rows from 0
sent_topics_sorteddf_mallet = pd.concat(top_rows, axis=0).reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

In [None]:
# Most representative document per topic for the two-topic table: within every
# Dominant_Topic0 group, keep the single row with the highest Perc_Contribution0.
sent_two_topics_outdf_grpd = df_two_topic_sents_keywords.groupby('Dominant_Topic0')

# Collect the per-topic winners first and concatenate once, rather than
# re-concatenating a growing frame on every iteration.
top_two_topic_rows = [
    grp.sort_values(['Perc_Contribution0'], ascending=False).head(1)
    for _, grp in sent_two_topics_outdf_grpd
]

# Stack the winners and renumber the rows from 0
sent_two_topics_sorteddf_mallet = pd.concat(top_two_topic_rows, axis=0).reset_index(drop=True)

# Format
# Column names are left as produced by format_two_topics_sentences; the rename
# below is disabled.
#sent_two_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_two_topics_sorteddf_mallet.head()

## Topic distribution across documents

Finally, we want to understand the volume and distribution of topics in order to judge how widely each one was discussed. The table below exposes that information.

In [None]:
# Number of Documents for Each Topic (indexed by topic id)
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic (indexed by topic id)
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords: one entry per topic, indexed by topic id.
# The per-document table repeats the same keywords for every document of a
# topic, so it is reduced to one row per topic before being combined with the
# counts; otherwise the column-wise concat would align document numbers with
# topic ids and pair each count with an unrelated row.
topic_num_keywords = (
    df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
    .dropna(subset=['Dominant_Topic'])
    .drop_duplicates(subset='Dominant_Topic')
    .set_index('Dominant_Topic')['Topic_Keywords']
)

# Concatenate Column wise (all three pieces now share the topic-id index)
df_dominant_topics = (
    pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
    .sort_index()
    .reset_index()
)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

In [None]:
# Show
# First 12 rows of the topic-distribution table.
df_dominant_topics.head(12)

In [None]:
# Write the per-document dominant-topic table (df_dominant_topic, not the
# similarly named df_dominant_topics summary) to a CSV in the working directory.
# NOTE(review): the data was read from './generated/maligan.csv' but the output
# file name says 'leakgan' - confirm the intended file name.
df_dominant_topic.to_csv('df_dominant_topics_leakgan.csv')