In [228]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [229]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
#stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [230]:
# Import Dataset
df = pd.read_csv('./la_text_data.csv', quoting = 3).dropna().reset_index(drop=True)

In [231]:
word_replace_dic = {
        ' vic ':' victim ',
        ' vict ':' victim ',
        ' sus ': ' suspect ',
        ' s1 ': ' suspect ',
        ' v1 ': ' victim ',
        ' loc ': ' location ',
        ' veh ':' vehicle ',
        ' prop ': ' property ',
        ' unk ': ' unknown ',
        ' v ' : ' victim ',
        ' s ': ' suspect ',
        ' ss ': ' suspect ',
        ' susp ': ' suspect ',
        ' remvd ': ' removed ',
        ' victs ': ' victims ',
        ' susps ': ' suspects ',
        ' stillinside ': ' still inside ',
        ' donttell ': ' dont tell ',
        ' veerbal ': ' verbal ',
        ' thransaction ': ' transaction ',
        ' usedcredit ': ' used credit ',
        ' beerbottle ' : ' beer bottle ',
        ' neg ': ' negative ',
        ' cointinuously ': ' continuously ',
        ' lemmon ': ' lemon ',
        ' att ': ' attack '
            }
df['content'] = " " + df['NARRATIVE'].str.lower()+ " "

In [232]:
for key, value in word_replace_dic.items():
    df['content'] = df['content'].str.replace(key, value, 10)

In [11]:
df['NARRATIVE'][2673]

'SUSPS REPEATEDLY CALL VICT VICT AHS ASKED SUSPS TO STOP CALLING HIM WITH NEG RESULTS SUSPS PHONE CALLS ARE ANNOYING TO VICT'

In [12]:
df['content'][2673]

' suspects repeatedly call victim victim ahs asked suspects to stop calling him with negative results suspects phone calls are annoying to victim '

In [233]:
for key, value in word_replace_dic.items():
    df['content'] = df['content'].str.replace(key, value, 10)

In [234]:
df[df['content'].str.contains(" vict ")]['content']

Series([], Name: content, dtype: object)

In [235]:
# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

print(data[:1])

[' suspect continiously calls and text victim harrassing messages victim has repeatedly advised suspect to stop making contact ']


### Tokenize words and Clean-up text

In [236]:
#I have set deacc=True to remove the punctuations.
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['suspect', 'continiously', 'calls', 'and', 'text', 'victim', 'harrassing', 'messages', 'victim', 'has', 'repeatedly', 'advised', 'suspect', 'to', 'stop', 'making', 'contact']]


In [237]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['suspect', 'continiously', 'calls', 'and', 'text', 'victim', 'harrassing', 'messages', 'victim', 'has', 'repeatedly', 'advised', 'suspect', 'to', 'stop', 'making', 'contact']


In [238]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [239]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['suspect', 'continiously', 'call', 'text', 'victim', 'harrasse', 'message', 'victim', 'repeatedly', 'advise', 'suspect', 'stop', 'make', 'contact']]


### Create the Dictionary and Corpus needed for Topic Modeling

In [240]:
#The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus. Let’s create them.

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 2)]]


In [21]:
id2word[0]

'advise'

In [22]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('advise', 1),
  ('call', 1),
  ('contact', 1),
  ('continiously', 1),
  ('harrasse', 1),
  ('make', 1),
  ('message', 1),
  ('repeatedly', 1),
  ('stop', 1),
  ('suspect', 2),
  ('text', 1),
  ('victim', 2)]]

In [23]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=12, #We have 12 crime type in our real data
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [24]:
# Print the Keyword in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.124*"go" + 0.123*"state" + 0.061*"get" + 0.049*"kill" + 0.038*"be" + '
  '0.035*"discover" + 0.035*"miss" + 0.033*"come" + 0.027*"wallet" + '
  '0.026*"wall"'),
 (1,
  '0.394*"rear" + 0.141*"attack" + 0.113*"car" + 0.062*"access" + 0.047*"seat" '
  '+ 0.046*"shatter" + 0.046*"slide" + 0.034*"trunk" + 0.011*"various" + '
  '0.008*"bumper"'),
 (2,
  '0.000*"borg" + 0.000*"feudng" + 0.000*"anddont" + 0.000*"responsed" + '
  '0.000*"stabbee" + 0.000*"boxconceale" + 0.000*"pocketsthen" + 0.000*"osver" '
  '+ 0.000*"exsusp" + 0.000*"andvictim"'),
 (3,
  '0.049*"strike" + 0.048*"time" + 0.042*"push" + 0.041*"grab" + '
  '0.037*"approach" + 0.037*"business" + 0.034*"throw" + 0.031*"purse" + '
  '0.030*"head" + 0.030*"attempt"'),
 (4,
  '0.139*"money" + 0.121*"walk" + 0.070*"threaten" + 0.068*"tell" + '
  '0.041*"run" + 0.034*"upset" + 0.032*"steal" + 0.028*"kill" + 0.027*"fail" + '
  '0.024*"metal"'),
 (5,
  '0.139*"pay" + 0.070*"order" + 0.065*"fear" + 0.056*"give" + 0.048*"back" + 

In [25]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -5.946351205832915

Coherence Score:  0.29532926394004894


In [26]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [241]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
#mallet_path = 'F:\\mallet-2.0.8\\bin\\mallet'
mallet_path = "F:\\mallet-2.0.8\\bin\\mallet"
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=12, id2word=id2word)

In [242]:
# Show Topics
pprint(ldamallet.show_topics(num_topics=12, formatted=False))

[(0,
  [('suspect', 0.19432459298264468),
   ('victim', 0.19097576701950458),
   ('money', 0.04425742840255763),
   ('approach', 0.036755144806834665),
   ('foot', 0.02629090322927301),
   ('demand', 0.023585513943366825),
   ('give', 0.022916554725699857),
   ('walk', 0.021108484229756596),
   ('gun', 0.019397130729138682),
   ('approached', 0.019308473483423783)]),
 (1,
  [('suspect', 0.4428832772259859),
   ('victim', 0.23082952555181507),
   ('flee', 0.11880974457930628),
   ('location', 0.0980472415511802),
   ('unknown', 0.03830335836584218),
   ('property', 0.03380920160497643),
   ('scene', 0.0065825195679595026),
   ('left', 0.004336833429398022),
   ('removed', 0.0011430305764159796),
   ('pro', 0.0011332848833162088)]),
 (2,
  [('victim', 0.2325558471313823),
   ('suspect', 0.07606266068122178),
   ('face', 0.04238254452451679),
   ('punch', 0.04176579504735517),
   ('time', 0.03999547095416339),
   ('strike', 0.03944932130916066),
   ('push', 0.029482756324013266),
   ('hit

In [37]:
# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)
# Compute Perplexity
#print('\nPerplexity: ', ldamallet.log_perplexity(corpus))  # a measure of how good the model is. lower the better.



Coherence Score:  0.5327888090297348


In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=6)

In [None]:
# Show graph
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

If the coherence score seems to keep increasing, it may make better sense to pick the model that gave the highest CV before flattening out. This is exactly the case here.

So for further steps I will choose the model with 12 topics itself.


In [None]:
# Select the model and print the topics
optimal_model = model_list[2]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [40]:
model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, id2word)
vis

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [42]:
# Compute Perplexity
print('\nPerplexity: ', model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.



Perplexity:  nan


## Finding the dominant topic in each sentence

One of the practical application of topic modeling is to determine what topic a given document is about.

To find that, we find the topic number that has the highest percentage contribution in that document.

The format_topics_sentences() function below nicely aggregates this information in a presentable table.

In [43]:
def format_two_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                topic_num0 = topic_num
                prop_topic0 = prop_topic
                wp0 = ldamodel.show_topic(topic_num0)
                topic_keywords0 = ", ".join([word for word, prop in wp0])
            elif j == 1:  # => dominant topic
                topic_num1 = topic_num
                prop_topic1 = prop_topic
                wp1 = ldamodel.show_topic(topic_num1)
                topic_keywords1 = ", ".join([word for word, prop in wp1])
            else:
                break
        sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num0), round(prop_topic0,4), topic_keywords0,int(topic_num1), round(prop_topic1,4), topic_keywords1]), ignore_index=True)
    sent_topics_df.columns = ['Dominant_Topic0', 'Perc_Contribution0', 'Topic_Keywords0', 'Dominant_Topic1', 'Perc_Contribution1', 'Topic_Keywords1']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


#df_two_topic_sents_keywords = format_two_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)
df_two_topic_sents_keywords = format_two_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=data)

# Format
df_two_dominant_topic = df_two_topic_sents_keywords.reset_index()
df_two_dominant_topic.columns = ['Document_No', 'Dominant_Topic0', 'Topic_Perc_Contrib0', 'Keywords0', 'Dominant_Topic1', 'Topic_Perc_Contrib1', 'Keywords1', 'Text']

# Show
df_two_dominant_topic.head(10)

KeyboardInterrupt: 

In [44]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


#df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,9.0,0.1884,"victim, suspect, state, phone, call, kill, dem...",suspect continiously calls and text victim ha...
1,1,8.0,0.186,"victim, face, punch, suspect, time, strike, gr...",suspect approached victim and asked victim to...
2,2,1.0,0.1328,"suspect, vehicle, victim, damage, throw, drive...",suspect approached victims vehicle punctured ...
3,3,8.0,0.1427,"victim, face, punch, suspect, time, strike, gr...",unknown suspect involved in a fight inside cl...
4,4,11.0,0.1837,"suspect, item, store, pay, exit, business, ent...",suspect entered location removed and conceale...
5,5,6.0,0.1,"suspect, victim, lock, leave, return, cut, bik...",unknown suspect cut off two cat conv from vic...
6,6,0.0,0.1269,"unknown, vehicle, window, smash, side, break, ...",unknown suspect used unknown method to enter ...
7,7,2.0,0.158,"victim, suspect, make, card, permission, check...",unknown suspect used victims credit card info...
8,8,2.0,0.1647,"victim, suspect, make, card, permission, check...",suspect uses victims identity without permiss...
9,9,6.0,0.1192,"suspect, victim, lock, leave, return, cut, bik...",3 male asian wearings all blk female asian st...


## Find the most representative document for each topic

Sometimes just the topic keywords may not be enough to make sense of what a topic is about. So, to help with understanding the topic, you can find the documents a given topic has contributed to the most and infer the topic by reading that document. 

In [49]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.to_csv('sent_topics_sorteddf_mallet_real.csv')#.head()

In [50]:
sent_two_topics_sorteddf_mallet = pd.DataFrame()

sent_two_topics_outdf_grpd = df_two_topic_sents_keywords.groupby('Dominant_Topic0')

for i, grp in sent_two_topics_outdf_grpd:
    sent_two_topics_sorteddf_mallet = pd.concat([sent_two_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution0'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_two_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
#sent_two_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_two_topics_sorteddf_mallet.head()

NameError: name 'df_two_topic_sents_keywords' is not defined

## Topic distribution across documents

Finally, we want to understand the volume and distribution of topics in order to judge how widely it was discussed. The below table exposes that information.

In [55]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics.head(12)

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,9.0,"victim, suspect, state, phone, call, kill, dem...",86980.0,0.108
1,8.0,"victim, face, punch, suspect, time, strike, gr...",58911.0,0.0731
2,1.0,"suspect, vehicle, victim, damage, throw, drive...",92553.0,0.1149
3,8.0,"victim, face, punch, suspect, time, strike, gr...",8749.0,0.0109
4,11.0,"suspect, item, store, pay, exit, business, ent...",67629.0,0.084
5,6.0,"suspect, victim, lock, leave, return, cut, bik...",60119.0,0.0746
6,0.0,"unknown, vehicle, window, smash, side, break, ...",60525.0,0.0751
7,2.0,"victim, suspect, make, card, permission, check...",45866.0,0.0569
8,2.0,"victim, suspect, make, card, permission, check...",88506.0,0.1099
9,6.0,"suspect, victim, lock, leave, return, cut, bik...",83534.0,0.1037


In [56]:
df_dominant_topics.head(12).to_csv('df_dominant_topics_real.csv')

In [57]:
df_dominant_topic.to_csv('df_dominant_topic_real.csv')

In [60]:
[data[0]]

[' suspect continiously calls and text victim harrassing messages victim has repeatedly advised suspect to stop making contact ']

## Sentence coherence score 

In [243]:
def check(word, list):
    if word in list:
        return 1
    else:
        return 0

In [244]:
def format_topics_sentence(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    total_words = []
    words = []
    words_topics_dics = []
    index = 0
    for cp in corpus:
        total_words.append(0)
        words.append([])
        for id, freq in cp:
            total_words[index]+=freq
            words[index].append(id2word[id])
        values = [0] * len(cp)
        words_topics_dics.append(dict(zip(words[index],values)))
        index +=1
    
    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        #words[i] is list of words of the sentence 
        #need to update words_topics_dics
        words_topics_dic = words_topics_dics[i] 
        
        #row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if(j<12):
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = [word for word, prop in wp]# = ", ".join([word for word, prop in wp]) 
                for key, value in words_topics_dic.items():
                    if key in topic_keywords:
                        words_topics_dic[key]+=1
                #print(prop_topic)
                #print(topic_num)
                #print(wp)
                #print(topic_keywords)
            else:
                break
    return words_topics_dics
sentences = [data[0], data[1], data[2], data[3], data[4]]
sentence_corpus = [corpus[0], corpus[1], corpus[2], corpus[3], corpus[4]]
words_topics_dictionaries = format_topics_sentence(ldamodel=ldamallet, corpus=sentence_corpus, texts=sentences)

In [245]:
def score_calculator(wt_dics):
    scores = []
    index = 0
    for wt_dic in wt_dics:
        total_words = len(wt_dic)
        #print(total_words)
        commons_topics = dict((k, v) for k, v in wt_dic.items() if v > 1)
        unique_topics = dict((k, v) for k, v in wt_dic.items() if v <= 1)
        score = len(unique_topics) - len(commons_topics)
        scores.append(score/total_words)
    return scores
s =  score_calculator(words_topics_dictionaries)        

In [246]:
sentences

[' suspect continiously calls and text victim harrassing messages victim has repeatedly advised suspect to stop making contact ',
 ' suspect approached victim and asked victim to fight suspect hit victim multiple times victim defendedhimself and punched suspect suspect fled location southbound on kingsley from melrose ',
 ' suspect approached victims vehicle punctured victims tires with an unknown sharp tool and fled location in unknown dir ',
 ' unknown suspect involved in a fight inside club hit victim in back of head ',
 ' suspect entered location removed and concealed selected items suspect then fled locvia exit without paying for items ']

In [247]:
words_topics_dictionaries

[{'advise': 0,
  'call': 1,
  'contact': 0,
  'continiously': 0,
  'harrasse': 0,
  'make': 1,
  'message': 0,
  'repeatedly': 0,
  'stop': 0,
  'suspect': 8,
  'text': 0,
  'victim': 7},
 {'suspect': 8,
  'victim': 7,
  'approached': 1,
  'ask': 0,
  'fight': 0,
  'flee': 4,
  'hit': 1,
  'location': 3,
  'multiple': 0,
  'punch': 1,
  'time': 1},
 {'suspect': 8,
  'victim': 7,
  'flee': 4,
  'location': 3,
  'approach': 1,
  'dir': 1,
  'puncture': 0,
  'sharp': 0,
  'tire': 0,
  'tool': 1,
  'unknown': 5,
  'vehicle': 1},
 {'suspect': 8,
  'victim': 7,
  'fight': 0,
  'hit': 1,
  'unknown': 5,
  'club': 0,
  'head': 1,
  'involve': 0},
 {'suspect': 8,
  'flee': 4,
  'location': 3,
  'conceal': 0,
  'entered': 1,
  'exit': 1,
  'item': 1,
  'locvia': 0,
  'pay': 1,
  'remove': 3,
  'select': 0}]

In [248]:
s

[0.6666666666666666,
 0.2727272727272727,
 0.16666666666666666,
 0.25,
 0.2727272727272727]

In [222]:
sentences = [data[0]]
sentence_corpus = [corpus[0]]

In [223]:
sentences

[' suspect continiously calls and text victim harrassing messages victim has repeatedly advised suspect to stop making contact ']

In [224]:
sentence_corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 2),
  (10, 1),
  (11, 2)]]

((number of words that are present in only one topic) - (number of words that are from multi topics))/(total number of words in the document)

In [107]:
corpus[0:1]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 2),
  (10, 1),
  (11, 2)]]

In [106]:
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 2),
 (10, 1),
 (11, 2)]

In [101]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[0:1]]

[[('advise', 1),
  ('call', 1),
  ('contact', 1),
  ('continiously', 1),
  ('harrasse', 1),
  ('make', 1),
  ('message', 1),
  ('repeatedly', 1),
  ('stop', 1),
  ('suspect', 2),
  ('text', 1),
  ('victim', 2)]]

In [249]:
from gensim.test.utils import datapath
# Save model to disk.
temp_file = datapath("ldamallet")
ldamallet.save(temp_file)


In [250]:
# Load a potentially pretrained model from disk.
saved_model = gensim.models.ldamodel.LdaModel.load(temp_file)

In [181]:
texts

[['suspect',
  'continiously',
  'call',
  'text',
  'victim',
  'harrasse',
  'message',
  'victim',
  'repeatedly',
  'advise',
  'suspect',
  'stop',
  'make',
  'contact'],
 ['suspect',
  'approached',
  'victim',
  'ask',
  'victim',
  'fight',
  'suspect',
  'hit',
  'victim',
  'multiple',
  'time',
  'victim',
  'punch',
  'suspect',
  'suspect',
  'flee',
  'location'],
 ['suspect',
  'approach',
  'victim',
  'vehicle',
  'puncture',
  'victim',
  'tire',
  'unknown',
  'sharp',
  'tool',
  'flee',
  'location',
  'unknown',
  'dir'],
 ['unknown', 'suspect', 'involve', 'fight', 'club', 'hit', 'victim', 'head'],
 ['entered',
  'location',
  'remove',
  'conceal',
  'select',
  'item',
  'suspect',
  'flee',
  'locvia',
  'exit',
  'pay',
  'item'],
 ['unknown', 'suspect', 'cut', 'victim', 'vehicle'],
 ['unknown',
  'suspect',
  'use',
  'unknown',
  'method',
  'enter',
  'victim',
  'vehicle',
  'attempt',
  'punch',
  'vehicle',
  'ignition',
  'dash',
  'victim',
  'propert

In [173]:
# Create a new corpus, made of previously unseen documents.

g_df = pd.read_csv('./generated/seqgan_new.csv', quoting = 3).dropna().reset_index(drop=True)
g_df['content'] = " " + g_df['NARRATIVE'].str.lower()+ " "

for key, value in word_replace_dic.items():
    g_df['content'] = g_df['content'].str.replace(key, value, 10)

In [174]:
# Convert to list
g_data = g_df.content.values.tolist()

# Remove Emails
g_data = [re.sub('\S*@\S*\s?', '', sent) for sent in g_data]
# Remove new line characters
g_data = [re.sub('\s+', ' ', sent) for sent in g_data]
# Remove distracting single quotes
g_data = [re.sub("\'", "", sent) for sent in g_data]
g_data_words = list(sent_to_words(g_data))

# Remove Stop Words
g_data_words_nostops = remove_stopwords(g_data_words)
# Form Bigrams
g_data_words_bigrams = make_bigrams(g_data_words_nostops)
# Do lemmatization keeping only noun, adj, vb, adv
g_data_lemmatized = lemmatization(g_data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(g_data_lemmatized[:1])

# Create Corpus
#texts = data_lemmatized

# Create Dictionary
#id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
#texts = data_lemmatized

# Term Document Frequency
#corpus = [id2word.doc2bow(text) for text in texts]

# View
#print(corpus[:1])

other_texts = g_data_lemmatized
other_corpus = [id2word.doc2bow(text) for text in other_texts]

[['unknown', 'suspect', 'email', 'victim', 'state', 'rpt', 'victim', 'discover', 'life', 'letter']]


In [176]:
g_sentences = [g_data[0], g_data[1], g_data[2], g_data[3], g_data[4]]
unseen_doc = [other_corpus[0], other_corpus[1], other_corpus[2], other_corpus[3], other_corpus[4]]
#vector = saved_model[unseen_doc]  # get topic probability distribution for a document
g_words_topics_dictionaries = format_topics_sentence(ldamodel=ldamallet, corpus=unseen_doc, texts=sentences)

In [177]:
s =  score_calculator(g_words_topics_dictionaries)        

In [178]:
g_words_topics_dictionaries

[{'suspect': 9,
  'victim': 7,
  'unknown': 4,
  'state': 1,
  'discover': 1,
  'life': 0,
  'letter': 0,
  'email': 0,
  'rpt': 0},
 {'suspect': 9,
  'victim': 7,
  'unknown': 4,
  'vehicle': 2,
  'remove': 3,
  'property': 3,
  'smash': 1,
  'window': 1,
  'car': 1,
  'fear': 1,
  'poe': 0,
  'give': 0,
  'beer': 0},
 {'suspect': 9, 'victim': 7, 'state': 1, 'shoot': 0, 'build': 0, 'bitch': 0},
 {'suspect': 9,
  'victim': 7,
  'unknown': 4,
  'vehicle': 2,
  'passenger': 0,
  'roll': 0,
  'door': 1,
  'cause': 0,
  'damage': 1,
  'rent': 0,
  'dent': 0,
  'rmv': 0,
  'deep': 0},
 {'suspect': 9,
  'victim': 7,
  'time': 1,
  'property': 3,
  'strike': 1,
  'become': 0,
  'dispute': 1,
  'enrage': 0,
  'work': 0,
  'several': 0,
  'hair': 0,
  'believe': 0,
  'street': 0}]

In [179]:
s

[0.3333333333333333,
 0.07692307692307693,
 0.3333333333333333,
 0.38461538461538464,
 0.5384615384615384]

In [180]:
g_sentences

[' unknown suspect pry hrd obj email victim state rpt victim discover life doesnt letter ',
 ' unknown suspects pry obj pry vehicle smash car window vehicle remove property poe victim fear give anyone property smash car beer ',
 ' victim trespass suspect state bitch build shoot ',
 ' unknown suspect rmvs rent vehicle roll passenger door damage victim vehicle around vehicle cause deep dent ',
 ' suspect become enrage victim dispute victim property dispute victim hair victim believe suspect work street strike victim several time ']

## Generate coherence score for table reported in the paper


In [251]:
# Create a new corpus, made of previously unseen documents.

t_df = pd.read_excel('./generated/table_content.xlsx')#, error_bad_lines=False,names=('NARRATIVE'))

t_df.columns

Index(['NARRATIVE'], dtype='object')

In [252]:

t_df['content'] = " " + t_df['NARRATIVE'].str.lower()+ " "

for key, value in word_replace_dic.items():
    t_df['content'] = t_df['content'].str.replace(key, value, 10)

In [253]:
# Convert to list
t_data = t_df.content.values.tolist()

# Remove Emails if available
t_data = [re.sub('\S*@\S*\s?', '', sent) for sent in t_data]
# Remove new line characters
t_data = [re.sub('\s+', ' ', sent) for sent in t_data]
# Remove distracting single quotes
t_data = [re.sub("\'", "", sent) for sent in t_data]
t_data_words = list(sent_to_words(t_data))

# Remove Stop Words
t_data_words_nostops = remove_stopwords(t_data_words)
# Form Bigrams
t_data_words_bigrams = make_bigrams(t_data_words_nostops)
# Do lemmatization keeping only noun, adj, vb, adv
t_data_lemmatized = lemmatization(t_data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(t_data_lemmatized[:1])

other_texts = t_data_lemmatized
other_corpus = [id2word.doc2bow(text) for text in other_texts]

[['suspect', 'continiously', 'call', 'text', 'victim', 'harrasse', 'message', 'victim', 'repeatedly', 'advise', 'suspect', 'stop', 'make', 'contact']]


In [258]:
t_sentences = t_data[:45]
unseen_doc = other_corpus[:45]

In [259]:
t_sentences

[' suspect continiously calls and text victim harrassing messages victim has repeatedly advised suspect to stop making contact ',
 ' suspect approached victim and asked victim to fight suspect hit victim multiple times victim defendedhimself and punched suspect suspect fled location southbound on kingsley from melrose ',
 ' suspect approached victims vehicle punctured victims tires with an unknown sharp tool and fled location in unknown dir ',
 ' unknown suspect involved in a fight inside club hit victim in back of head ',
 ' suspect entered location removed and concealed selected items suspect then fled locvia exit without paying for items ',
 ' a suspect entered res thru unlocked door removed victim rings rack when victim checked but fled location in unknown dir ',
 ' unknown suspect approached victim vehicle broke glass window to victim bedroom causing damage to bend ',
 ' suspects spray painted graffiti on victims balcony window hitting front walls ',
 ' suspect approached victim a

In [260]:
#vector = saved_model[unseen_doc]  # get topic probability distribution for a document
t_words_topics_dictionaries = format_topics_sentence(ldamodel=ldamallet, corpus=unseen_doc, texts=t_sentences)

In [262]:
s =  score_calculator(t_words_topics_dictionaries)        
pprint(s)

[0.6666666666666666,
 0.2727272727272727,
 0.16666666666666666,
 0.25,
 0.2727272727272727,
 0.07692307692307693,
 0.5,
 0.42857142857142855,
 0.5555555555555556,
 0.5714285714285714,
 -0.25,
 0.4,
 0.2,
 0.5,
 0.5,
 0.6666666666666666,
 0.625,
 0.5555555555555556,
 0.6,
 0.8571428571428571,
 0.8,
 0.3333333333333333,
 0.2,
 1.0,
 1.0,
 0.09090909090909091,
 0.125,
 0.0,
 0.29411764705882354,
 0.3333333333333333,
 0.3333333333333333,
 0.0,
 0.7142857142857143,
 0.38461538461538464,
 -0.7142857142857143,
 0.0,
 0.3333333333333333,
 0.0,
 -0.5,
 0.7391304347826086,
 0.6,
 0.625,
 0.5,
 0.0,
 0.5]
