In [8]:
import ast
import pandas as pd
import numpy as np
import spacy
from gensim import corpora
from gensim.models import LsiModel
from coherence import eval_coherence

In [2]:
# Load the summaries, which are stored on disk as a Python literal.
# ast.literal_eval only evaluates literals, so the file contents are parsed, not executed.
with open("sums.txt", mode="r") as sums_file:
    sums = ast.literal_eval(sums_file.read())

In [3]:
def prepare_corpus(doc_clean):
    """
    Input  : doc_clean - tokenized documents (a collection of token lists); it is
             traversed twice, so it must be re-iterable
    Purpose: build the term dictionary of the corpus and convert the list of
             documents (corpus) into a Document Term Matrix
    Output : (term dictionary, Document Term Matrix)
    """
    # Term dictionary: every unique term in the corpus is assigned an index.
    term_dictionary = corpora.Dictionary(doc_clean)
    # Document Term Matrix: one bag-of-words vector per document, built with the dictionary above.
    bow_corpus = list(map(term_dictionary.doc2bow, doc_clean))
    return term_dictionary, bow_corpus

In [4]:
def create_gensim_lsa_model(doc_clean, number_of_topics):
    """
    Input  : doc_clean - tokenized documents, number_of_topics - number of topics to extract
    Purpose: train an LSA model using gensim
    Output : the trained LsiModel
    """
    term_dictionary, bow_corpus = prepare_corpus(doc_clean)
    # Train the model on the bag-of-words corpus; the dictionary is passed as
    # id2word so topics can be reported as terms.
    return LsiModel(bow_corpus, num_topics=number_of_topics, id2word=term_dictionary)

### Modeling

In [5]:
# Split each summary on whitespace into a list of tokens.
sums_list = [summary.split() for summary in sums]

In [9]:
# Train the LSA model on the tokenized summaries with five topics.
LSA = create_gensim_lsa_model(doc_clean=sums_list, number_of_topics=5)

In [10]:
# Maps topic index -> list of [word, weight] pairs; populated by the topic loop below.
myDict = dict()

In [11]:
TOP_N_WORDS = 50  # number of terms extracted (and printed) per topic

for i in range(5):  # 5 = number of topics the LSA model was trained with
    # The header is derived from TOP_N_WORDS so it always matches what is printed.
    print(f'THE TOP {TOP_N_WORDS} WORDS FOR TOPIC #{i}')
    # show_topic yields (word, weight) pairs directly, so the formatted
    # print_topics string does not have to be regenerated and re-parsed on
    # every iteration (parsing would also break on tokens containing '*' or '+').
    # Weights are rounded to three decimals, the precision print_topics reports.
    myDict[i] = [
        [word, round(float(weight), 3)]
        for word, weight in LSA.show_topic(i, topn=TOP_N_WORDS)
    ]
    print(np.array([word for word, _ in myDict[i]]))
    print('\n')

THE TOP 30 WORDS FOR TOPIC #0
['defendant' 'victim' 'work' 'accuse' 'woman' 'migrant' 'money' 'police'
 'court' 'prostitution' 'tell' 'sexual' 'transport' 'use' 'day' 'time'
 'force' 'prostitute' 'year' 'make' 'house' 'criminal' 'provide' 'smuggle'
 'person' 'receive' 'travel' 'case' 'girl' 'passport' 'charge' 'traffic'
 'service' 'month' 'sex' 'ms' 'return' 'client' 'order' 'mr' 'later'
 'state' 'know' 'paid' 'arrest' 'home' 'information' 'stay' 'evidence'
 'group']


THE TOP 30 WORDS FOR TOPIC #1
['accuse' 'defendant' 'victim' 'migrant' 'drug' 'work' 'time' 'make'
 'smuggle' 'tell' 'house' 'money' 'girl' 'complainant' 'police' 'client'
 'person' 'state' 'sex' 'sexual' 'day' 'know' 'child' 'woman' 'later'
 'testify' 'case' 'stay' 'say' 'vessel' 'man' 'evidence' 'ms' 'use' 'mr'
 'appellant' 'return' 'charge' 'court' 'arrive' 'mother' 'live' 'criminal'
 'want' 'anonymous' 'officer' 'left' 'group' 'home' 'act']


THE TOP 30 WORDS FOR TOPIC #2
['migrant' 'victim' 'smuggle' 'defendant' 'ac

In [12]:
# Evaluate the extracted topics against the tokenized summaries with the
# project's eval_coherence helper; the returned score is shown as the cell output.
eval_coherence(myDict, sums_list)

Evaluating topic coherence...
Done



0.28511379073924903

### Store results

In [13]:
# Empty frame with a "words" and a "significance" column per topic, interleaved:
# Topic_0_words, Topic_0_significance, Topic_1_words, ...
results = pd.DataFrame(
    columns=[
        f'Topic_{i}_{field}'
        for i in range(5)
        for field in ('words', 'significance')
    ]
)

In [14]:
# Display the still-empty frame to check the column layout.
results

Unnamed: 0,Topic_0_words,Topic_0_significance,Topic_1_words,Topic_1_significance,Topic_2_words,Topic_2_significance,Topic_3_words,Topic_3_significance,Topic_4_words,Topic_4_significance


In [15]:
for i in range(5):
    # Build each column straight from the [word, weight] pairs.  Converting the
    # mixed str/float pairs with np.array() coerces everything to strings, which
    # would store the significances as text instead of numbers (also in the
    # exported spreadsheet).
    results['Topic_{}_words'.format(str(i))] = [pair[0] for pair in myDict[i]]
    results['Topic_{}_significance'.format(str(i))] = [float(pair[1]) for pair in myDict[i]]

In [16]:
# Preview the first five rows of the populated table.
results.head(n=5)

Unnamed: 0,Topic_0_words,Topic_0_significance,Topic_1_words,Topic_1_significance,Topic_2_words,Topic_2_significance,Topic_3_words,Topic_3_significance,Topic_4_words,Topic_4_significance
0,defendant,0.569,accuse,-0.625,migrant,-0.569,defendant,-0.479,victim,-0.582
1,victim,0.531,defendant,0.394,victim,0.381,accuse,-0.451,defendant,0.391
2,work,0.208,victim,0.309,smuggle,-0.291,work,0.302,woman,0.29
3,accuse,0.15,migrant,-0.149,defendant,-0.241,woman,0.282,anonymous,0.25
4,woman,0.125,drug,-0.141,accuse,0.215,ms,0.244,work,0.237


In [17]:
# Write the topic table to an Excel workbook, leaving out the row index.
results.to_excel(excel_writer='LSA_results.xlsx', index=False)