In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import ast
import numpy as np
from coherence import eval_coherence

In [3]:
# Load the pre-processed summaries. The file holds a Python literal that is
# parsed with ast.literal_eval (safe: only literals are evaluated, no code).
# NOTE(review): downstream cells treat `sums` as an iterable of strings — confirm.
with open('sums.txt', mode='r') as f:
    sums = ast.literal_eval(
        f.read()
    )

### Modeling

In [4]:
# Bag-of-words vectorizer with English stop words removed and document-frequency
# bounds of min_df=5 / max_df=0.9 applied to the vocabulary.
cv = CountVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.9,
)

In [5]:
# Learn the vocabulary from the summaries and build the document-term matrix
# that the topic model is fitted on.
dtm = cv.fit_transform(raw_documents=sums)

In [6]:
# Show the fitted vocabulary as a one-column table for a quick sanity check.
pd.DataFrame({'Words': cv.get_feature_names_out()})

Unnamed: 0,Words
0,abandon
1,abduct
2,abet
3,ability
4,able
...,...
3249,yugoslavia
3250,zambia
3251,zamboanga
3252,zealand


In [7]:
# Topic model with 5 topics; the fixed random_state keeps runs reproducible.
LDA = LatentDirichletAllocation(
    random_state=42,
    n_components=5,
)

In [8]:
# Fitting can take a while: the corpus contains a large number of documents.
LDA.fit(X=dtm)

LatentDirichletAllocation(n_components=5, random_state=42)

In [9]:
# Container filled later: topic index -> list of [word, weight] pairs.
myDict = dict()

In [10]:
# Normalise each topic row of the fitted component matrix so that its word
# weights sum to 1 (keepdims keeps the row sums as a column for broadcasting).
word_distribution = LDA.components_ / LDA.components_.sum(axis=1, keepdims=True)

In [11]:
# For every topic: store all [word, weight] pairs (sorted by ascending weight)
# in myDict, and print the highest-weighted words in descending order.
top_n = 50  # how many top words are printed per topic

# The vocabulary does not change inside the loop, so look it up once instead
# of re-building the feature-name array for every single word.
feature_names = cv.get_feature_names_out()

for index, topic in enumerate(word_distribution):
    # The header now reports the number of words actually printed
    # (it previously said 30 while 50 words were listed).
    print(f'THE TOP {top_n} WORDS FOR TOPIC #{index}')

    # argsort orders indices by ascending weight; compute it once per topic.
    ranked = topic.argsort()

    # Keyed by the topic index itself (the separate counter was redundant).
    myDict[index] = [[feature_names[i], round(topic[i], 3)] for i in ranked]

    # Last `top_n` indices are the heaviest words; reverse for descending order.
    print([feature_names[i] for i in ranked[-top_n:][::-1]])
    print('\n')

THE TOP 30 WORDS FOR TOPIC #0
['smuggle', 'vessel', 'migrant', 'crew', 'australia', 'later', 'siev', 'venture', 'case', 'mr', 'charge', 'member', 'board', 'indonesian', 'convict', 'states', 'involvement', 'australian', 'intercept', 'indonesia', 'passenger', 'united', 'accuse', 'near', 'authority', 'carry', 'relate', 'island', 'involve', 'reef', 'ashmore', 'paid', 'time', 'apprehend', 'afghan', 'receive', 'left', 'captain', 'involves', 'boat', 'christmas', 'day', 'refer', 'mexico', 'appeal', 'person', 'june', 'arrival', 'background', 'alien']


THE TOP 30 WORDS FOR TOPIC #1
['court', 'traffic', 'criminal', 'human', 'woman', 'case', 'police', 'person', 'child', 'defendant', 'accuse', 'act', 'crime', 'prostitution', 'state', 'charge', 'work', 'applicant', 'law', 'victim', 'investigation', 'order', 'year', 'offence', 'group', 'code', 'exploitation', 'section', 'make', 'commit', 'minor', 'article', 'sentence', 'evidence', 'report', 'force', 'allege', 'plaintiff', 'worker', 'labour', 'file',

In [12]:
# Tokenise every summary on whitespace, then score the topics with the
# project's eval_coherence helper (its result is the cell output).
sums_list = []
for item in sums:
    sums_list.append(item.split())
eval_coherence(myDict, sums_list)

Evaluating topic coherence...
Done



0.622649557633958

### Store results

In [13]:
# Empty results table with a words/significance column pair for each of the
# 5 topics, in the order Topic_0_words, Topic_0_significance, Topic_1_words, ...
results = pd.DataFrame(
    columns=[
        template.format(str(i))
        for i in range(5)
        for template in ('Topic_{}_words', 'Topic_{}_significance')
    ]
)

In [14]:
# Display the (still empty) results frame to verify the column layout.
results

Unnamed: 0,Topic_0_words,Topic_0_significance,Topic_1_words,Topic_1_significance,Topic_2_words,Topic_2_significance,Topic_3_words,Topic_3_significance,Topic_4_words,Topic_4_significance


In [15]:
# Fill the results table with the 50 highest-weighted words of each topic and
# their weights, strongest first.
for i in range(5):
    # myDict[i] is ordered by ascending weight, so the last 50 entries are the
    # strongest words; reverse them to get descending order.
    top_entries = myDict[i][-50:][::-1]

    # Split the pairs with plain comprehensions instead of np.flip(...).T:
    # converting the mixed [word, weight] pairs to a NumPy array coerced the
    # weights to strings, so the significance columns were written as text.
    results['Topic_{}_words'.format(str(i))] = [word for word, weight in top_entries]
    results['Topic_{}_significance'.format(str(i))] = [float(weight) for word, weight in top_entries]

In [16]:
# Preview the first five rows of the populated results table.
results.head(n=5)

Unnamed: 0,Topic_0_words,Topic_0_significance,Topic_1_words,Topic_1_significance,Topic_2_words,Topic_2_significance,Topic_3_words,Topic_3_significance,Topic_4_words,Topic_4_significance
0,smuggle,0.045,court,0.016,migrant,0.044,appellant,0.028,victim,0.049
1,vessel,0.042,traffic,0.013,defendant,0.027,migrant,0.018,defendant,0.042
2,migrant,0.034,criminal,0.011,smuggle,0.018,accuse,0.017,work,0.02
3,crew,0.024,human,0.01,criminal,0.013,france,0.014,woman,0.014
4,australia,0.024,woman,0.009,vessel,0.011,appeal,0.013,prostitution,0.009


In [17]:
# Persist the topic table to an Excel workbook; the row index is not written.
results.to_excel(
    'LDA_results.xlsx',
    index=False,
)