# Baseline Model
## Topic Modeling with Latent Dirichlet Allocation (LDA)

In [1]:
import preprocess
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the raw records through the project-local `preprocess` module.
# NOTE(review): presumably this decrypts a protected data source — confirm
# against preprocess.read_encrypted_data.
data = preprocess.read_encrypted_data()
# Reformat the raw records into `df`; the vectorizer cell reads its
# "Event_Remarks_Text" column (assumes a DataFrame-like object — TODO confirm).
df = preprocess.reformat_data(data)

In [3]:
# Bag-of-words features over the free-text remarks: unigrams up to 4-grams,
# English stop words removed, keeping only terms that occur in at least 10
# documents and in no more than 20% of all documents.
# (TfidfVectorizer accepts the same keyword arguments and can be swapped in.)
vectorizer_params = dict(
    lowercase=True,
    min_df=10,
    max_df=0.2,
    stop_words="english",
    ngram_range=(1, 4),
)
vectorizer = CountVectorizer(**vectorizer_params)
Text_X = vectorizer.fit_transform(df["Event_Remarks_Text"])

In [4]:
# Fit a 100-topic LDA model using the online variational Bayes update.
# random_state pins the run for reproducibility; verbose=2 together with
# evaluate_every=1 logs the perplexity after every iteration.
lda = LatentDirichletAllocation(
    n_components=100,
    max_iter=100,
    learning_method='online',
    random_state=0,
    verbose=2,
    evaluate_every=1,
)

# Pass the sparse document-term matrix as is: LatentDirichletAllocation
# accepts sparse input, so densifying it first (the former `Text_X.A`) only
# allocated an n_docs x n_features dense array and depended on a shorthand
# attribute that SciPy's newer sparse-array classes do not provide.
# `theta` holds the per-document topic distribution returned by the fit.
theta = lda.fit_transform(Text_X)

iteration: 1 of max_iter: 100, perplexity: 682229.7308
iteration: 2 of max_iter: 100, perplexity: 113600.1098
iteration: 3 of max_iter: 100, perplexity: 27397.2905
iteration: 4 of max_iter: 100, perplexity: 9059.9775
iteration: 5 of max_iter: 100, perplexity: 3942.5133
iteration: 6 of max_iter: 100, perplexity: 2169.9480
iteration: 7 of max_iter: 100, perplexity: 1437.6708
iteration: 8 of max_iter: 100, perplexity: 1093.1183
iteration: 9 of max_iter: 100, perplexity: 913.7514
iteration: 10 of max_iter: 100, perplexity: 812.8543
iteration: 11 of max_iter: 100, perplexity: 755.2321
iteration: 12 of max_iter: 100, perplexity: 719.0780
iteration: 13 of max_iter: 100, perplexity: 696.5933
iteration: 14 of max_iter: 100, perplexity: 681.5640
iteration: 15 of max_iter: 100, perplexity: 670.3489
iteration: 16 of max_iter: 100, perplexity: 662.8335
iteration: 17 of max_iter: 100, perplexity: 657.0895
iteration: 18 of max_iter: 100, perplexity: 652.5670
iteration: 19 of max_iter: 100, perplexity

In [5]:
# Per-topic word distribution: every row of the topic-word pseudo-count
# matrix divided by its own total.
topic_word = lda.components_
beta = topic_word / np.sum(topic_word, axis=1, keepdims=True)
# Column indices of the 15 highest-probability terms of each topic.
max_words = np.argsort(-beta, axis=1)[:, :15]
features = vectorizer.get_feature_names_out()

for i, top_terms in enumerate(max_words):
    print(f"topic: {i}")
    print(f"psuedo count: {topic_word[i].sum()}")
    # List only the top terms whose pseudo-count in this topic is at least 5,
    # so topics that received almost no mass print an empty list.
    strong_terms = [features[ind] for ind in top_terms if topic_word[i, ind] >= 5]
    print(strong_terms)
    print('\n')

topic: 0
psuedo count: 1280.4867039071505
['changed', 'miscellaneous', 'priority', 'changed 2023 01', 'event priority changed 2023', 'priority changed 2023 01', 'event priority changed', 'priority changed 2023', 'changed 2023', 'event priority', 'priority changed', 'type', 'miscellaneous miscellaneous 2023 01', 'miscellaneous 2023 01', 'event type']


topic: 1
psuedo count: 10.473884895880023
[]


topic: 2
psuedo count: 501.2125576313738
['timers', '07', '2023 01 07', '01 07', 'doing', 'backup', 'door', 'required', 'advise', 'area', 'd121', 'follow', 'redacted timers', 'door redacted', 'needed']


topic: 3
psuedo count: 10.473982915196427
[]


topic: 4
psuedo count: 10.474214532516903
[]


topic: 5
psuedo count: 10.474297309247426
[]


topic: 6
psuedo count: 10.473872177291135
[]


topic: 7
psuedo count: 10.474284296805418
[]


topic: 8
psuedo count: 3408.1429210585984
['request', 'scene', 'caller scene', 'urgent', 'urgent request', 'complaint', 'person', 'incident', 'chief', 'suspect'