In [None]:
import json
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from collections import defaultdict
import numpy as np

In [None]:
# Load the corpus. Later cells feed `texts` to the vectorizer, index it by
# position and split each entry on whitespace, so it is expected to be a JSON
# array of strings.
# The encoding is given explicitly so decoding does not depend on the platform's
# locale default; this matches the UTF-8 writers used elsewhere in the notebook.
with open("english_texts.json", "r", encoding="utf-8") as file:
    texts = json.load(file)

In [None]:
# Hyper-parameter grid of the previously trained models: every combination of
# topic count and learning decay has its own pickle file on disk.
n_topics = [3, 5, 7, 9]
decays = [0.5, 0.7, 0.9]
grid = [(n_topic, decay) for n_topic in n_topics for decay in decays]

# models[(n_topic, decay)] -> unpickled model, inserted in grid order.
# pickle.load can execute arbitrary code from the file, so only load model
# files that come from a trusted source.
models = {}
for n_topic, decay in grid:
    model_path = f"model_{n_topic}_{decay}.p"
    with open(model_path, "rb") as file:
        model = pickle.load(file)
    models[(n_topic, decay)] = model

In [None]:
# Fit a TF-IDF vectorizer on the corpus and build the document-term matrix X
# (one row per entry of `texts`); the loaded models are scored on and applied
# to this matrix in later cells.
# NOTE(review): the vocabulary is re-learned here with default settings; this
# presumably matches the vectorizer the pickled models were trained with -
# confirm, otherwise the feature columns will not line up.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

In [None]:
# Group the results by learning decay so each decay value can be drawn as its
# own line: for a given decay, topics_by_decays holds the topic counts and
# scores_by_decays holds the matching model.score(X) values, in the same order.
# (The score is presumably the LDA approximate log-likelihood - confirm against
# the type of the pickled models.)
scores_by_decays = defaultdict(list)
topics_by_decays = defaultdict(list)
for (n_topic, decay), model in models.items():
    model_score = model.score(X)
    topics_by_decays[decay].append(n_topic)
    scores_by_decays[decay].append(model_score)

In [None]:
# Compare the candidate models: one line per learning-decay value, showing the
# model score as a function of the number of topics. The figure is also saved
# to "optimal_model.png".
fig, ax = plt.subplots(figsize=(12, 8))

for decay, scores in scores_by_decays.items():
    topics = topics_by_decays[decay]
    ax.plot(topics, scores, label=str(decay))

ax.set_title("Choosing Optimal LDA Model")
ax.set_xlabel("Num Topics")
ax.set_ylabel("Log Likelihood Scores")  # label previously misspelled "Likelyhood"
ax.legend(title='Learning decay', loc='best')
fig.savefig("optimal_model.png")

In [None]:
from collections import Counter, defaultdict

# Select the model explicitly instead of relying on whatever `model` was left
# bound by an earlier loop. (9, 0.9) is the entry those loops end on, so the
# results are unchanged.
# NOTE(review): confirm this is the model that should be analysed.
model = models[(9, 0.9)]

# Row i of `transformed` holds the topic weights of texts[i]; each text is
# assigned to the topic with the largest weight.
n = X.shape[0]
transformed = model.transform(X)
dominant_topics = transformed.argmax(axis=1)

# counter[topic][word] -> occurrences of `word` (whitespace-split tokens) over
# all texts assigned to `topic`.
counter = defaultdict(Counter)
for i in range(n):
    counter[dominant_topics[i]].update(texts[i].split())

# For every topic, the full list of (word, count) pairs, most frequent first.
# Ties keep first-seen order, as with a stable descending sort.
top_words_by_topic = {
    topic: word_counts.most_common() for topic, word_counts in counter.items()
}

# Display only a short preview; the complete lists stay in top_words_by_topic.
{topic: ranked[:10] for topic, ranked in top_words_by_topic.items()}

In [None]:
# Dump the ranked words of each topic to "<topic>_words.txt": one line per
# word, the word left-justified to 20 columns followed by its count.
for topic, ranked_words in top_words_by_topic.items():
    with open(f"{topic}_words.txt", "w", encoding="utf-8") as file:
        file.writelines(
            f"{word.ljust(20)} {count}\n" for word, count in ranked_words
        )

In [None]:
# Mapping used later to look up a message id from an entry of `texts`
# (it is indexed with the text itself as the key).
# Read as UTF-8 explicitly so decoding does not depend on the platform's locale
# default, and parse straight from the file handle.
with open("ids_by_lems.json", encoding="utf-8") as file:
    ids_by_lems = json.load(file)


In [None]:
# Load the id-keyed JSON file (presumably id -> English text, judging by the
# file name - confirm). Read as UTF-8 explicitly so decoding does not depend on
# the platform's locale default, and parse straight from the file handle.
with open("english_texts_by_id.json", encoding="utf-8") as file:
    english_texts_by_ids = json.load(file)

In [None]:
import os
import re
import pandas as pd
from datetime import datetime

# full_texts_by_id["<key_intercomconversation>_<key_iteration>"] ->
#     (created_date parsed to a datetime, clean_body)
# built from every CSV file in the messages directory.
full_texts_by_id = {}

messages_dir = "cleaned_intercom_messages"
# Bodies containing this " Selected category: ... Content: ... uid:" pattern
# are left out of the mapping. Compiled once instead of once per row.
skip_pattern = re.compile(" Selected category: (.+) Content: (.+) uid:")

# Sorted so that, if the same key occurs in more than one file, the winning
# entry does not depend on the arbitrary order os.listdir returns.
for file_name in sorted(os.listdir(messages_dir)):
    file_path = os.path.join(messages_dir, file_name)
    if not os.path.isfile(file_path):
        # Skip sub-directories (e.g. a ".ipynb_checkpoints" folder), which
        # read_csv cannot open.
        continue

    messages = pd.read_csv(file_path)
    messages = messages[messages["clean_body"].notna()]

    message_keys = zip(messages["key_intercomconversation"], messages["key_iteration"])
    rows = zip(message_keys, messages["clean_body"], messages["created_date"])
    for message_key, body, date in rows:
        if skip_pattern.search(body) is not None:
            continue
        full_texts_by_id["_".join(map(str, message_key))] = (
            datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ"),
            body,
        )


In [None]:
# Collect, for every topic, the (datetime, body) pairs of the texts whose
# largest topic weight falls on that topic. The full message is found by
# mapping the text to its id and the id to the stored pair.
texts_by_topic = defaultdict(list)
for text, topic in zip(texts, transformed.argmax(axis=1)):
    text_id = ids_by_lems[text]
    texts_by_topic[topic].append(full_texts_by_id[text_id])

# Order each topic's messages by date (stable for equal dates), then keep only
# the message bodies.
for topic, dated_texts in texts_by_topic.items():
    chronological = sorted(dated_texts, key=lambda pair: pair[0])
    texts_by_topic[topic] = [body for _, body in chronological]

In [None]:
# Convert to a plain dict, then write each topic's message bodies to
# "<topic>.txt", with a blank line after every message.
texts_by_topic = dict(texts_by_topic)

for topic, topic_texts in texts_by_topic.items():
    with open(f"{topic}.txt", "w", encoding="utf-8") as file:
        for text in topic_texts:
            file.write(text + "\n\n")