In [55]:
import pandas as pd
import sys
from top2vec import Top2Vec
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import punkt
from nltk.stem import WordNetLemmatizer
from collections import Counter
# import umap.umap_ as umap
# import umap.plot
nltk.download("stopwords")
import numpy as np
import json
import glob
import gensim
import gensim.corpora as corpora 
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel
import spacy
from nltk.corpus import stopwords
import pyLDAvis
import pyLDAvis.gensim_models
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
def read_data(path="../data/training_data/complaint_classification_labeled_data_1_31_2023.csv"):
    """Load the labelled complaint data from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the labelled data export this
        notebook was written against, so ``read_data()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    return pd.read_csv(path)

In [57]:
# Load the labelled complaints from the default CSV location.
df = read_data()

In [58]:
def split_rows_with_multiple_labels(df):
    """Return a copy of ``df`` with one row per label.

    The ``label`` column is lower-cased, stripped, and has ``#`` separators
    rewritten to ``/``. The compound label
    "internal misconduct/administrative infraction" is rewritten with "; " so
    that its own slash is not treated as a separator. Each remaining
    ``/``-separated value then gets its own row, with the other columns
    repeated.

    Rows whose label is missing are kept, with a missing label. The returned
    frame has a fresh ``RangeIndex`` and ``label`` as its last column. The
    input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string ``label`` column.

    Returns
    -------
    pandas.DataFrame
    """
    normalized = (
        df["label"]
        .str.lower()
        .str.strip()
        .str.replace("#", "/", regex=False)
        .str.replace(
            "internal misconduct/administrative infraction",
            "internal misconduct; administrative infraction",
            regex=False,
        )
    )
    # explode() yields one row per list element and keeps missing labels as a
    # single NaN row. Unlike split(expand=True).stack() it does not depend on
    # stack() dropping the padding values, and it needs no index-based join.
    return (
        df.drop(columns="label")
        .assign(label=normalized.str.split("/"))
        .explode("label")
        .reset_index(drop=True)
    )

In [59]:
# Expand rows that carry several labels into one row per label.
df = df.pipe(split_rows_with_multiple_labels)

In [60]:
def extract_labels(df):
    """Derive a binary ``target`` column from ``label``.

    After lower-casing and stripping the label:

    * rows whose label contains
      "internal misconduct; administrative infraction" get ``target == "1"``;
    * rows whose label still has text once that phrase is removed get
      ``target == "0"``.

    Rows with a missing or empty label match neither rule and are dropped.
    A label that contains the phrase plus other text matches both rules and
    appears once in each group.

    The result lists all "1" rows first, then all "0" rows. The original index
    values are kept, ``target`` is stored as a string, and the ``text`` column
    is renamed to ``allegation_desc``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string ``label`` column.

    Returns
    -------
    pandas.DataFrame
    """
    positive_label = "internal misconduct; administrative infraction"
    labels = df["label"].str.lower().str.strip()

    is_positive = labels.str.contains(positive_label, regex=False, na=False)
    # Whatever is left after removing the positive phrase marks a row that
    # carries some other label.
    remainder = labels.str.replace(positive_label, "", regex=False)
    is_negative = remainder.fillna("") != ""

    # assign() works on fresh frames, so the caller's frame gains no column
    # and no SettingWithCopy warning is raised.
    positives = df.loc[is_positive].assign(target="1")
    negatives = df.loc[is_negative].assign(target="0")

    return pd.concat([positives, negatives], axis=0).rename(
        columns={"text": "allegation_desc"}
    )

In [61]:
# Attach the binary `target` column and rename `text` to `allegation_desc`.
df = df.pipe(extract_labels)

In [62]:
def split_data(df, test_size=0.3, random_state=42, stratify_col=None):
    """Split ``df`` into training and test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    test_size : float or int, optional
        Passed to ``train_test_split``; defaults to 0.3.
    random_state : int or None, optional
        Seed for the shuffle. A fixed default makes the split, and the CSV
        files written from it, reproducible across kernel restarts. Pass
        ``None`` for an unseeded split.
    stratify_col : str or None, optional
        Name of a column whose class proportions should be preserved in both
        partitions (for example ``"target"``). ``None`` disables
        stratification.

    Returns
    -------
    tuple
        ``(training_data, test_data, df)``, where the last item is the
        unsplit input frame.
    """
    stratify = df[stratify_col] if stratify_col is not None else None
    training_data, test_data = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return training_data, test_data, df

In [63]:
# Split the labelled frame; og_df is the unsplit frame that split_data
# returns as its third value.
training_data, test_data, og_df = split_data(df)

# Write both partitions to CSV without the index column.
# NOTE(review): to_csv does not create missing directories - confirm that
# training_data/ and test_data/ exist relative to the working directory.
training_data.to_csv("training_data/training_data.csv", index=False)
test_data.to_csv("test_data/test_data.csv", index=False)

In [64]:
# Class balance of the training partition.
training_data.target.value_counts()

0    941
1    139
Name: target, dtype: int64

In [65]:
# Class balance of the test partition.
test_data.target.value_counts()

0    379
1     84
Name: target, dtype: int64

In [66]:
################################################### top2vec ##################################################################################

In [67]:
def create_model(df):
    """Fit a Top2Vec model on the ``allegation_desc`` column of ``df``.

    The column is converted to a plain Python list before being handed to
    ``Top2Vec``. No ``document_ids`` are supplied.

    Returns the fitted ``Top2Vec`` instance.
    """
    documents = list(df["allegation_desc"])

    return Top2Vec(
        documents,
        ngram_vocab=True,
        speed="deep-learn",
        use_embedding_model_tokenizer=True,
        min_count=5,
    )

In [68]:
def model():
    """Fit and return a Top2Vec model on the notebook-level ``og_df`` frame."""
    return create_model(og_df)

In [69]:
# Fit Top2Vec on og_df (the full, unsplit frame). create_model is called
# directly instead of `model = model()`, because rebinding the helper's own
# name to its result makes this cell fail when it is run a second time.
model = create_model(og_df)

2023-02-02 11:12:54,725 - top2vec - INFO - Pre-processing documents for training
2023-02-02 11:12:54,907 - top2vec - INFO - Creating joint document/word embedding
2023-02-02 11:13:53,831 - top2vec - INFO - Creating lower dimension embedding of documents
2023-02-02 11:14:04,948 - top2vec - INFO - Finding dense areas of documents
2023-02-02 11:14:05,072 - top2vec - INFO - Finding topics


In [70]:
# if len(model.get_topic_sizes()) > 1:
#     topic_words, word_scores, topic_nums = model.get_topics()
#     for words, scores, num in zip(topic_words, word_scores, topic_nums):
#         print(num)
#         print(f"Words: {words}")

In [71]:
# topic_sizes, top_nums = model.get_topic_sizes()
# print(topic_sizes)
# print(top_nums)

In [72]:
# documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=1, num_docs=10)

# for doc, score, doc_id in list(zip(documents, document_scores, document_ids)):
#     print(f"Document: {doc_id}, Score: {score}")
#     print("--------------------")
#     print(doc)
#     print("--------------------")

In [73]:
# documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=0, num_docs=20)

# ents = {(doc, score) for doc, score in list(zip(documents, document_scores))}
# df = pd.DataFrame(ents, columns=["allegation_desc", "score"])
# print(df)
# df.loc[:, "topic"] = "9"

In [74]:
# Persist the fitted Top2Vec model.
# NOTE(review): presumably requires the models/ directory to exist - confirm.
model.save("models/top2vec_train_model")

In [75]:
# model.generate_topic_wordcloud(0)

In [76]:
# model.topic_words

In [77]:
# model.topic_words[0]

In [78]:
# topic_words, word_scores, topic_nums = model.get_topics(0)
# for words, scores, nums in zip(topic_words, word_scores, topic_nums):
#   print("Topic Number: ",nums)
#   print(f"Words: {words}")
#   print("\n")

In [79]:
# topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["inmate"], num_topics=0)
# for word, w_score, topic, t_score in list(zip(topic_words, word_scores, topic_scores, topic_nums)):
#     print(f"Word: \n{word}")
#     print("--------------------")
#     print(f"Word Score \n{w_score}")
#     print("--------------------")
#     print(f"Topic Score: \n{topic}")
#     print("--------------------")
#     print (f"Topic # \n{t_score}")
#     print("--------END---------")

In [80]:
# model = Top2Vec.load("models/noso")

# umap_args = {
#     "n_neighbors": 15,
#     "n_components": 2, # 5 -> 2 for plotting 
#     "metric": "cosine",
# }
# umap_model = umap.UMAP(**umap_args).fit(model.topic_vectors)
# umap.plot.points(umap_model, labels=model.doc_top_reduced)

In [81]:
############################################################ bertopic #################################################################

In [82]:
# Allegation texts of the full (unsplit) frame, as a pandas Series.
docs = og_df.allegation_desc

In [83]:
# Plain Python list of the allegation texts. It is built directly from og_df,
# not by round-tripping `docs` through JSON, so the cell can be re-run safely
# (on a second run `docs` is already a list and has no to_json method).
# Note: missing values, if any, stay NaN here rather than becoming None.
docs = og_df.allegation_desc.tolist()

In [84]:
# Encode every document with the all-MiniLM-L6-v2 sentence-transformer.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = np.array(sentence_model.encode(docs, show_progress_bar=True))

# Persist the vectors. The result of to_csv (None when a path is given) is
# deliberately not assigned, so `embeddings` keeps holding the array.
pd.DataFrame(embeddings).to_csv("vecs/bert_train_vecs.csv", index=False)

Batches: 100%|██████████| 49/49 [00:31<00:00,  1.58it/s]


In [None]:
# Bag-of-words vectorizer using scikit-learn's built-in English stop-word list.
vectorizer_model = CountVectorizer(stop_words="english")
# BERTopic is given the embedding model by name, so it presumably computes its
# own embeddings rather than reusing the ones encoded earlier - TODO confirm.
# NOTE(review): the `diversity` argument may not be accepted by newer BERTopic
# releases - confirm against the installed version.
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", vectorizer_model=vectorizer_model, diversity=0.2)

In [None]:
# Fit BERTopic on the documents and keep its two return values.
topic, probs = topic_model.fit_transform(docs)

In [None]:
# Persist the fitted BERTopic model.
topic_model.save("models/bert_train_model")

In [None]:
# topic_model.get_topic_info()

In [None]:
# topic_model.get_topic(3)

In [None]:
# topic_model.get_representative_docs(3)

In [None]:
# topic_model.visualize_topics()

In [None]:
# topic_model.visualize_barchart()

In [None]:
# df_bert = pd.DataFrame({"topic": topic, "documents": docs})

In [None]:
# df_bert

In [None]:
############################################################ gensim ##############################################################################

In [None]:
# Allegation texts of the full (unsplit) frame, used by the gensim section.
train_docs = og_df.allegation_desc

In [None]:
def lemmatization(descs, allowed_pos_tags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize each description, keeping only selected parts of speech.

    Parameters
    ----------
    descs : iterable of str
        Raw descriptions.
    allowed_pos_tags : collection of str, optional
        spaCy coarse POS tags to keep. The default is an immutable tuple, so
        it cannot be altered between calls; lists are still accepted.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input description, in order.
    """
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    # nlp.pipe processes the texts in batches, which avoids the per-call
    # overhead of invoking nlp() once per document.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in allowed_pos_tags)
        for doc in nlp.pipe(descs)
    ]

In [None]:
# Lemmatize the gensim section's own `train_docs` instead of the `docs` list
# left over from the BERTopic section. Both hold the same allegation texts.
lemmatized_texts = lemmatization(train_docs)

In [None]:
def gen_words(texts):
    """Tokenize each text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list with one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]

In [None]:
data_words = gen_words(lemmatized_texts)

In [None]:
# Phrase detectors. The trigram model is trained on the corpus after the
# bigram model has been applied to it.
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=50)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=50)

# Phraser wrappers around the trained models; these are what the
# make_bigrams / make_trigrams helpers apply.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the notebook-level ``bigram`` phraser to every token list."""
    return [bigram[tokens] for tokens in texts]

def make_trigrams(texts):
    """Apply the ``bigram`` phraser, then the ``trigram`` phraser, to every token list."""
    return [trigram[bigram[tokens]] for tokens in texts]

# Phrase-merged token lists; the dictionary and corpus below are built from
# data_bigrams_trigrams.
data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_words)

In [None]:
# Dictionary and bag-of-words corpus built from the phrase-merged tokens.
id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

train_corpus = [id2word.doc2bow(text) for text in texts]

tdidf = TfidfModel(train_corpus, id2word=id2word)

# Tokens whose TF-IDF weight within a document falls below this threshold are
# removed from that document's bag of words.
low_value = 0.03
# Log of every token string dropped, across all documents.
words = []
words_missing_in_tdif = []

for i, bow in enumerate(train_corpus):
    # Score the document once and reuse the result.
    weights = tdidf[bow]
    tdif_ids = {token_id for token_id, _ in weights}
    low_value_words = [token_id for token_id, value in weights if value < low_value]
    # Tokens present in the bag of words that the TF-IDF model did not score.
    # This is computed before logging so that `words` records this document's
    # tokens, not the previous document's.
    words_missing_in_tdif = [token_id for token_id, _ in bow if token_id not in tdif_ids]

    words.extend(id2word[token_id] for token_id in low_value_words + words_missing_in_tdif)

    dropped = set(low_value_words) | set(words_missing_in_tdif)
    train_corpus[i] = [entry for entry in bow if entry[0] not in dropped]

In [None]:
# id2word = corpora.Dictionary(data_words)

# corpus = []
# for text in data_words:
#     new = id2word.doc2bow(text)
#     corpus.append(new)

In [None]:
# Fit a 20-topic LDA model on the TF-IDF-filtered corpus with a fixed
# random_state, then save it to disk.
# NOTE(review): gensim_vecs() hard-codes the same topic count (range(20)) -
# keep the two in sync if num_topics changes.
gensim_model = gensim.models.ldamodel.LdaModel(corpus=train_corpus,
                                           id2word=id2word,
                                           num_topics=20,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                        #    per_word_topics=True,
                                           alpha="auto")
gensim_model.save('models/gensim_train.model')

In [None]:
# gensim_model.print_topics(5, num_words=20)[:10]

In [None]:
def gensim_vecs():
    """Build one feature vector per training document from the LDA model.

    Each vector holds the document's probability for every topic of the
    notebook-level ``gensim_model``, in topic-id order, followed by the length
    of the document text (``len(train_docs.iloc[i])``).

    Reads the notebook-level ``gensim_model``, ``train_corpus`` and
    ``train_docs``.

    Returns
    -------
    list of list
        ``len(train_docs)`` vectors, each of length ``num_topics + 1``.
    """
    # The topic count comes from the model, so it cannot drift from the
    # num_topics the model was trained with.
    num_topics = gensim_model.num_topics
    gensim_train_vecs = []
    for doc_idx in range(len(train_docs)):
        # Look probabilities up by topic id so the vector stays aligned even
        # if a topic is left out of the sparse result; absent topics get 0.0.
        topic_probs = dict(
            gensim_model.get_document_topics(train_corpus[doc_idx], minimum_probability=0.0)
        )
        topic_vec = [topic_probs.get(topic_id, 0.0) for topic_id in range(num_topics)]
        topic_vec.append(len(train_docs.iloc[doc_idx]))
        gensim_train_vecs.append(topic_vec)
    return gensim_train_vecs

In [None]:
# The vectors go into their own name, so the gensim_vecs() function is not
# overwritten (which made this cell fail on re-run) and the variable does not
# end up holding the None that to_csv returns when a path is given.
gensim_train_vecs = np.array(gensim_vecs())
pd.DataFrame(gensim_train_vecs).to_csv("vecs/gensim_train_vecs.csv", index=False)

In [None]:
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim_models.prepare(gensim_model, train_corpus, id2word, mds="mmds", R=30)
# vis