In [53]:
import pandas as pd
import sys
from top2vec import Top2Vec
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import punkt
from nltk.stem import WordNetLemmatizer
from collections import Counter
# import umap.umap_ as umap
# import umap.plot
nltk.download("stopwords")
import numpy as np
import json
import glob
import gensim
import gensim.corpora as corpora 
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel
import spacy
from nltk.corpus import stopwords
import pyLDAvis
import pyLDAvis.gensim_models
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from lib.clean import split_rows_with_multiple_labels
from lib.gensim import generate_corpus, generate_gensim_vecs

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
def read_data(path="../data/classification/data/complaint_classification_labeled_data_1_31_2023.csv"):
    """Load the labeled complaint-classification CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the labeled data set this
        notebook was written against, so ``read_data()`` keeps working.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(path)

In [55]:
# Load the labeled complaint data into the notebook-wide `df`.
df = read_data()

In [56]:
################################################### preprocess ##################################################################

In [57]:
def clean_labels(df):
    """Return a copy of ``df`` with a normalized ``label`` column.

    Normalization steps, applied in order:

    1. lower-case and strip surrounding whitespace;
    2. replace every ``#`` with ``/``;
    3. rewrite ``internal misconduct/administrative infraction`` as
       ``internal misconduct; administrative infraction``.

    The input frame is left untouched, so re-running the calling cell always
    starts from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``label`` column (missing values are preserved).

    Returns
    -------
    pandas.DataFrame
        New frame with the cleaned ``label`` column.
    """
    cleaned = (
        df["label"]
        .str.lower()
        .str.strip()
        .str.replace("#", "/", regex=False)
        # The pattern has no regex metacharacters, so a literal replace is equivalent.
        .str.replace(
            "internal misconduct/administrative infraction",
            "internal misconduct; administrative infraction",
            regex=False,
        )
    )
    return df.assign(label=cleaned)

In [58]:
# Normalize the labels, then hand the frame to the project helper
# `split_rows_with_multiple_labels` (presumably one output row per label — confirm in lib.clean).
df = df.pipe(clean_labels).pipe(split_rows_with_multiple_labels)

In [59]:
def extract_targets(df):
    """Build the binary ``target`` column and rename ``text`` to ``allegation_desc``.

    A row is *positive* (``target == "1"``) when its lower-cased, stripped
    label contains ``internal misconduct; administrative infraction``.
    A row is *negative* (``target == "0"``) when anything is left of the label
    after that phrase is removed. Rows whose label is missing or empty are
    dropped. A label that holds the phrase plus other text therefore appears
    twice, once in each class.

    The result lists all positive rows first, then all negative rows, each in
    their original order and with their original index values. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``label`` column and, optionally, a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        Labeled rows with a string ``target`` column (``"1"`` or ``"0"``).
    """
    phrase = "internal misconduct; administrative infraction"
    labels = df["label"].str.lower().str.strip()

    # Positive: the phrase occurs anywhere in the label.
    is_positive = labels.str.contains(phrase, regex=False, na=False)
    # Negative: something other than the phrase is present in the label.
    leftover = labels.str.replace(phrase, "", regex=False)
    is_negative = leftover.fillna("") != ""

    # Boolean indexing returns new frames, so assigning the constant class
    # value never writes through to the caller's DataFrame.
    positives = df.loc[is_positive].assign(target="1")
    negatives = df.loc[is_negative].assign(target="0")

    combined = pd.concat([positives, negatives], axis=0)
    return combined.rename(columns={"text": "allegation_desc"})

In [60]:
# Attach the binary `target` column and rename `text` to `allegation_desc`.
df = df.pipe(extract_targets)

In [61]:
def split_data(df, test_size=0.3, random_state=100):
    """Split ``df`` into train and test partitions with a fixed seed.

    The split is seeded so that re-running the notebook writes the same
    training and test files each time.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; defaults to the 30% hold-out the
        notebook has always used.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. Pass ``None`` to get a
        different shuffle on every call.

    Returns
    -------
    tuple
        ``(training_data, test_data, df)``. The third item is the unsplit
        input, returned unchanged.
    """
    training_data, test_data = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    return training_data, test_data, df

In [62]:
# Split once; `og_df` is the unsplit frame that the topic-model cells below read.
training_data, test_data, og_df = split_data(df)

# Write both partitions as CSV without the index column (relative paths).
training_data.to_csv("../data/classification/training_data/training_data.csv", index=False)
test_data.to_csv("../data/classification/test_data/test_data.csv", index=False)

In [63]:
################################################### top2vec ##################################################################

In [64]:
def create_model(df):
    """Train a Top2Vec model on the ``allegation_desc`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``allegation_desc`` column holds the documents.

    Returns
    -------
    Top2Vec
        The model constructed from those documents.
    """
    # Top2Vec receives the documents as a plain Python list.
    documents = list(df["allegation_desc"])

    settings = {
        "ngram_vocab": True,
        "speed": "deep-learn",
        "use_embedding_model_tokenizer": True,
        "min_count": 5,
    }
    return Top2Vec(documents, **settings)

In [65]:
def model():
    """Build the Top2Vec model from the notebook-level ``og_df`` frame."""
    return create_model(og_df)

In [66]:
# NOTE(review): this rebinds the name `model` from the factory function to the
# object it returns, so running this cell a second time calls that object
# instead of the factory. TODO: rename either the function or the variable.
model = model()

2023-02-09 15:18:20,178 - top2vec - INFO - Pre-processing documents for training
2023-02-09 15:18:20,305 - top2vec - INFO - Creating joint document/word embedding
2023-02-09 15:18:52,677 - top2vec - INFO - Creating lower dimension embedding of documents
2023-02-09 15:18:58,379 - top2vec - INFO - Finding dense areas of documents
2023-02-09 15:18:58,441 - top2vec - INFO - Finding topics


In [67]:
# Persist the trained Top2Vec model (relative path).
model.save("models/top2vec_train_model")

In [68]:
############################################################ bertopic #################################################################

In [69]:
# Documents for the BERTopic section: the `allegation_desc` column of the unsplit frame.
docs = og_df.allegation_desc

In [70]:
# Round-trip the column through JSON so `docs` becomes a plain Python list
# instead of a pandas Series.
docs = json.loads(docs.to_json(orient='records'))

In [71]:
# Encode every document with the MiniLM sentence encoder and store the vectors.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = np.array(sentence_model.encode(docs, show_progress_bar=True))
# `to_csv` returns None when given a path, so its result must not be assigned
# back to `embeddings`; the array stays available to later cells.
pd.DataFrame(embeddings).to_csv("vectors/bert_train_vecs.csv", index=False)

Batches: 100%|██████████| 49/49 [00:19<00:00,  2.57it/s]


In [72]:
# Topic words are counted with English stop words removed.
vectorizer_model = CountVectorizer(stop_words="english")
# NOTE(review): confirm the installed BERTopic version accepts the `diversity` keyword.
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", vectorizer_model=vectorizer_model, diversity=0.2)

In [73]:
# Fit BERTopic on the document list and keep both values that `fit_transform` returns.
topic, probs = topic_model.fit_transform(docs)

In [None]:
# Persist the fitted BERTopic model (relative path).
topic_model.save("models/bert_train_model")

In [None]:
############################################################ gensim ##############################################################################

In [None]:
# Rebind `docs` to the raw `allegation_desc` Series for the gensim section
# (the BERTopic section above left it as a Python list).
docs = og_df.allegation_desc

In [None]:
# Project helper from lib.gensim; its two results are passed to LdaModel below
# as `corpus` and `id2word`.
train_corpus, id2word = generate_corpus(docs)

In [None]:
# Train a 20-topic LDA model on the corpus; `random_state=100` fixes the seed.
gensim_model = gensim.models.ldamodel.LdaModel(corpus=train_corpus,
                                           id2word=id2word,
                                           num_topics=20,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [None]:
# Persist the trained LDA model (relative path).
gensim_model.save('models/gensim_train.model')

In [None]:
# Build the per-document vectors with the project helper and store them as CSV.
gensim_vecs = np.array(generate_gensim_vecs(docs, gensim_model, train_corpus))
# `to_csv` returns None when given a path, so its result must not be assigned
# back to `gensim_vecs`; the array stays available after the file is written.
pd.DataFrame(gensim_vecs).to_csv("vectors/gensim_train_vecs.csv", index=False)