In [1]:
import pandas as pd
import json
import spacy
from sentence_transformers import SentenceTransformer
import gensim
import gensim.corpora as corpora
from gensim.models import TfidfModel
import numpy as np
from top2vec import Top2Vec

In [2]:
# Load the training and test splits from CSV; both paths are relative to the
# directory the notebook is run from.
training_data = pd.read_csv("training_data/training_data.csv")
test_data = pd.read_csv("test_data/test_data.csv")

In [3]:
# Load the previously exported training vectors as a list of row lists.
bert_train_vecs = pd.read_csv("vecs/bert_train_vecs.csv")
bert_train_vecs = bert_train_vecs.values.tolist()

# Plain Python list of the test descriptions, in the same row order as test_data.
bert_test_docs = test_data.allegation_desc.tolist()

# Embed the test descriptions with the sentence-transformer model.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
bert_test_vecs = sentence_model.encode(bert_test_docs, show_progress_bar=True)
bert_test_vecs = np.array(bert_test_vecs)

# DataFrame.to_csv returns None when given a path, so its result must not be
# assigned back to bert_test_vecs; keep the array in memory and only write the file.
pd.DataFrame(bert_test_vecs).to_csv("vecs/bert_test_vecs.csv", index=False)

Batches: 100%|██████████| 15/15 [00:11<00:00,  1.26it/s]


In [4]:
# Rows of the exported gensim training vectors, as nested Python lists.
gensim_train_vecs = pd.read_csv("vecs/gensim_train_vecs.csv").values.tolist()

# Previously saved LDA model, loaded from disk.
gensim_model = gensim.models.ldamodel.LdaModel.load("models/gensim_train.model")

# Raw test descriptions (a pandas Series) that the rest of this cell preprocesses.
gensim_test_docs = test_data["allegation_desc"]

def lemmatization(descs, allowed_pos_tags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Reduce each description to a space-joined string of lemmas.

    Args:
        descs: iterable of text strings to process.
        allowed_pos_tags: part-of-speech tags to keep; tokens whose ``pos_``
            is not in this collection are dropped.

    Returns:
        A list with one string per input description, in input order. A
        description with no matching tokens yields an empty string.
    """
    # The spaCy pipeline is loaded once per call, with parser and NER disabled.
    pipeline = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    def _lemmas_of(desc):
        kept = (tok.lemma_ for tok in pipeline(desc) if tok.pos_ in allowed_pos_tags)
        return " ".join(kept)

    return [_lemmas_of(desc) for desc in descs]

# Lemmatize every test description using lemmatization's default POS filter.
lemmatized_texts = lemmatization(gensim_test_docs)

def gen_words(texts):
    """Tokenize each text with gensim's ``simple_preprocess`` (``deacc=True``).

    Args:
        texts: iterable of strings.

    Returns:
        A list of token lists, one per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]

# Tokenize each lemmatized description into a list of words.
data_words = gen_words(lemmatized_texts)

# Fit bigram and trigram phrase detectors on the test tokens themselves.
# NOTE(review): no phrase models from training are loaded here, so the n-grams
# joined at test time may differ from those the LDA model was trained on — confirm.
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=50)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=50)

# Wrap the fitted phrase models in Phraser objects; make_bigrams and
# make_trigrams read these module-level names.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the module-level ``bigram`` phraser to every tokenized document.

    Args:
        texts: iterable of token lists.

    Returns:
        A list with one phrased document per input document, in input order.
    """
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    """Apply the ``bigram`` phraser and then the ``trigram`` phraser to each document.

    Both phrasers are read from module-level names.

    Args:
        texts: iterable of token lists.

    Returns:
        A list with one phrased document per input document, in input order.
    """
    return [trigram[bigram[doc]] for doc in texts]

data_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_words)

# The bag-of-words ids fed to gensim_model must come from the same token->id
# mapping the model was trained with. Building a fresh Dictionary from the test
# tokens assigns unrelated ids, so reuse the mapping stored on the loaded model.
id2word = getattr(gensim_model, "id2word", None)
if not hasattr(id2word, "doc2bow"):
    # The loaded model carries no usable Dictionary; fall back to a
    # test-only dictionary, but say so loudly because the ids will not line up.
    print("WARNING: gensim_model has no usable id2word dictionary; "
          "building one from the test data, topic vectors may be unreliable.")
    id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

# Tokens that id2word does not know are dropped by doc2bow.
test_corpus = [id2word.doc2bow(text) for text in texts]

# NOTE(review): TF-IDF weights are fit on the test corpus only; whether the
# training corpus was filtered with the same statistics is not visible here — confirm.
tdidf = TfidfModel(test_corpus, id2word=id2word)

low_value = 0.03
words = []  # tokens removed from any document, accumulated across the corpus
words_missing_in_tdif = []

for i in range(0, len(test_corpus)):
    bow = test_corpus[i]
    doc_tdidf = tdidf[bow]  # evaluate the TF-IDF transform once per document
    tdif_ids = {token_id for token_id, value in doc_tdidf}
    bow_ids = [token_id for token_id, value in bow]
    low_value_words = [token_id for token_id, value in doc_tdidf if value < low_value]
    # Compute the missing ids for THIS document before using them; doing it
    # after building `drops` would record the previous document's ids instead.
    words_missing_in_tdif = [token_id for token_id in bow_ids if token_id not in tdif_ids]
    drops = low_value_words + words_missing_in_tdif
    for item in drops:
        words.append(id2word[item])

    # Keep only entries that are neither low-scoring nor absent from the TF-IDF output.
    dropped_ids = set(drops)
    new_bow = [b for b in bow if b[0] not in dropped_ids]
    test_corpus[i] = new_bow

def get_test_vecs(num_topics=10):
    """Build one feature vector per test document from the LDA topic distribution.

    Reads the module-level ``gensim_model``, ``test_corpus`` and
    ``gensim_test_docs``.

    Args:
        num_topics: number of topic slots in each vector (topic ids
            ``0 .. num_topics - 1``). Defaults to 10.

    Returns:
        A list of lists. Each inner list holds ``num_topics`` topic
        probabilities ordered by topic id, followed by the length of the
        document's text.
    """
    gensim_test_vecs = []
    for doc_idx in range(len(gensim_test_docs)):
        doc_topics = gensim_model.get_document_topics(test_corpus[doc_idx], minimum_probability=0.0)
        # The result is a sparse list of (topic_id, probability) pairs and gensim
        # may omit very-low-probability topics even with minimum_probability=0.0,
        # so look values up by topic id instead of by list position.
        prob_by_topic = dict(doc_topics)
        topic_vec = [prob_by_topic.get(topic_id, 0.0) for topic_id in range(num_topics)]
        topic_vec.append(len(gensim_test_docs.iloc[doc_idx]))  # length of the description
        gensim_test_vecs.append(topic_vec)
    return gensim_test_vecs


# Build the topic-feature rows for the test set and convert them to an array.
gensim_test_vecs = get_test_vecs()
gensim_test_vecs = np.array(gensim_test_vecs)

# DataFrame.to_csv returns None when given a path, so its result must not be
# assigned back to gensim_test_vecs; keep the array and only write the file.
pd.DataFrame(gensim_test_vecs).to_csv("vecs/gensim_test_vecs.csv", index=False)

In [5]:
# Load the saved Top2Vec model and keep a reference to its document vectors
# (by the naming, these are the training-set vectors — confirm).
t2v_train_model = Top2Vec.load("models/top2vec_train_model")
t2v_train_vecs = t2v_train_model.document_vectors

def convert_test_data_to_list(data):
    """Return the values of ``data["allegation_desc"]`` as a plain list.

    Args:
        data: any mapping-like object (for example a DataFrame) whose
            ``"allegation_desc"`` entry is iterable.

    Returns:
        A new list containing the entries in iteration order.
    """
    return list(data["allegation_desc"])

t2v_test_docs = convert_test_data_to_list(test_data)

# NOTE(review): this constructs a new Top2Vec model from the test documents; the
# log output below the cell shows a fresh joint document/word embedding being
# created. The resulting vectors are therefore presumably not in the same space
# as t2v_train_vecs. Embedding the test documents with the already-loaded
# t2v_train_model (e.g. via its add_documents method) may be what is intended —
# TODO confirm before relying on these vectors alongside the training ones.
t2v_test_model = Top2Vec(t2v_test_docs, embedding_model_path="models/top2vec_train_model")
t2v_test_vecs = t2v_test_model.document_vectors

t2v_test_vecs = np.array(t2v_test_vecs)

# DataFrame.to_csv returns None when given a path, so its result must not be
# assigned back to t2v_test_vecs; keep the array and only write the file.
pd.DataFrame(t2v_test_vecs).to_csv("vecs/t2v_test_vecs.csv", index=False)

2023-02-02 11:05:22,558 - top2vec - INFO - Pre-processing documents for training
2023-02-02 11:05:22,613 - top2vec - INFO - Creating joint document/word embedding
2023-02-02 11:05:23,703 - top2vec - INFO - Creating lower dimension embedding of documents
2023-02-02 11:05:38,835 - top2vec - INFO - Finding dense areas of documents
2023-02-02 11:05:38,858 - top2vec - INFO - Finding topics
