In [15]:
from models.nmf_model import NMFModel
from models.lda_model import LDAModel
from models.berttopic_model import BERTopicModel
from utils.preprocessing import DataPreprocessor
from utils.data_structures import InputData
from utils.visualizer import visualise_topics_overtime

import pandas as pd
# Load the AskMen dataset from CSV; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="data/askmen_df.csv")


In [16]:
# Configure the project's text preprocessor. The keyword names (including the
# `lematize` spelling) are the ones this notebook passes to DataPreprocessor.
# NOTE(review): presumably this enables lemmatisation, disables stemming and
# sets a 3-character word-length threshold — confirm in utils.preprocessing.
dp = DataPreprocessor(lematize=True, stem=False, min_word_len=3)

# Bare expression so the notebook renders the first rows of the loaded frame.
df.head()

Unnamed: 0,title,score,num_comments,selftext,created
0,Guys who don’t re-rack their weights at the gy...,3,15,,2020-01-02
1,What are some of your favorite lesser knowing ...,3,5,Known *,2020-01-02
2,"Men dating accomplished women, how do you deal...",5502,1011,I had someone tell me the fact that I'm a scie...,2020-01-02
3,How to try out gay relationship?,1,6,"So, I admitted to myself that I am at least bi...",2020-01-02
4,How did you get your first black eye?,1,24,,2020-01-02


In [17]:
# Run the preprocessor over the "title" and "selftext" columns; "processed_text"
# is passed as the destination column name.
# NOTE(review): `df` is rebound to the result, so re-running this cell feeds the
# already-processed frame back into the preprocessor.
df = dp.preprocess_dataframe(
    df,
    text_column=["title", "selftext"],
    dest_column="processed_text",
    remove_empty_rows=True,
)

In [18]:
# Build the project's InputData container and populate it from the
# "processed_text" column of the frame.
inputdata = InputData()
inputdata.texts_from_df(df, column="processed_text")

In [19]:
# Instantiate each topic model with its default arguments. The list order
# matters: index 0 is the NMF model and index 1 is the LDA model.
models = [model_cls() for model_cls in (NMFModel, LDAModel)]

# Collector for the outputs returned by the fitted models.
output = list()

In [20]:
# Fit every model on the shared input data and collect what get_output() returns.
# `output` is reset here so that re-running this cell yields exactly one entry
# per model instead of appending duplicates to a list created in another cell.
output = []
for model in models:
    model.fit(inputdata)
    output.append(model.get_output())

In [21]:
# Save the first entry of `models` (the NMFModel instance) under "nmfmodel.obj"
# using the model's own save() method.
models[0].save("nmfmodel.obj")

In [22]:
# Save the second entry of `models` (the LDAModel instance) under "ldamodel.obj"
# using the model's own save() method.
models[1].save("ldamodel.obj")

In [23]:
# Named alias for the second entry of `models` (the LDAModel instance).
ldamodel = models[1]

In [29]:
from metrics.coherence_metric import *
from metrics.diversity_metric import *
from metrics.significance_metric import *
from metrics.similarity_metric import *

# Row labels for the score table; must be in the same order as `models`.
model_names = ["NMF", "LDA"]

# One instance of every metric to evaluate. The order of this list defines the
# order of the values in each row of `scores`.
metrics = [
    KLUniformMetric(),
    KLBackgroundMetric(),
    RBOMetric(),
    WordEmbeddingPairwiseSimilarityMetric(),
    WordEmbeddingCentroidSimilarityMetric(),
    PairwiseJacckardSimilarityMetric(),
    UMassCoherenceMetric(),
    CVCoherenceMetric(),
    CUCICoherenceMetric(),
    CNPMICoherenceMetric(),
    WECoherencePairwiseMetric(),
    WECoherenceCentroidMetric(),
    TopicDiversityMetric(),
    InvertedRBOMetric(),
    LogOddsRatioMetric(),
    WordEmbeddingsInvertedRBOMetric(),
    WordEmbeddingsInvertedRBOCentroidMetric()
]

# Display names taken from each metric's `name` attribute.
metric_names = [metric.name for metric in metrics]

# Fail before evaluating anything if the labels and the models are out of sync.
assert len(model_names) == len(models), "model_names must label every entry of models"

# scores[i][j] is the value of metrics[j] evaluated on models[i].
scores = []

for model in models:
    print("starting next model")
    # Fetch the model output once per model rather than once per metric.
    model_output = model.get_output()
    scores.append([metric.evaluate(inputdata, model_output) for metric in metrics])




starting next model
[[1.42559158 1.15384741 0.9895627  0.921887   0.88188083 0.83919344
  0.83416632 0.80050401 0.65779603 0.63694649]
 [2.88057802 1.85764247 0.22069899 0.21271024 0.18056423 0.13952386
  0.13484399 0.12604011 0.10868713 0.106628  ]
 [2.45259224 1.35816831 0.69648774 0.57090428 0.53366092 0.48822916
  0.28241637 0.26940128 0.25222523 0.24861497]
 [2.78597239 0.31571064 0.30575249 0.29155686 0.28345414 0.25397856
  0.25076605 0.23655785 0.22767412 0.16086614]
 [2.73262792 0.95616006 0.40306016 0.37797696 0.24414762 0.23863779
  0.22603728 0.22313286 0.19664107 0.16547473]]
starting next model
[[0.02230713 0.01455092 0.00853271 0.00835876 0.0069432  0.00639937
  0.00613639 0.00588786 0.00569172 0.00564007]
 [0.07062649 0.01452763 0.01368842 0.01187118 0.01152927 0.01060639
  0.01045204 0.0090241  0.00815454 0.00783053]
 [0.01491459 0.01394992 0.01297728 0.01234419 0.01210074 0.01031286
  0.0096725  0.00952605 0.00754175 0.00646536]
 [0.03814893 0.01796398 0.01210118 0.01

In [32]:
# Assemble the score table: one row per model name, one column per metric name.
scores_df = pd.DataFrame(data=scores, index=model_names, columns=metric_names)

# Bare expression so the notebook renders the table.
scores_df

Unnamed: 0,KL Uniform,KL Background,RBO,Word Embedding Pairwise Similarity,Word Embedding Centroid Similarity,Pairwise Jacckard Similarity,UMass Coherence,CV Coherence,CUCI Coherence,CNPMI Coherence,WE Pairwise Coherence,WE Centroid Coherence,Topic Diversity,Inverted RBO,Log Odds Ratio,Word Embeddings Inverted RBO,Word Embeddings Inverted RBO Centroid
NMF,0.473826,0.691178,0.033807,0.162493,0.586106,0.037427,-2.355675,0.511881,0.273828,0.040624,0.005662,0.921946,0.86,0.966193,0.734598,0.408867,0.835784
LDA,0.140819,0.687559,0.039904,0.151579,0.559819,0.036898,-3.353137,0.459602,-1.046344,-0.020711,0.007506,0.901096,0.78,0.960096,0.264136,0.412916,0.837217


In [34]:
# Write the score table to CSV in the working directory; the index (model names)
# is written as the first column because to_csv keeps the index by default.
scores_df.to_csv(path_or_buf="scores_df.csv")

In [35]:
import pickle

# Serialise the InputData object to "inputdata.obj" in the working directory.
# Pickle files can execute code when loaded, so only unpickle files you trust.
with open("inputdata.obj", mode="wb") as f:
    pickle.dump(inputdata, file=f)