# Topic Modeling on Tweets with Hate Speech

In [30]:
import numpy as np

from dataset.dataset import Dataset
from constants import *

from gensim.models import Nmf, TfidfModel
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Fixed seed meant for the stochastic steps of this notebook, so runs can be repeated.
RANDOM_SEED: int = 0

In [3]:
# Open the cleaned dataset with from_scratch=False and build it.
dataset = Dataset(
    full_data_path=CLEANED_DATASET_PATH,
    split_sizes=split_sizes_cleaned,
    from_scratch=False,
)
dataset.build()

# Features and labels of every split, fetched in train / val / test order.
X_train, Y_train = (
    dataset.get_features(split_type="train"),
    dataset.get_labels(split_type="train"),
)
X_val, Y_val = (
    dataset.get_features(split_type="val"),
    dataset.get_labels(split_type="val"),
)
X_test, Y_test = (
    dataset.get_features(split_type="test"),
    dataset.get_labels(split_type="test"),
)

Data loaded from dataset/cleaned_dataset.pkl


## TF-IDF + NMF using Gensim

In [15]:
# Convert to arrays so the training features can be filtered with a boolean mask.
X_train, Y_train = np.array(X_train), np.array(Y_train)

# Keep only the training rows whose label is 1 (presumably the hate class, going by the names).
X_train_hate = X_train[Y_train == 1]

# Whitespace-tokenise each selected tweet.
X_train_hate_tokenized = [tweet.split() for tweet in X_train_hate]

In [38]:
# Vocabulary built from the tokenised hate tweets.
dct = Dictionary(X_train_hate_tokenized)

# Bag-of-words vector for every tweet, in the same order as the tokenised list.
corpus = list(map(dct.doc2bow, X_train_hate_tokenized))

In [23]:
# Fit the TF-IDF weighting on the bag-of-words corpus.
model = TfidfModel(corpus=corpus)

# Re-weight each document individually, keeping the corpus order.
X_train_hate_tfidf = [model[bow] for bow in corpus]

In [53]:
# Fit NMF Model
# Pass RANDOM_SEED as random_state so the factorisation yields the same topics on every run.
# id2word is deliberately left unset: a later cell maps the reported term ids to tokens via `dct`.
nmf = Nmf(X_train_hate_tfidf, num_topics=10, random_state=RANDOM_SEED)

# Topic distribution of every tweet in the TF-IDF corpus.
hate_topics_per_tweet = nmf.get_document_topics(bow=X_train_hate_tfidf, normalize=True)

In [54]:
# Top 10 terms for each of the 10 topics, with normalised weights.
hate_topics = nmf.show_topics(
    num_topics=10,
    num_words=10,
    normalize=True,
)

In [60]:
# Ask the model for its topics with a u_mass coherence score each, 10 terms per topic.
topics_with_score = nmf.top_topics(
    corpus=X_train_hate_tfidf,
    texts=X_train_hate_tokenized,
    dictionary=dct,
    window_size=None,
    coherence='u_mass',
    topn=10,
    processes=-1,
)

In [77]:
# Replace each term id reported by top_topics with its token from `dct`.
# The comprehension walks whatever top_topics returned instead of assuming a
# fixed 10 x 10 grid, so changing num_topics or topn cannot cause an IndexError
# or leave entries unconverted.
# Run once per top_topics call: afterwards the second field of each pair holds a
# token, and int() on it would fail.
topics_with_score = [
    ([[weight, dct.get(int(term_id))] for weight, term_id in topic_terms], coherence)
    for topic_terms, coherence in topics_with_score
]
    

In [78]:
# Show the (topic terms, coherence score) pairs as the cell output.
topics_with_score

[([[0.032076789927225235, 'nognog'],
   [0.031182506455621317, 'ad'],
   [0.016756622746141626, 'tv'],
   [0.016321195907105897, 'pandak'],
   [0.012547773072852257, 'binay'],
   [0.011298152539111221, 'nung'],
   [0.011259820641336668, 'campaign'],
   [0.00902197463244003, 'tangina'],
   [0.008907872594437889, 'ads'],
   [0.0076647133151455035, 'hahahaha']],
  -4.426310562091763),
 ([[0.03420426970510757, 'commercial'],
   [0.019761264829294408, 'nako'],
   [0.019159798423454028, 'hay'],
   [0.014745585400393781, 'hahaha'],
   [0.014201001220966975, 'puta'],
   [0.011350157757941966, 'ha'],
   [0.010143296159668731, 'nognog'],
   [0.010018815754758235, 'binay'],
   [0.008113258725498472, 'grabe'],
   [0.008099476277440518, 'vp']],
  -5.115252430149921),
 ([[0.023018250901399306, 'talaga'],
   [0.009781646169791111, 'mo'],
   [0.009459497811197384, 'mar'],
   [0.009099935512251148, 'eh'],
   [0.008335307016861063, 'po'],
   [0.007319312629563542, 'oh'],
   [0.007105989659676381, 'bobo'