Running boilerplate LDA models from gensim for topic modeling.
Inspired by: https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

In [1]:
import nltk
import spacy
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import gensim
from gensim.utils import simple_preprocess

# Implicit Hate Dataset

In [2]:
# load dataset and tokenize
df = pd.read_csv('./../../data/implicithate_train.csv')
df['tokenized'] = df['text'].apply(lambda x: simple_preprocess(x))
df.head()

Unnamed: 0,text,label,tokenized
0,tax reform is why republicans have not turned ...,0,"[tax, reform, is, why, republicans, have, not,..."
1,speak out white people ! the most offensive & ...,0,"[speak, out, white, people, the, most, offensi..."
2,this is why i voted for trump ! betsy devos & ...,0,"[this, is, why, voted, for, trump, betsy, devo..."
3,podcast : the left is lashing out . white babi...,0,"[podcast, the, left, is, lashing, out, white, ..."
4,you don't believe spencer wants a 100 % white ...,0,"[you, don, believe, spencer, wants, white, us]"


In [3]:
# combine all the tokenized texts into one list
# all_tokenized_text = []
# for i in range(len(df)):
#     all_tokenized_text.extend(df['tokenized'][i])
# print(len(all_tokenized_text))

In [4]:
# build bigram and trigram models
bigram = gensim.models.Phrases(df['tokenized'], min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[df['tokenized']], threshold=100)
bigram_maker = gensim.models.phrases.Phraser(bigram)
trigram_maker = gensim.models.phrases.Phraser(trigram)

In [5]:
# remove stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
all_tokenized_text = [[word for word in doc if word not in stop_words] for doc in df['tokenized']]

In [6]:
def lemmatization(texts, allowed_postags = None):
    if allowed_postags is None:
        allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [7]:
# form bigrams
all_tokenized_text_bi = [bigram_maker[doc] for doc in all_tokenized_text]
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
all_tokenized_text_bi = lemmatization(all_tokenized_text_bi)
# drop all the empty lists and drop the rows with empty lists in the df
# all_tokenized_text_bi = [x for x in all_tokenized_text_bi if x != []]
# df = df[df['tokenized'] != []]
print(all_tokenized_text_bi[:10])

[['tax', 'reform', 'republican', 'turn', 'analysis', 'gregory', 'krieg'], ['speak', 'white', 'people', 'offensive', 'racist', 'comment', 'trump', 'people', 'get', 'bho', 'fan'], ['vote', 'try', 'change', 'thing', 'leave', 'stop'], ['podcast', 'leave', 'lash', 'white', 'baby', 'hatsanything', 'trigger', 'anti', 'white', 'animus'], ['believe', 'spencer', 'want', 'white'], ['new', 'boogieman', 'financial', 'threat', 'pull', 'petrodollar', 'immigration', 'white', 'hate', 'west'], ['crime'], ['imagine', 'memory', 'form', 'starve', 'cold', 'violently', 'abuse', 'last', 'lifetime'], ['marketplace', 'idea', 'girl', 'original', 'offer', 'offer', 'brief', 'dopamine', 'spurt', 'occasional', 'flirtation', 'assume', 'internet', 'act', 'accordingly'], ['see', 'question', 'make', 'nation', 'answer', 'accord', 'tiny', 'mind', 'white', 'neanderthal', 'scope', 'way', 'make', 'feel', 'good', 'go', 'ahead', 'shout', 'roof']]


In [8]:
dictionary = gensim.corpora.Dictionary(all_tokenized_text_bi)
corpus = [dictionary.doc2bow(doc) for doc in all_tokenized_text_bi]

In [9]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, random_state=100, update_every=1, chunksize=100, passes=10, per_word_topics=True)
lda.save('../../saved-models/lda_implicit.model')

In [10]:
from pprint import pprint
# Print the Keyword in the 10 topics
pprint(lda.print_topics(num_topics=10, num_words=10))

[(0,
  '0.140*"white" + 0.093*"people" + 0.051*"race" + 0.045*"racist" + '
  '0.034*"black" + 0.029*"go" + 0.023*"think" + 0.022*"know" + '
  '0.016*"supremacist" + 0.013*"lie"'),
 (1,
  '0.052*"woman" + 0.043*"call" + 0.034*"group" + 0.027*"lol" + 0.025*"racist" '
  '+ 0.023*"child" + 0.022*"start" + 0.020*"non" + 0.020*"actually" + '
  '0.015*"watch"'),
 (2,
  '0.056*"get" + 0.054*"kill" + 0.047*"man" + 0.035*"back" + 0.028*"time" + '
  '0.028*"immigration" + 0.024*"tell" + 0.021*"video" + 0.019*"live" + '
  '0.016*"nationalism"'),
 (3,
  '0.057*"anti" + 0.040*"look" + 0.036*"long" + 0.026*"work" + 0.020*"find" + '
  '0.020*"leave" + 0.020*"much" + 0.018*"still" + 0.018*"tcot" + '
  '0.018*"today"'),
 (4,
  '0.084*"say" + 0.063*"want" + 0.026*"talk" + 0.022*"history" + '
  '0.021*"believe" + 0.019*"law" + 0.019*"fail" + 0.018*"arrest" + '
  '0.017*"help" + 0.017*"report"'),
 (5,
  '0.039*"make" + 0.038*"country" + 0.032*"come" + 0.028*"well" + '
  '0.027*"identity" + 0.022*"try" + 0.

In [11]:
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda, texts=all_tokenized_text_bi, dictionary=dictionary, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score:', coherence_lda)

Coherence Score: -10.519641218370976


UMass Measure: https://mimno.infosci.cornell.edu/papers/mimno-semantic-emnlp.pdf

In [12]:
import pickle 
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(lda, corpus, dictionary)
LDAvis_prepared


  default_term_info = default_term_info.sort_values(


Now we can use the LDA model as part of a classification task.

In [13]:
train_vectors = []
for i in range(len(corpus)):
    top_topics = lda.get_document_topics(corpus[i], minimum_probability=0.0)
    train_vectors.append([top_topics[i][1] for i in range(10)])

In [14]:
X = np.array(train_vectors)
y = np.array(df['label'])
# run an SVM classifier
from sklearn.svm import LinearSVC
svm = LinearSVC(tol=1e-3, class_weight="balanced").fit(X, y)

In [15]:
val_df = pd.read_csv('./../../data/implicithate_val.csv')
val_df['tokenized'] = val_df['text'].apply(lambda x: simple_preprocess(x))
bigram_val = gensim.models.Phrases(val_df['tokenized'], min_count=5, threshold=100)
bigram_maker = gensim.models.phrases.Phraser(bigram_val)

In [16]:
# remove stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
val_df['tokenized'] = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in val_df['tokenized']]

In [17]:
val_bi = [bigram_maker[doc] for doc in val_df['tokenized']]
val_bi = lemmatization(val_bi)

In [18]:
val_dictionary = gensim.corpora.Dictionary(val_bi)
val_corpus = [val_dictionary.doc2bow(doc) for doc in val_bi]

In [19]:
val_vectors = []
for i in range(len(val_corpus)):
    top_topics = lda.get_document_topics(val_corpus[i], minimum_probability=0.0)
    val_vectors.append([top_topics[i][1] for i in range(10)])

In [20]:
# get an F1 score for svm predictions on the validation set
from sklearn.metrics import classification_report
predictions = svm.predict(val_vectors)
print(classification_report(val_df['label'], predictions, labels=[0, 1], target_names=['non-hate', 'hate']))

              precision    recall  f1-score   support

    non-hate       0.62      0.64      0.63      2150
        hate       0.36      0.34      0.35      1284

    accuracy                           0.53      3434
   macro avg       0.49      0.49      0.49      3434
weighted avg       0.52      0.53      0.52      3434

