# Topic Modeling - HateEval EN Training

## Data Cleaning 
- Tokenization
- Remove Stop Words
- Remove special characters - "?", "." , ".", "!", "*", ";", ":", "-"
- Lemmatization

In [1]:
# Importing the required libraries
import spacy
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wordnet
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/diptanu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/diptanu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Tokenize the tweets using Spacy English Tokenizer
import re
def tokenize(text, parser=English()):
    word_tokens_list = []
    tokens = parser(text)
    for token in tokens:
        # print(str(token.orth_).)
        if token.orth_.startswith('@') or token.orth_.isspace():
            continue
        elif token.like_url:
            word_tokens_list.append('URL')
        else:
            word_tokens_list.append(token.lower_)
    return word_tokens_list

In [3]:
# Remove the stopwords
def english_stop_words():
    stop_words_set = set(nltk.corpus.stopwords.words('english'))
    # Add common punctuations to remove
    stop_words_set.update(["?", "." , ".", "!", "*", ";", ":", "-"])
    return stop_words_set

In [4]:
# Lemmatize the text
def lemmatize(word):
    lemma = wordnet.morphy(word)
    if lemma is None:
        return word
    return lemma

In [5]:
# Calling all the methods to clean the data
def clean_data(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if token not in english_stop_words()]
    tokens = [lemmatize(token) for token in tokens]
    return tokens

### Import the Dataset and Apply cleaning steps

In [7]:
# Apply data cleaning steps on OLID - Training Dataset
import csv as csv
text_data = []
with open('hateval2019_en_train.csv') as input_file:
    csv_reader = csv.reader(input_file, delimiter=',')
    for row in csv_reader:
        if (row[2] == '1'):
            tokens = clean_data(row[1])
            text_data.append(tokens)

## Topic Modeling on the dataset

In [9]:
# Importing the required libraries
import gensim
from gensim import corpora

# Number of topics
num_of_topics = 10

In [10]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [11]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_of_topics, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.091*"#" + 0.012*"(" + 0.011*")" + 0.008*"buildthatwall" + 0.006*"n\'t" + 0.006*"URL" + 0.005*"word" + 0.005*"," + 0.005*"kitchen" + 0.004*"day"')
(1, '0.168*"#" + 0.020*"," + 0.019*"illegal" + 0.017*"URL" + 0.013*"buildthatwall" + 0.011*"immigration" + 0.010*"maga" + 0.009*"trump" + 0.008*"buildthewall" + 0.008*"immigrant"')
(2, '0.017*"refugee" + 0.012*"URL" + 0.011*"," + 0.010*"name" + 0.009*"get" + 0.009*"üíØ" + 0.008*"ya" + 0.008*"stop" + 0.007*"üòò" + 0.007*"want"')
(3, '0.050*"," + 0.022*"URL" + 0.015*"‚Äú" + 0.014*"n\'t" + 0.014*"‚Äù" + 0.012*"refugee" + 0.010*"rape" + 0.009*"people" + 0.008*"muslim" + 0.008*"woman"')
(4, '0.046*"," + 0.016*"URL" + 0.010*"go" + 0.008*"man" + 0.008*"one" + 0.008*"&" + 0.008*"refugee" + 0.007*"country" + 0.007*"#" + 0.007*"home"')
(5, '0.061*"#" + 0.033*"\'" + 0.030*"womensuck" + 0.016*"," + 0.014*"n\'t" + 0.010*"guy" + 0.009*"girl" + 0.009*"\'s" + 0.007*"get" + 0.007*"&"')
(6, '0.051*"bitch" + 0.037*"hoe" + 0.031*"üòÇ" + 0.024*"ass" + 0

In [12]:
# Visualize the topics
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)