# Topic Modeling - Davidson Data (Hate & Offensive)

## Data Cleaning 
- Tokenization
- Remove Stop Words
- Remove special characters - "?", ".", "!", "*", ";", ":", "-"
- Lemmatization

In [1]:
# Importing the required libraries
import spacy
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wordnet
# Fetch the NLTK corpora this notebook reads: 'stopwords' supplies the English
# stop-word list and 'wordnet' backs the morphy-based lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/diptanu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/diptanu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Tokenize the tweets using Spacy English Tokenizer
import re
def tokenize(text, parser=English()):
    """Split ``text`` into a list of normalized token strings.

    Tokens whose raw form starts with '@' or '&#', or that consist only of
    whitespace, are dropped. Tokens the parser flags as URL-like are replaced
    by the literal placeholder 'URL'; every other token is lower-cased.

    Args:
        text: The raw string to tokenize.
        parser: Callable returning an iterable of spaCy-style tokens. The
            default pipeline is created once, when this function is defined,
            and reused on every call.

    Returns:
        A list of strings, in the order the tokens appear in ``text``.
    """
    kept = []
    for tok in parser(text):
        raw = tok.orth_
        # Skip '@'-prefixed tokens, '&#'-prefixed tokens (presumably HTML
        # numeric character references) and pure whitespace.
        if raw.startswith(('@', '&#')) or raw.isspace():
            continue
        kept.append('URL' if tok.like_url else tok.lower_)
    return kept

In [3]:
# Build the set of stop words (and punctuation tokens) to remove
def english_stop_words():
    """Return the tokens to discard during cleaning, as a set.

    The set contains NLTK's English stop-word list plus a handful of common
    punctuation tokens. A new set is built on every call.
    """
    punctuation = ("?", ".", "!", "*", ";", ":", "-")
    return set(nltk.corpus.stopwords.words('english')).union(punctuation)

In [4]:
# Lemmatize the text
def lemmatize(word):
    """Return the WordNet base form of ``word``.

    ``wordnet.morphy`` yields ``None`` when it has no base form for the
    word; in that case the word is returned unchanged.
    """
    base_form = wordnet.morphy(word)
    return word if base_form is None else base_form

In [5]:
# Calling all the methods to clean the data
def clean_data(text):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: tokenize, drop stop words / punctuation tokens,
    lemmatize what remains.

    Args:
        text: The raw string to clean.

    Returns:
        A list of cleaned token strings.
    """
    # Build the stop-word set once per call. Calling english_stop_words()
    # inside the comprehension condition would reload the NLTK word list and
    # rebuild the set for every single token.
    stop_words = english_stop_words()
    return [
        lemmatize(token)
        for token in tokenize(text)
        if token not in stop_words
    ]

### Import the Dataset and Apply cleaning steps

In [6]:
# Apply data cleaning steps on the Davidson hate/offensive dataset
import csv as csv
text_data = []
# newline='' lets the csv module itself interpret line breaks, which is
# required for quoted fields that contain embedded newlines; the encoding is
# made explicit so the result does not depend on the platform default.
with open('davidson_hate_off.csv', newline='', encoding='utf-8') as input_file:
    csv_reader = csv.reader(input_file, delimiter=',')
    for row in csv_reader:
        # Blank or truncated rows have no columns 5/6; skip them instead of
        # failing with IndexError.
        if len(row) < 7:
            continue
        # Keep only rows whose column 5 is '0' or '1' (presumably the class
        # label: hate / offensive - confirm against the dataset). Any row
        # with another value there, such as a header row, is ignored.
        if row[5] in ('0', '1'):
            # Column 6 is passed to the cleaner (presumably the tweet text).
            text_data.append(clean_data(row[6]))

## Topic Modeling on the dataset

In [7]:
# Importing the required libraries
import gensim
from gensim import corpora

# Number of topics the LDA model is asked to extract (passed as num_topics)
num_of_topics = 10

In [8]:
# Vocabulary built from all cleaned documents
dictionary = corpora.Dictionary(text_data)
# One bag-of-words representation per cleaned document
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in text_data]

In [9]:
# Fit the LDA topic model on the bag-of-words corpus
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_of_topics,
    id2word=dictionary,
    passes=15,
)
# Print every topic as an (id, '<weight>*"<word>" + ...') pair, limited to
# the 10 highest-weighted words per topic
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.076*""" + 0.057*"bitch" + 0.047*"rt" + 0.041*"," + 0.020*"\'s" + 0.018*"hoe" + 0.017*"like" + 0.013*"n\'t" + 0.012*"say" + 0.011*"\'m"')
(1, '0.121*"n\'t" + 0.060*"bitch" + 0.057*"ai" + 0.048*"u" + 0.046*"hoe" + 0.037*"nt" + 0.027*"ca" + 0.021*"shit" + 0.020*"nigga" + 0.019*"rt"')
(2, '0.031*"smh" + 0.023*"throw" + 0.019*"nigger" + 0.017*"days" + 0.016*"watch" + 0.016*"nigguh" + 0.015*"run" + 0.014*"thought" + 0.013*"remember" + 0.010*"dey"')
(3, '0.071*"bitch" + 0.048*"get" + 0.040*"hoe" + 0.036*"nigga" + 0.028*"," + 0.025*"rt" + 0.015*"go" + 0.015*"ass" + 0.013*"ya" + 0.012*"shit"')
(4, '0.049*"bitch" + 0.036*"rt" + 0.025*"\'re" + 0.014*"think" + 0.014*"URL" + 0.012*"cunt" + 0.012*"dyke" + 0.011*"never" + 0.010*"cheat" + 0.010*"\'ll"')
(5, '0.043*"faggot" + 0.017*"da" + 0.015*"happy" + 0.014*"n" + 0.011*"nicca" + 0.010*"niccas" + 0.010*"dat" + 0.009*"..." + 0.009*"wear" + 0.009*"pop"')
(6, '0.182*"#" + 0.156*"&" + 0.065*"rt" + 0.053*"bitch" + 0.034*"URL" + 0.034*"_" + 0.024*"h

### Visualize the topics and words

In [10]:
# Visualize the topics
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` - confirm against the installed version.
import pyLDAvis.gensim
# Prepare the visualization data from the trained model, the bag-of-words
# corpus and the dictionary. sort_topics=False presumably keeps the topic
# numbering aligned with the model's own topic ids - TODO confirm.
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
# Must stay the last expression of the cell so the notebook renders its result.
pyLDAvis.display(lda_display)