In [1]:
# Standard Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Better Profanity
# from better_profanity import profanity # Censors text

# Gensim
from gensim import matutils
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# NLTK
from nltk.corpus import stopwords

# pyLDAvis
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

# re
import re

# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Scipy
import scipy.sparse

# spaCy
import spacy
# nlp = spacy.load('en_core_web_sm')

# Silence Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)



In [4]:
# Load the two comment datasets (paths are relative to the notebook's directory).
neutral = pd.read_csv('../data/neutral_sample.csv')
toxic = pd.read_csv('../data/toxic.csv')

In [5]:
# Column names, non-null counts and dtypes of the neutral frame.
neutral.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16225 entries, 0 to 16224
Data columns (total 22 columns):
toxic                  16225 non-null int64
severe_toxic           16225 non-null int64
obscene                16225 non-null int64
threat                 16225 non-null int64
insult                 16225 non-null int64
identity_hate          16225 non-null int64
neutral                16225 non-null int64
comment_text           16225 non-null object
cleaner_text           16225 non-null object
compound               16225 non-null float64
neg                    16225 non-null float64
neu                    16225 non-null float64
pos                    16225 non-null float64
word count             16225 non-null int64
character count        16225 non-null int64
special characters     16225 non-null int64
capitalized letters    16225 non-null int64
swear words            16225 non-null int64
misogynist words       16225 non-null int64
racial_ethnic slurs    16225 non-null int64
l

In [None]:
# Column names, non-null counts and dtypes of the toxic frame.
toxic.info()

To prepare for topic modeling with Latent Dirichlet Allocation, the lemmatized text will be vectorized using a TF-IDF vectorizer and converted to a term-document matrix that is compatible with the Gensim NLP library. 

For the TF-IDF Vectorizer I'm going to keep the settings simple: English stop words removed, unigrams and bigrams, and a vocabulary capped at 10,000 terms. At this point I don't have a great way to test different parameters since I'll be using LDA, which is unsupervised. We'll see how these settings go and try to tinker from there as needed.

In [8]:
def tvec(df, column):
    """Fit a TF-IDF vectorizer on one text column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    column : str
        Name of the column whose values are vectorized.

    Returns
    -------
    tuple
        ``(vectorizer, dtm)``: the fitted ``TfidfVectorizer`` and the matrix
        returned by its ``fit_transform`` (the document-term matrix).
    """
    # Unigrams and bigrams, English stop words removed, at most 10,000 features.
    vectorizer = TfidfVectorizer(
        stop_words = 'english',
        ngram_range = (1, 2),
        max_features = 10000,
    )
    document_term_matrix = vectorizer.fit_transform(df[column])
    return vectorizer, document_term_matrix

In [9]:
# Fit a separate vectorizer per corpus, so each corpus gets its own vocabulary.
toxic_vectorizer, toxic_dtm = tvec(toxic, 'lemmatized_text')
neutral_vectorizer, neutral_dtm = tvec(neutral, 'lemmatized_text') 

#### Transform Document Term Matrix into a Term Document Matrix that can be recognized by Gensim

Gensim requires the input to be a Gensim corpus data type. In order to convert the document-term matrix to a corpus, we must first convert it to a term-document matrix, which is the transpose of the document-term matrix. Once the term-document matrix is created, it is converted to a compressed sparse row matrix, and finally to a Gensim corpus.

**Note**: Code for these steps was modified from [A Dash of Data](https://github.com/adashofdata/nlp-in-python-tutorial) by Alice Zhao

In [10]:
def create_tdm(dtm):
    """Return the term-document matrix, i.e. the transpose of ``dtm``.

    ``dtm`` may be any object exposing a ``transpose()`` method.
    """
    return dtm.transpose()

def tdm_to_gensim_corpus(tdm):
    """Wrap a term-document matrix in a Gensim ``Sparse2Corpus``.

    Parameters
    ----------
    tdm : matrix-like
        Term-document matrix (terms as rows, documents as columns); anything
        ``scipy.sparse.csr_matrix`` accepts.

    Returns
    -------
    gensim.matutils.Sparse2Corpus
        Corpus view over the compressed sparse row form of ``tdm``.
    """
    sparse_tdm = scipy.sparse.csr_matrix(tdm) # Create a compressed sparse row matrix
    # Hand the converted matrix to Gensim. Previously the CSR matrix was built
    # and then discarded, with the raw ``tdm`` passed instead.
    return matutils.Sparse2Corpus(sparse_tdm) # Create a corpus from sparse matrix

def prepare_dtm_for_gensim(dtm):
    """Convert a document-term matrix to a Gensim corpus.

    Chains ``create_tdm`` (transpose) and ``tdm_to_gensim_corpus``.
    """
    return tdm_to_gensim_corpus(create_tdm(dtm))

#### convert to corpus

In [11]:
# One Gensim corpus per document-term matrix.
toxic_corpus = prepare_dtm_for_gensim(toxic_dtm)
neutral_corpus = prepare_dtm_for_gensim(neutral_dtm)

## Defining dictionary

Gensim also requires that a dictionary, containing the vocabulary and index of the term, is input into the model. This can be done using a dictionary comprehension or using the `.from_corpus` method. 

#### Dictionary to be used with modeling
For modeling, a dictionary was created using a dictionary comprehension with the vectorizer vocabulary.

In [12]:
# Code modified from A Dash of Data by Alice Zhao

def create_vocab_dictionary(vectorizer):
    """Invert a fitted vectorizer's vocabulary.

    ``vectorizer.vocabulary_`` maps term -> column index; the returned dict
    maps column index -> term.
    """
    return {index: term for term, index in vectorizer.vocabulary_.items()}

In [13]:
# Plain {index: term} dicts, one per fitted vectorizer; used as id2word when modeling.
toxic_vocabulary = create_vocab_dictionary(toxic_vectorizer)
neutral_vocabulary = create_vocab_dictionary(neutral_vectorizer)

#### Dictionary to be used with pyLDAvis
The ***pyLDAvis*** library requires the Dictionary to be a Gensim object. Therefore, the Gensim corpus was converted to a gensim dictionary.

In [14]:
# https://stackoverflow.com/questions/21552518/using-scikit-learn-vectorizers-and-vocabularies-with-gensim

# Gensim Dictionary objects derived from each corpus plus its {index: term} mapping;
# these are what display_pyLDAvis receives later in the notebook.
toxic_gensim_dict = Dictionary.from_corpus(corpus = toxic_corpus, id2word = toxic_vocabulary)
neutral_gensim_dict = Dictionary.from_corpus(corpus = neutral_corpus, id2word = neutral_vocabulary)

## Topic Modeling with Latent Dirichlet Allocation

In order to identify possible topics that are more prone to toxic speech, **Latent Dirichlet Allocation (LDA)** was conducted.

The general process of LDA is:
1. Choose a number of topics (K)
2. For each document in the corpus, randomly assign the words in a document to a topic. (This step generates a list of topics in the documents and a list of words for each topic.)
3. Iterate over the document, reassigning the words and topics based on p(word|topic) and p(topic|document) until a steady state is reached.

After this process, you can view the topic probability distribution for each document and the word probability distribution for each topic. 

The Gensim topic modeling library provides an LDA model that will be used to identify topics in each corpus. Because LDA is an iterative process, each model will be trained with 50 passes over its corpus.

In [15]:
def build_gensim_lda(corpus, n_topics, vocabulary, n_passes, decay = 0.7, offset = 10, random_state = 2020):
    """Train a Gensim ``LdaModel``.

    Parameters
    ----------
    corpus : iterable
        Gensim-compatible corpus to train on.
    n_topics : int
        Number of topics to extract (``num_topics``).
    vocabulary : mapping
        ``{term_id: term}`` mapping forwarded as ``id2word``.
    n_passes : int
        Number of passes over the corpus (``passes``).
    decay : float, optional
        Forwarded to ``LdaModel(decay=...)``. Defaults to 0.7, the value that
        was previously hard-coded.
    offset : float, optional
        Forwarded to ``LdaModel(offset=...)``. Defaults to 10, the value that
        was previously hard-coded.
    random_state : int, optional
        Seed forwarded to ``LdaModel``. Defaults to 2020, the value that was
        previously hard-coded, so existing calls give the same model.

    Returns
    -------
    gensim.models.LdaModel
        The trained model.
    """
    lda = LdaModel(
        corpus = corpus,
        num_topics = n_topics,
        id2word = vocabulary,
        passes = n_passes,
        decay = decay,
        offset = offset,
        random_state = random_state,
    )
    return lda

In [16]:
def build_gensim_coherence_model(model, corpus, dictionary):
    """Return the UMass coherence of ``model`` on ``corpus``, rounded to 3 decimals.

    ``model``, ``corpus`` and ``dictionary`` are forwarded unchanged to
    Gensim's ``CoherenceModel`` with ``coherence = 'u_mass'``.
    """
    scorer = CoherenceModel(
        model = model,
        corpus = corpus,
        dictionary = dictionary,
        coherence = 'u_mass',
    )
    score = scorer.get_coherence()
    return round(score, 3)

In [17]:
def get_topic_terms(model, n_topics, n_words):
    """Return ``model.print_topics`` output for ``n_topics`` topics and ``n_words`` words each."""
    topic_terms = model.print_topics(num_words = n_words, num_topics = n_topics)
    return topic_terms

In [18]:
# Function Written By Selva Prabhakaran
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

def assign_topics_sentences(ldamodel, corpus, texts):
    """Label every document with its dominant topic.

    Parameters
    ----------
    ldamodel : object
        Topic model. Must support ``ldamodel[corpus]`` (yielding, for each
        document, ``(topic_id, probability)`` pairs) and
        ``ldamodel.show_topic(topic_id)`` (yielding ``(word, weight)`` pairs).
    corpus : object
        Corpus handed to ``ldamodel[...]``.
    texts : sequence or pandas.Series
        Original documents, in the same order as ``corpus``.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``Dominant_Topic`` (int),
        ``Probability`` (rounded to 4 decimals), ``Keywords`` (comma-separated
        words of the dominant topic) and ``Original_Text``.

    Notes
    -----
    Rows are collected in a list and the frame is built once. The previous
    row-by-row ``DataFrame.append`` was quadratic and no longer exists in
    pandas 2.0.
    """
    records = []
    keywords_by_topic = {}  # show_topic is the same for every document of a topic

    for doc_topics in ldamodel[corpus]:
        doc_topics = list(doc_topics)
        if not doc_topics:
            # Keep a placeholder so later rows stay aligned with ``texts``.
            records.append((float('nan'), float('nan'), ''))
            continue

        # Highest-probability topic; on ties the first one listed wins,
        # matching a stable descending sort.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Probability', 'Keywords'])

    # Add original text to the end of the output. The index is reset so that a
    # Series with a non-default index is still matched to documents by position.
    contents = pd.Series(texts).reset_index(drop=True).rename('Original_Text')
    return pd.concat([sent_topics_df, contents], axis=1)

In [19]:
def display_pyLDAvis(model, corpus, gensim_dict):
    """Prepare a pyLDAvis visualisation for ``model`` and return its display object.

    The three arguments are forwarded unchanged to ``pyLDAvis.gensim.prepare``.
    """
    vis_data = pyLDAvis.gensim.prepare(model, corpus, gensim_dict)
    return pyLDAvis.display(vis_data)

##### Toxic Corpus


- It's challenging to know how many topics to try to create since this is an unsupervised learning model, but I'll start with 5. I believe there are ways to tune this to find a more optimal number depending on the corpus, but I'll start here as a proof of concept.

In [20]:
#### Build a LDA model with 5 topics

# 5 topics, 50 passes over the toxic corpus; id2word is the plain {index: term} dict.
toxic_lda_model = build_gensim_lda(toxic_corpus, 
                               n_topics = 5, 
                               vocabulary = toxic_vocabulary, 
                               n_passes = 50)

In [21]:
#### Evaluate Model on UMass Coherence

# Note: `dictionary` here is the plain {index: term} dict, not the Gensim Dictionary object.
build_gensim_coherence_model(toxic_lda_model, corpus = toxic_corpus, dictionary = toxic_vocabulary)

-7.133

The Toxic LDA model with 5 topics has a coherence score of -7.133. Given that UMass coherence scores can range from 0 (best) to -14 (worst), this is an ok score and indicates that words within a topic are similar, but this could certainly use some tuning.

In [22]:
#### Evaluate Topic Separation and Check Relevant Terms using pyLDAvis

# pyLDAvis gets the Gensim Dictionary object (toxic_gensim_dict), not the plain dict.
display_pyLDAvis(toxic_lda_model, toxic_corpus, toxic_gensim_dict)

The ***pyLDAvis*** library was built to enable easier interpretation of topic models. On the left, the topics are plotted across two principal components. Similarity of topics is shown by proximity of topic circles, and the size demonstrates how prevalent the topic is in the corpus. On the right, the 30 most salient terms in the corpus are shown. If the cursor is placed over a topic circle, the right panel changes to show the terms most relevant to that topic.

##### Neutral Corpus

In [23]:
# Same configuration as the toxic model: 5 topics, 50 passes.
neutral_lda_model = build_gensim_lda(neutral_corpus, 
                               n_topics = 5, 
                               vocabulary = neutral_vocabulary, 
                               n_passes = 50)

In [24]:
# UMass coherence of the neutral model; `dictionary` is the plain {index: term} dict.
build_gensim_coherence_model(neutral_lda_model, corpus = neutral_corpus, dictionary = neutral_vocabulary)

-4.424

The Neutral LDA model with 5 topics has a coherence score of -4.424. Given that UMass coherence scores can range from 0 (best) to -14 (worst), this is a fairly good score and indicates that words within a topic are similar.

In [25]:
#### Evaluate Topic Separation and Check Relevant Terms using pyLDAvis

# pyLDAvis gets the Gensim Dictionary object (neutral_gensim_dict), not the plain dict.
display_pyLDAvis(neutral_lda_model, neutral_corpus, neutral_gensim_dict)

#### Investigate topics

With the Gensim library, I am able to view the word distributions that were calculated for each topic, and the topic distributions that were determined for each document. The interpretation process will be described below.

In [26]:
def get_topic_terms(model, n_topics, n_words):
    """Return ``model.print_topics`` output for ``n_topics`` topics and ``n_words`` words each.

    NOTE(review): this re-defines a function already defined, with the same
    body, in an earlier cell of this notebook; one of the two can be dropped.
    """
    topic_terms = model.print_topics(num_words = n_words, num_topics = n_topics)
    return topic_terms

In [27]:
# Function Written By Selva Prabhakaran
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

def assign_topics_sentences(ldamodel, corpus, texts):
    """Label every document with its dominant topic.

    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; this later definition is the one in effect after
    Run All.

    Parameters
    ----------
    ldamodel : object
        Topic model. Must support ``ldamodel[corpus]`` (yielding, for each
        document, ``(topic_id, probability)`` pairs) and
        ``ldamodel.show_topic(topic_id)`` (yielding ``(word, weight)`` pairs).
    corpus : object
        Corpus handed to ``ldamodel[...]``.
    texts : sequence or pandas.Series
        Original documents, in the same order as ``corpus``.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``Dominant_Topic`` (int),
        ``Probability`` (rounded to 4 decimals), ``Keywords`` (comma-separated
        words of the dominant topic) and ``Original_Text``.

    Notes
    -----
    Rows are collected in a list and the frame is built once. The previous
    row-by-row ``DataFrame.append`` was quadratic and no longer exists in
    pandas 2.0.
    """
    records = []
    keywords_by_topic = {}  # show_topic is the same for every document of a topic

    for doc_topics in ldamodel[corpus]:
        doc_topics = list(doc_topics)
        if not doc_topics:
            # Keep a placeholder so later rows stay aligned with ``texts``.
            records.append((float('nan'), float('nan'), ''))
            continue

        # Highest-probability topic; on ties the first one listed wins,
        # matching a stable descending sort.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Probability', 'Keywords'])

    # Add original text to the end of the output. The index is reset so that a
    # Series with a non-default index is still matched to documents by position.
    contents = pd.Series(texts).reset_index(drop=True).rename('Original_Text')
    return pd.concat([sent_topics_df, contents], axis=1)

In [28]:
def plot_topics_in_corpus(df, column, color, title = None, x_label = None):
    """Draw a horizontal bar chart of how often each value of ``df[column]`` occurs.

    Bar lengths are percentages of all rows (rounded to 2 decimals), sorted
    ascending. ``color`` is the bar fill colour; ``title`` and ``x_label``
    are optional figure title and x-axis label.
    """
    plt.figure(figsize = (15, 8))

    # Share of rows per distinct value, expressed in percent.
    percentages = df[column].value_counts(normalize = True) * 100
    percentages = round(percentages, 2).sort_values(ascending = True)
    percentages.plot.barh(color = color, ec = 'k', width = 0.75)

    plt.title(title, fontdict = {'fontsize': 20}, pad = 18)
    plt.xlabel(x_label, fontdict = {'fontsize': 15}, labelpad = 10)
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(size = 14)
    plt.tight_layout()

In [29]:
def filter_df_by_topic(df, column, topic):
    """Return the rows where ``df[column] == topic``, highest ``Probability`` first."""
    matches = df[column] == topic
    return df[matches].sort_values(by = "Probability", ascending = False)

In [30]:
def get_top_posts(df):
    """Find, for every dominant topic, the most representative post.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Dominant_Topic``, ``Probability`` and
        ``Original_Text``.

    Returns
    -------
    tuple of dict
        ``(top_posts_dict, percent_dominant)``, both keyed by topic:
        the censored text of the post with the highest ``Probability`` for
        that topic, and that highest probability.

    Raises
    ------
    ImportError
        If the ``better_profanity`` package is not installed.
    """
    # Imported here because the notebook's top-level import of better_profanity
    # is commented out, which left `profanity` undefined on a fresh kernel.
    from better_profanity import profanity

    top_posts_dict = {}
    percent_dominant = {}
    # NaN topics are skipped: `== NaN` never matches, so they have no rows to report.
    for topic in df['Dominant_Topic'].dropna().unique():
        topic_df = filter_df_by_topic(df, 'Dominant_Topic', topic)
        best_probability = topic_df['Probability'].max()
        top_post = topic_df[topic_df['Probability'] == best_probability]['Original_Text'].values
        top_posts_dict[topic] = profanity.censor(top_post[0])
        percent_dominant[topic] = best_probability

    return top_posts_dict, percent_dominant

#### Explore Top Terms for Each Topic
**Note:** Top terms are determined based on their probability of occurring in the topic.  

For each topic, we are able to view the terms with the highest probability of appearing in the topic. An example is shown in the cell below:

In [31]:
# n_topics = 1, n_words = 10: one topic with its ten highest-probability terms.
get_topic_terms(toxic_lda_model, 1, 10) # Will show the words with the highest probability for topic chosen at random

[(1,
  '0.020*"suck" + 0.016*"gay" + 0.013*"bitch" + 0.013*"fuck" + 0.012*"dick" + 0.011*"penis" + 0.011*"asshole" + 0.009*"cock" + 0.007*"fag" + 0.007*"fucking"')]

Above, we see a list containing a tuple. The first item in the tuple is the topic number, and the second item is a string that contains the probability of a word appearing in the topic and the word itself. I have chosen to display only the top ten words.

We can save the topics and their top terms to a list (below), and use indexing to explore the topics and terms in this list. 
This exploration allows us to derive meaning from the topics. 

In [32]:
# Asks for up to 10 topics with 10 terms each; the model was built with 5 topics.
toxic_lda_topic_terms = get_topic_terms(toxic_lda_model, 10, 10)

In [33]:
# Display the (topic number, terms string) tuples for the toxic model.
toxic_lda_topic_terms

[(0,
  '0.037*"fuck" + 0.013*"fuck fuck" + 0.010*"bitch" + 0.009*"ha" + 0.009*"douche" + 0.007*"son" + 0.007*"nigga" + 0.006*"fuckin" + 0.006*"ball" + 0.006*"son bitch"'),
 (1,
  '0.020*"suck" + 0.016*"gay" + 0.013*"bitch" + 0.013*"fuck" + 0.012*"dick" + 0.011*"penis" + 0.011*"asshole" + 0.009*"cock" + 0.007*"fag" + 0.007*"fucking"'),
 (2,
  '0.020*"faggot" + 0.016*"fucking" + 0.015*"shit" + 0.012*"fuck" + 0.012*"piece" + 0.011*"piece shit" + 0.010*"cunt" + 0.009*"fucker" + 0.008*"mother" + 0.007*"nigger"'),
 (3,
  '0.006*"page" + 0.005*"wikipedia" + 0.005*"like" + 0.004*"fuck" + 0.004*"article" + 0.004*"know" + 0.004*"people" + 0.004*"fucking" + 0.004*"stop" + 0.004*"talk"'),
 (4,
  '0.018*"na" + 0.012*"gon" + 0.012*"gon na" + 0.007*"vagina" + 0.007*"wan" + 0.007*"wan na" + 0.007*"kiss" + 0.005*"dickhead" + 0.004*"jeff" + 0.004*"thanks"')]

In [34]:
# Asks for up to 10 topics with 10 terms each; the model was built with 5 topics.
neutral_lda_topic_terms = get_topic_terms(neutral_lda_model, 10, 10)

In [35]:
# Display the (topic number, terms string) tuples for the neutral model.
neutral_lda_topic_terms

[(0,
  '0.024*"utc" + 0.010*"2005" + 0.007*"16" + 0.007*"2005 utc" + 0.006*"2004" + 0.006*"2008" + 0.006*"2007" + 0.006*"13" + 0.006*"21" + 0.006*"2006"'),
 (1,
  '0.021*"image" + 0.020*"redirect" + 0.016*"redirect talk" + 0.012*"copyright" + 0.011*"talk" + 0.008*"use" + 0.006*"fair use" + 0.006*"fair" + 0.005*"barnstar" + 0.005*"file"'),
 (2,
  '0.007*"article" + 0.005*"page" + 0.004*"talk" + 0.004*"source" + 0.003*"wikipedia" + 0.003*"think" + 0.003*"like" + 0.003*"know" + 0.003*"thanks" + 0.003*"user"'),
 (3,
  '0.011*"page" + 0.011*"wikipedia" + 0.011*"editing" + 0.010*"welcome" + 0.009*"blocked" + 0.008*"blocked editing" + 0.008*"talk" + 0.008*"continue" + 0.007*"vandalize" + 0.006*"edit"'),
 (4,
  '0.015*"deletion" + 0.008*"speedy" + 0.007*"speedy deletion" + 0.007*"article" + 0.006*"page" + 0.005*"criterion" + 0.005*"deleted" + 0.005*"template" + 0.004*"tag" + 0.004*"note"')]

Source List:
    
    https://www.youtube.com/watch?v=TKjjlp5_r7o