# Reddit comment toxicity classifier: Multinomial Naive Bayes
# with hyperopt param tuning

### John Burt

[To hide code cells, view this in nbviewer](https://nbviewer.jupyter.org/github/johnmburt/springboard/blob/master/capstone_1/reddit_toxicity_detection_model_MNB_v1.ipynb) 


### Introduction:

The goal of my first Capstone project is to develop a toxic comment classifier. This notebook will implement and HP tune a Multinomial Naive Bayes classifier.


to do:
- X autosave prepped text versions
- X pull out holdout set before training
- X balance classes
- generate features:
  - user karma
  - mean user score by sub
- hyperopt
- confusion matrix with model results


## Load the data.

The comment data used in this analysis was [acquired using Reddit Python API PRAW](https://github.com/johnmburt/springboard/blob/master/capstone_1/reddit_collect_comments_v1.ipynb) from 12 subs. 8 of the subs are non-political, and 4 are political in nature. 

The raw comment data was [processed using PCA to produce a single toxicity score](https://github.com/johnmburt/springboard/blob/master/capstone_1/reddit_generate_PCA_score_v1.ipynb) based on the votes and number of replies. 

Then I [converted this score into an integer 0 to 4 range training label variable](https://github.com/johnmburt/springboard/blob/master/capstone_1/reddit_create_train-test_set.ipynb), with 0 being no/low toxicity and higher values indicating higher toxicity. 

Note that this is a highly unbalanced dataset, with less than 10% of comments having toxicity label values above 0. I'll have to adjust this proportion for models that require reasonably balanced categories.


In [2]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')
# ---

%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')

import pandas as pd
pd.options.display.max_columns = 100

import numpy as np
import datetime
import time
import csv
import glob

## Load the prepared comment data file,
## or prepare original comment text and save


- Try to load a pre-saved data file that contains processed comment text. 
- If the pre-saved comment file doesn't exist then: 
  - Clean up text and prepare it for NLP model training. 
  - Save the processed text so that it can be read in later sessions without waiting.
  - In the next session, the pre-processed data file will be read if it exists, otherwise the text will be pre-processed and saved.

In [3]:
import re
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords as sw

# function to prepare text for NLP analysis
def process_comment_text(comments,
                         stemmer=None,
                         regexstr=None, lowercase=True,
                         removestop=False,
                         verbose=True):
    """Pre-process a Series of comment strings for NLP model training.

    The enabled steps run in this order: lowercase, stop word removal,
    regex cleaning, stemming.

    Args:
        comments: pandas Series of text strings.
        stemmer: 'porter' or 'snowball' (case-insensitive) to use the
            matching NLTK stemmer, any object with a ``stem(word)`` method,
            or None to skip stemming. Unrecognised names skip stemming.
        regexstr: regular expression; every match is replaced by a single
            space. None skips this step.
        lowercase: if true, convert the text to lowercase.
        removestop: if true, drop NLTK English stop words. Matching is
            case-sensitive, so capitalised stop words are only removed
            when ``lowercase`` is enabled as well.
        verbose: if true, print a progress line for each step.

    Returns:
        pandas Series holding the processed text.
    """

    # resolve a stemmer name into a stemmer object
    if isinstance(stemmer, str):
        stemmer_name = stemmer.lower()
        if stemmer_name == 'porter':
            stemmer = PorterStemmer()
        elif stemmer_name == 'snowball':
            stemmer = SnowballStemmer(language='english')
        else:
            stemmer = None

    processed = comments

    # make text lowercase
    if lowercase:
        if verbose: print('make text lowercase')
        processed = processed.str.lower()

    # remove stop words
    # NOTE: stop words w/ capitals not removed!
    if removestop:
        if verbose: print('remove stop words')
        # a set gives constant-time membership tests for every word
        stopwords = set(sw.words("english"))
        processed = processed.map(
            lambda text: ' '.join(word for word in text.split()
                                  if word not in stopwords))

    # apply regex expression
    if regexstr is not None:
        if verbose: print('apply regex expression')
        regex = re.compile(regexstr)
        # regex=True is required: recent pandas defaults to literal matching
        # and refuses a compiled pattern in that mode
        processed = processed.str.replace(regex, ' ', regex=True)

    # stemming
    # NOTE: stemming makes all lowercase
    if stemmer is not None:
        if verbose: print('stemming')
        processed = processed.map(
            lambda text: ' '.join(stemmer.stem(word)
                                  for word in text.split(' ')))

    if verbose: print('done')

    return processed


from os import path

# source data folder
srcdir = './data_labeled/'

# csv filename (without extension) of the labeled comment data
inputname = 'comment_sample_train-test_data'

# specify parameters for text prep
# NOTE: if these are changed, then a new prepped text file
#  will be created, because the parameters are part of its file name.
processkwargs = {
    'stemmer':'snowball', # snowball stemmer
    # raw string: \s must reach the regex engine as written instead of
    # relying on Python passing an unrecognised string escape through
    'regexstr':r'[^a-zA-Z0-9\s]', # remove all but alphanumeric chars
    'lowercase':True, # make lowercase
    'removestop':False # don't remove stop words
                }

# create name for prepared text file; backslashes in the regex are
# spelled out as '(sl)' in the file name
prepfile = inputname+' stem=%s, regex=%s, lower=%d, stop=%d.csv'%(
    processkwargs['stemmer'],
    processkwargs['regexstr'].replace('\\','(sl)'),
    processkwargs['lowercase'],
    processkwargs['removestop'])

preppath = srcdir+prepfile

# if prepared text file exists, just load it
if path.exists(preppath):
    print('reading prepped text file: ', prepfile)
    df = pd.read_csv(preppath).drop_duplicates()

# if prepared file doesn't exist, then create it and save it
else:
    print('no prepped file, so reading original and prepping text...')
    # read the original
    origpath = srcdir+inputname+'.csv'
    df = pd.read_csv(origpath).drop_duplicates()
    # process text, make that the text version of the training data
    verbose = True
    df['text_prep'] = process_comment_text(df['text'], **processkwargs, verbose=verbose)
    # save prepared text file so later sessions can skip the processing
    df.to_csv(preppath,index=False)
    print('saved prepped text data to',preppath)

# remove any rows with nans (in any column)
df.dropna(inplace=True)

# remove any rows with no text after pre-processing
df = df[df['text_prep']!='']

# select comments from the specified subs
# (the names are joined into one regex alternation and matched as substrings)
subs2use = ['gaming']
df = df[df['sub_name'].str.contains('|'.join(subs2use))]
print(df.shape)

print('\nTotal comment samples read:',df.shape[0])

reading prepped text file:  comment_sample_train-test_data stem=snowball, regex=[^a-zA-Z0-9(sl)s], lower=1, stop=0.csv
(389944, 24)

Total comment samples read: 389944


## Create the feature and label data for training and testing.

In [5]:
from sklearn.model_selection import train_test_split

# the label used = 0-4 scale, w/ 4 = most toxic
# y = df['label_neg-inv']

# use the binary label: 0 = not toxic, 1 = toxic
# clip ranges to 0..1. clip() builds a new Series, which avoids assigning
# through a column selected from df (chained assignment) and leaves df
# itself unmodified.
y = df['label_bin'].clip(lower=0, upper=1)

# prepared comment text to use for model
X = df['text_prep']

# Split into test and training data: 10% is held out for testing,
# with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print('%d training samples, %d test samples'%(X_train.shape[0],X_test.shape[0]))


350949 training samples, 38995 test samples


## Balance sample frequencies in training samples

The classifier may require balancing of sample frequencies between classes for best results.

This function will up-sample to the specified number of samples per class.

The balance_classes_sparse function does sample balancing with sparse matrices, such as vectorized BOW data.

In [6]:
def balance_classes(df, ycol, samples_per_class=None, verbose=False,
                    random_state=None):
    """Equalize the number of samples so all classes have equal numbers of samples.

    Rows are drawn with replacement from each class, so smaller classes are
    upsampled (rows randomly repeated).

    Args:
        df: pandas DataFrame holding the samples.
        ycol: name of the column in df that holds the class label.
        samples_per_class: number of rows to draw for every class. If None,
            the size of the largest class is used.
        verbose: if true, print the class frequencies and the target size.
        random_state: optional seed passed to DataFrame.sample to make the
            draw repeatable. None keeps the draw unseeded.

    Returns:
        DataFrame with samples_per_class rows per class, grouped by class in
        order of first appearance. Rows with a missing label are left out.
        If there are no labeled rows, an empty frame with the same columns
        is returned.
    """

    if verbose: print('Balancing class sample frequencies:')

    # all class IDs; dropna() also works for non-numeric labels,
    # where np.isnan would raise
    classes = df[ycol].dropna().unique()

    # nothing to balance
    if len(classes) == 0:
        return df.iloc[0:0]

    # number of rows currently in each class
    counts = {c: int((df[ycol] == c).sum()) for c in classes}

    if verbose:
        print('\tOriginal sample frequencies:')
        for c in classes:
            print('\t\tclass:', c, '#samples:', counts[c])

    # default target: size of the class with the most samples
    if samples_per_class is None:
        samples_per_class = max(counts.values())
    samples_per_class = int(samples_per_class)
    if verbose: print('\tNew samples_per_class:', samples_per_class)

    # draw an equal number of samples from each class
    newdata = [
        df[df[ycol] == c].sample(samples_per_class, replace=True,
                                 random_state=random_state)
        for c in classes
    ]

    return pd.concat(newdata)

# ******************************************************************************************
from scipy.sparse import vstack, hstack
from scipy.sparse.csr import csr_matrix

def balance_classes_sparse(X, y, samples_per_class=None, verbose=False,
                           random_state=None):
    """Equalize the number of samples per class for a sparse feature matrix.

    A class with at least samples_per_class rows is reduced to a random
    subset of that size. A smaller class is upsampled: all of its rows are
    repeated as many whole times as fit, and the remainder is filled with a
    random subset of its rows.

    Rows are selected by index, so the labels never pass through the
    feature matrix and keep their own dtype (non-numeric labels work too).

    Args:
        X: feature matrix with one row per sample (scipy sparse matrix, or
            anything csr_matrix() accepts).
        y: class labels, one per row of X (array-like).
        samples_per_class: number of rows per class in the result. If None,
            the size of the largest class is used.
        verbose: if true, print the class frequencies and the target size.
        random_state: optional seed for a private RandomState, making the
            selection repeatable. None uses numpy's global random state.

    Returns:
        (X_bal, y_bal): X_bal is a CSR matrix with the selected rows,
        grouped by class in order of first appearance. y_bal holds the
        matching labels as a column array of shape (n_rows, 1).

    Raises:
        ValueError: if X and y do not have the same number of rows.
    """

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    def get_samples(rows, numsamples):
        """Return numsamples row indices chosen from the index array rows."""
        if rows.shape[0] >= numsamples:
            # enough rows: random subset without repeats
            return rng.permutation(rows)[:numsamples]
        # too few rows: whole copies first, then a random partial copy
        numrepeats, lastsize = divmod(numsamples, rows.shape[0])
        picked = [np.tile(rows, numrepeats)]
        if lastsize > 0:
            picked.append(rng.permutation(rows)[:lastsize])
        return np.concatenate(picked)

    if verbose:
        print('Balancing class sample frequencies:')

    X = csr_matrix(X)
    y = np.asarray(y).ravel()
    if X.shape[0] != y.shape[0]:
        raise ValueError('X has %d rows but y has %d labels'
                         % (X.shape[0], y.shape[0]))

    # all class IDs; pd.isnull also handles non-numeric labels
    classes = pd.unique(y)
    classes = classes[~pd.isnull(classes)]

    # nothing to balance
    if len(classes) == 0:
        return X[:0, :], y[:0].reshape(-1, 1)

    # number of rows currently in each class
    counts = [int(np.sum(y == c)) for c in classes]

    if verbose:
        print('\tOriginal sample frequencies:')
        for c, count in zip(classes, counts):
            print('\t\tclass:', c, '#samples:', count)

    # default target: size of the class with the most samples
    if samples_per_class is None:
        samples_per_class = max(counts)
    samples_per_class = int(samples_per_class)
    if verbose:
        print('\tNew samples_per_class:', samples_per_class)

    # pick an equal number of row indices for each class
    selected = np.concatenate([
        get_samples(np.flatnonzero(y == c), samples_per_class)
        for c in classes
    ])

    return X[selected, :], y[selected].reshape(-1, 1)


## Hyperparameter optimization using Bayesian methods


Note: this article provided useful code for pipelines with hyperopt:

- [Hyperparameter Tuning with hyperopt in Python](http://steventhornton.ca/blog/hyperparameter-tuning-with-hyperopt-in-python.html)

In [7]:
from hyperopt import tpe, hp, fmin, Trials
from sklearn.model_selection import cross_val_score
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import BaseEstimator, ClassifierMixin

# create a custom classifier that balances the training data
class BalancedClf(MultinomialNB):
    """Multinomial Naive Bayes that balances the classes by resampling before training.

    The constructor is inherited unchanged from MultinomialNB on purpose.
    scikit-learn reads an estimator's hyperparameters from the explicit
    __init__ signature; an __init__ that only takes **kwargs exposes none,
    so get_params() would come back empty and clone() (used during
    cross-validation) would silently reset any non-default setting.
    """

    def fit(self, X, y, **fit_params):
        """Balance the class frequencies of (X, y), then fit the classifier.

        Args:
            X: training feature matrix, one row per sample.
            y: training labels, one per row of X.
            **fit_params: passed through to MultinomialNB.fit.
                NOTE(review): a per-sample argument such as sample_weight
                would no longer line up with the resampled rows.

        Returns:
            self
        """
        bal_X, bal_y = balance_classes_sparse(X, y, verbose=False)
        # the balanced labels may come back as a column; flatten them to
        # the 1-D shape the classifier expects
        super().fit(bal_X, np.ravel(bal_y), **fit_params)
        return self
    
# objective function 
def objective(params):
    """Loss function for hyperopt: negated mean cross-validated balanced accuracy.

    Builds a TF-IDF + BalancedClf pipeline, applies the sampled params to
    it and scores it on the training data. The score is negated because
    hyperopt minimises its objective.
    """
    pipeline = Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', BalancedClf()),
    ])
    pipeline.set_params(**params)

    fold_scores = cross_val_score(
        pipeline,
        X_train.values,
        y_train.values,
        scoring='balanced_accuracy',
        n_jobs=4,
    )
    return -fold_scores.mean()

# space_eval turns the raw result of fmin back into real parameter values
from hyperopt import space_eval

# parameter space
paramspace = {
    'tfidf__stop_words': hp.choice('tfidf__stop_words', ['english', None]),
    'tfidf__use_idf': hp.choice('tfidf__use_idf', [True, False]),
    'tfidf__sublinear_tf': hp.choice('tfidf__sublinear_tf', [True, False]),
    # randint draws 0..4, so min_df ranges over 1..5
    'tfidf__min_df': 1+hp.randint('tfidf__min_df', 5),
    'tfidf__max_df': hp.uniform('tfidf__max_df', 0.5, 1.0),
    'tfidf__ngram_range': hp.choice('tfidf__ngram_range', [(1, 1), (1, 3)])
    }

# The Trials object will store details of each iteration
trials = Trials()

# Run the hyperparameter search using the tpe algorithm
best = fmin(fn=objective, space=paramspace, algo=tpe.suggest, max_evals=100, trials=trials)
print('best:')
print(best)

# fmin reports hp.choice entries as the index of the chosen option and
# hp.randint entries as the raw draw (before the 1+ offset), so decode
# the result into the parameter values that were actually used
print('best (decoded parameter values):')
print(space_eval(paramspace, best))

100%|████████████████████████████████████████████████| 100/100 [45:50<00:00, 34.29s/it, best loss: -0.6331680292740242]
best:
{'tfidf__max_df': 0.8634865274084329, 'tfidf__min_df': 0, 'tfidf__ngram_range': 0, 'tfidf__stop_words': 1, 'tfidf__sublinear_tf': 1, 'tfidf__use_idf': 1}
