# ML Pipeline for Detection of Aggressive/Nonconstructive Dialogue

In [1]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim import corpora, models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import shuffle
from sklearn.externals import joblib
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import Pipeline as imb_Pipeline
import string
import tensorflow as tf

  return f(*args, **kwds)


### Preprocessing

In [2]:
wiki_comments = pd.read_csv('toxic-data/toxicity_annotated_comments.tsv', encoding='latin-1', sep='\t')
wiki_comments['comment'] = wiki_comments.comment.str.replace('NEWLINE_TOKEN', '')
wiki_comments.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,2232.0,This::One can make an analogy in mathematical ...,2002,True,article,random,train
1,4216.0,"`:Clarification for you (and Zundark's right,...",2002,True,user,random,train
2,8953.0,Elected or Electoral? JHK,2002,False,article,random,test
3,26547.0,`This is such a fun entry. DevotchkaI once h...,2002,True,article,random,train
4,28959.0,Please relate the ozone hole to increases in c...,2002,True,article,random,test


In [3]:
wiki_annotated = pd.read_csv('toxic-data/toxicity_annotations.tsv', encoding='latin-1', sep='\t')
wiki_annotated.head()

Unnamed: 0,rev_id,worker_id,toxicity,toxicity_score
0,2232.0,723,0,0.0
1,2232.0,4000,0,0.0
2,2232.0,3989,0,1.0
3,2232.0,3341,0,0.0
4,2232.0,1574,0,1.0


In [4]:
scores = [0 if score >= 0 else -score for score in wiki_annotated['toxicity_score']]
wiki_annotated['toxicity_score'] = scores
wiki_annotated.head()

Unnamed: 0,rev_id,worker_id,toxicity,toxicity_score
0,2232.0,723,0,0.0
1,2232.0,4000,0,0.0
2,2232.0,3989,0,0.0
3,2232.0,3341,0,0.0
4,2232.0,1574,0,0.0


In [5]:
wiki = wiki_comments[['rev_id','comment']].merge(wiki_annotated[['rev_id','toxicity_score']], on='rev_id')
wiki = wiki.groupby(['rev_id','comment']).agg(lambda x: x.value_counts().index[0]).reset_index()
wiki = wiki[['comment', 'toxicity_score']]
wiki.columns = ['text', 'toxicity']
wiki.head()

Unnamed: 0,text,toxicity
0,This::One can make an analogy in mathematical ...,0.0
1,"`:Clarification for you (and Zundark's right,...",0.0
2,Elected or Electoral? JHK,0.0
3,`This is such a fun entry. DevotchkaI once h...,0.0
4,Please relate the ozone hole to increases in c...,0.0


In [6]:
X = wiki
X = X.sample(frac=1).reset_index()
X = X.drop(X.columns[0], axis=1)
X.head()

Unnamed: 0,text,toxicity
0,REDIRECT Talk:U.S. Route 64 Alternate (Rocky M...,0.0
1,"== Couple changes ==I made a couple changes, e...",0.0
2,`Please do not remove messages from your talk ...,0.0
3,"Darth Binkys name is in red, and says mec wha...",0.0
4,`*The following is what is currently written i...,0.0


#### For efficiency, consider only a subset of the text data

In [7]:
X = X.iloc[0:10000,:]
X.shape

(10000, 2)

#### Apply PorterStommer and coerce text to appropriate form

In [8]:
texts = []
p_stemmer = PorterStemmer()
table = str.maketrans({key: None for key in string.punctuation})
for index, row in X.iterrows():
    text = row['text'].lower().translate(table)
    text = text.encode("ascii", errors="ignore").decode()
    text = nltk.word_tokenize(text)
    text = [p_stemmer.stem(token) for token in text if token not in stopwords.words('english')]
    texts.append(text)
    text = ' '.join(text)
    X.loc[index,'text'] = text
X.head()

Unnamed: 0,text,toxicity
0,redirect talku rout 64 altern rocki mounttarbo...,0.0
1,coupl chang made coupl chang explain hereremov...,0.0
2,pleas remov messag talk page talk page exist r...,0.0
3,darth binki name red say mec what clan call,0.0
4,follow current written religion section fine i...,0.0


#### Train/Test Partition

In [9]:
trainIndex = int(0.65*X.shape[0])
trainSet = X.iloc[0:trainIndex,:]
testSet = X.iloc[trainIndex:,:]
trainTexts, testTexts = texts[0:(trainIndex+1)], texts[trainIndex:]

### Exploratory Topic Modeling with Latent Dirichlet Allocation

In [10]:
dictionary = corpora.Dictionary(trainTexts)
corpus = [dictionary.doc2bow(text) for text in trainTexts]
ldamodel = models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=10)
print(ldamodel.print_topics(num_topics=3, num_words=5))

[(0, '0.004*"one" + 0.003*"like" + 0.003*"test" + 0.003*"also" + 0.003*"concernthank"'), (1, '0.020*"articl" + 0.008*"sourc" + 0.007*"would" + 0.006*"use" + 0.006*"one"'), (2, '0.019*"page" + 0.014*"edit" + 0.010*"wikipedia" + 0.010*"block" + 0.009*"pleas"')]


## Text Classification of Aggressive/Nonconstructive/Constructive Dialogue

### SMOTE - Synthetic Minority Over-sampling to Accommodate Inbalanced Class Labels

In [11]:
#smt = SMOTE(random_state=42)
vec = CountVectorizer()
tfidf = TfidfTransformer()
clf = MultinomialNB()
pipeline = imb_Pipeline([('vect', vec),
                         ('tfidf', tfidf),
                         #('smt', smt), 
                         ('clf', clf)
])

### Tuned Multinomial NB Classifier

In [12]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
              'vect__analyzer': ('word', 'char'),
              'tfidf__use_idf': (True, False),
              'clf__alpha': (0.0001, 0.001, 0.01, 0.1, 1.0)
}
pipeline = RandomizedSearchCV(pipeline, parameters, n_jobs=-1)

In [13]:
pipeline = pipeline.fit(trainSet.text, trainSet.toxicity)



#### Multinomial NB Classification Report

In [14]:
predicted = pipeline.predict(testSet.text)
print(classification_report_imbalanced(predicted, testSet.toxicity))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.92      0.88      0.96      0.53      0.30      3387
        1.0       0.21      0.59      0.93      0.31      0.46      0.19       108
        2.0       0.04      0.40      0.99      0.07      0.19      0.03         5

avg / total       0.97      0.91      0.88      0.94      0.52      0.29      3500



#### Generate Predictions on Test Set

In [15]:
predicted_prob = pipeline.predict_proba(testSet.text)
predicted_prob = np.matrix.round(predicted_prob, 3)
predicted_prob = pd.DataFrame(predicted_prob)
predicted_prob['text'] = list(testSet.text)
predicted_prob['ground truth'] = list(testSet.toxicity)
predicted_prob

Unnamed: 0,0,1,2,text,ground truth
0,1.000,0.000,0.000,specul sourc notabl sens gener known eg appear...,0.0
1,1.000,0.000,0.000,agre complet polici debat belong somewher els ...,0.0
2,1.000,0.000,0.000,didnt malign anyon person anyon name mention a...,0.0
3,1.000,0.000,0.000,may misrepres said repeatedli past may opinion...,0.0
4,1.000,0.000,0.000,mark amerikaa far tell compar work look mark p...,0.0
5,0.899,0.097,0.005,give messag,0.0
6,1.000,0.000,0.000,krystal jenkin hi ad dbbio templat articl krys...,0.0
7,0.999,0.001,0.000,problem may want drop unblock request talk pag...,0.0
8,0.965,0.034,0.001,wont let dont,0.0
9,0.980,0.020,0.000,say leav everyth written psa written togeth al...,0.0


#### Save the NB model

In [16]:
joblib.dump(pipeline, 'pipeline_NB.pkl')

['pipeline_NB.pkl']

In [17]:
#smt = SMOTE(random_state=42)
vec = CountVectorizer()
tfidf = TfidfTransformer()
clf = SGDClassifier(random_state=42, tol=None, class_weight={1:50,2:100})
pipeline = imb_Pipeline([('vect', vec),
                         ('tfidf', tfidf),
                         #('smt', smt), 
                         ('clf', clf)
])

In [18]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
              'vect__analyzer': ('word', 'char'),
              'tfidf__use_idf': (True, False),
              'clf__alpha': (0.0001, 0.001, 0.01, 0.1, 1.0),
              'clf__penalty': ('l2', 'elasticnet'),
              'clf__max_iter': (5, 25, 50, 75, 100, 200),
              'clf__loss': ('modified_huber', 'hinge')
}
svm_clf = RandomizedSearchCV(pipeline, parameters, n_jobs=-1)

In [19]:
svm_clf = svm_clf.fit(trainSet.text, trainSet.toxicity)
svm_clf.best_score_



0.9184615384615384

#### SVM Classification Report

In [20]:
predicted = svm_clf.predict(testSet.text)
print(classification_report_imbalanced(predicted, testSet.toxicity))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.93      0.94      0.96      0.58      0.36      3371
        1.0       0.25      0.68      0.93      0.37      0.50      0.23       113
        2.0       0.15      0.50      0.99      0.23      0.39      0.14        16

avg / total       0.97      0.92      0.94      0.94      0.58      0.36      3500



#### Generate Predictions on Test Set

In [22]:
predicted = pd.DataFrame(predicted)
predicted[1] = list(testSet.toxicity)
predicted[2] = list(testSet.text)
predicted.columns = ['prediction', 'ground truth', 'text']
predicted

Unnamed: 0,prediction,ground truth,text
0,0.0,0.0,specul sourc notabl sens gener known eg appear...
1,0.0,0.0,agre complet polici debat belong somewher els ...
2,0.0,0.0,didnt malign anyon person anyon name mention a...
3,0.0,0.0,may misrepres said repeatedli past may opinion...
4,0.0,0.0,mark amerikaa far tell compar work look mark p...
5,0.0,0.0,give messag
6,0.0,0.0,krystal jenkin hi ad dbbio templat articl krys...
7,0.0,0.0,problem may want drop unblock request talk pag...
8,0.0,0.0,wont let dont
9,0.0,0.0,say leav everyth written psa written togeth al...


#### Save the SVM model

In [23]:
joblib.dump(svm_clf, 'pipeline_SVM.pkl')

['pipeline_SVM.pkl']

### Deep Learning Pipeline with RNNs and TensorFlow

In [24]:
MAX_DOCUMENT_LENGTH = 5000
EMBEDDING_SIZE = 100
n_words = 0
MAX_LABEL = 3
WORDS_FEATURE = 'text'

In [25]:
def softmax_classification(logits, y, mode):
    
    predicted = tf.argmax(logits, 1)
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'class': predicted,
                'prob': tf.nn.softmax(logits)
            })
    
    objective = tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        # no need to tune learning rate with Adam optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
        trainScore = optimizer.minimize(objective, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=objective, train_op=trainScore)
    
    evaluated = {'accuracy': tf.metrics.accuracy(labels=y, predictions=predicted)}
    
    return tf.estimator.EstimatorSpec(mode=mode, loss=objective, eval_metric_ops=evaluated)

In [26]:
def RNN(features, labels, mode):
    
    word_vectors = tf.contrib.layers.embed_sequence(
        features[WORDS_FEATURE], vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
    word_list = tf.unstack(word_vectors, axis=1)
    
    cell = tf.nn.rnn_cell.GRUCell(EMBEDDING_SIZE)
    
    _, encoding = tf.nn.static_rnn(cell, word_list, dtype=tf.float32)
    
    logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)
    
    return softmax_classification(logits=logits, y=labels, mode=mode)

In [27]:
global n_words
tf.logging.set_verbosity(tf.logging.INFO)

processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
trainTransform = processor.fit_transform(trainSet.text)
testTransform = processor.fit_transform(testSet.text)
trainTransform = np.array(list(trainTransform))
testTransform = np.array(list(testTransform))

n_words = len(processor.vocabulary_)
num_steps = 100 # number of optimization steps

Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.


#### Model Training

In [28]:
trainLabels = np.array(trainSet.toxicity)
testLabels = np.array(testSet.toxicity)

In [None]:
rnn = tf.estimator.Estimator(model_fn=RNN)
trainFunc = tf.estimator.inputs.numpy_input_fn(
    x={WORDS_FEATURE: trainTransform},
    y=trainLabels,
    batch_size=len(trainTransform),
    num_epochs=None,
    shuffle=True)
rnn.train(input_fn=trainFunc, steps=num_steps)

#### Generate Predictions on Test Set

In [None]:
testFunc = tf.estimator.inputs.numpy_input_fn(
    x={WORDS_FEATURE: testTransform}, y=testLabels, num_epochs=1, shuffle=False)
pred = rnn.predict(input_fn=testFunc)
pred = pd.DataFrame(pred)
pred['text'] = list(testSet.text)
pred

#### RNN Classification Report

In [None]:
predicted = np.array(pred['class'])
print(classification_report_imbalanced(predicted, testSet.toxicity))

#### Save the RNN model

In [None]:
joblib.dump(rnn, 'pipeline_RNN.pkl')