# A Toxic Comment Identifier Application - Rocchio Classifier

This notebook contains all methods related to creating and tuning the model for the Rocchio Classifier.  This notebook was mostly used for tuning the Doc2Vec model in the Toxic_App_Doc2Vec.ipynb notebook.

In [5]:
import numpy as np

### Methods for Custom Rocchio Classifier

Below are methods to use the Rocchio Classifier.

In [3]:
def get_number_of_classes(classes):
    """Helper that counts the distinct class labels in ``classes``.

    :param classes: an array-like of class labels (duplicates allowed)
    :returns: the number of unique labels found
    """
    distinct_labels = np.unique(classes)
    return distinct_labels.shape[0]

"""A Custom Training Function for the Rocchio Classification Algorithm
:param train: an np array of the training data formatted as a doc-term frequency matrix,
a tf*idf matrix or else a doc2vec vector matrix
:param labels: an np array of the corresponding labels for the data
:returns: the prototype vectors for each class (a.k.a. label)
"""
def rocchio_train(train, labels):
    """Build one prototype vector per class for the Rocchio classifier.

    The prototype of a class is the element-wise sum of every training vector
    carrying that label.  The sum is not divided by the class size; the
    classifier in this notebook compares by cosine similarity, which ignores
    vector length.

    :param train: the training vectors, one row per document (indexable by
        position, each row convertible to a 1-D float array)
    :param labels: the class label of each row of ``train``; labels must be
        hashable because they become dictionary keys
    :returns: a dictionary mapping each class label seen in the training rows
        to its prototype vector (a 1-D float array)
    :raises IndexError: if ``labels`` has fewer entries than ``train``
    """
    prototype = {}

    for i in range(len(train)):
        label = labels[i]
        vector = np.asarray(train[i], dtype=float)

        if label in prototype:
            prototype[label] = prototype[label] + vector
        else:
            # Start the running sum from a copy so the prototype never aliases
            # (and later code can never mutate) a row of the caller's matrix.
            prototype[label] = vector.copy()

    return prototype

"""A Classifier Function for the Rocchio Classification Algorithm
This function uses cosine similarity for the distance metric
:param prototype: a dictionary of prototype vectors where the class is a key and the value is the vector
for the corresponding class.
:param test_instance: the test instance to classify
:returns: the predicted class
"""
def rocchio_classifier(prototype, test_instance):
    """Classify one instance by cosine similarity to the class prototypes.

    The cosine similarity between each prototype vector and ``test_instance``
    is ``dot(p, x) / (||p|| * ||x||)``; the class with the largest similarity
    wins.  On an exact tie the class that comes first in ``prototype`` is kept.

    :param prototype: a dictionary mapping each class label to its prototype
        vector
    :param test_instance: the vector to classify, with the same length as the
        prototype vectors
    :returns: the label of the most similar prototype, or -1 when no
        similarity could be computed (``prototype`` is empty, or the vectors
        involved have zero length so the cosine is undefined)
    """
    # Cosine similarity lies in [-1, 1], so -2 is below anything computable.
    best_similarity = -2
    predicted_class = -1  # default when no class can be scored

    # The instance norm does not depend on the class, so compute it once.
    test_instance_norm = np.linalg.norm(test_instance)

    for classLabel, term_vector in prototype.items():
        denominator = np.linalg.norm(term_vector) * test_instance_norm
        if denominator == 0:
            # Zero-length vector: cosine similarity is undefined (0/0), so
            # this class cannot be scored and is skipped.
            continue

        cos_sim = np.dot(term_vector, test_instance) / denominator

        if cos_sim > best_similarity:
            best_similarity = cos_sim
            predicted_class = classLabel

    return predicted_class

"""An Evaluation Function for the Rocchio Classification Algorithm
:param test_data: an np array of the test data formatted as a doc-term frequency matrix,
a tf*idf matrix or else a doc2vec vector matrix
:param test_labels: an np array of the corresponding labels for the data
:param prototype: a dictionary of prototype vectors to use for classification
:returns: the accuracy of the model
"""
def rocchio_evaluate(test_data, test_labels, prototype):
    """Evaluate the Rocchio classifier on a labelled test set.

    Every test instance is classified with ``rocchio_classifier`` and compared
    with its expected label.  A one-line summary (number correct and percent
    accuracy) is printed, and the accuracy is also returned so callers can
    collect scores across runs.

    :param test_data: the test vectors, one row per document
    :param test_labels: the expected class label of each row of ``test_data``
    :param prototype: a dictionary of prototype vectors to use for
        classification
    :returns: the percent accuracy (0.0 to 100.0); 0.0 when ``test_data`` is
        empty
    """
    num_correct = 0

    # iterate through the test instances and call the rocchio classifier
    test_data_len = len(test_data)
    for i in range(test_data_len):
        predicted_label = rocchio_classifier(prototype, test_data[i])
        if predicted_label == test_labels[i]:
            num_correct = num_correct + 1

    if test_data_len:
        accuracy = (num_correct / test_data_len * 100.0)
    else:
        # Nothing to score: report 0.0 rather than dividing by zero.
        accuracy = 0.0

    print("Rocchio classifier - Number of test instances classified correctly:" + str(
        num_correct) + " Percent Accuracy: " + str(accuracy))
    return accuracy

### Custom Doc2Vec Parameter Tuning

In [4]:
import itertools
# Candidate values for each Doc2Vec hyperparameter explored by the grid search.
dm = [0, 1]
vector_size = [500, 1000]
window = [2, 5]
hs = [1]

# One dictionary per combination of the candidate values above, in
# itertools.product order (the last list varies fastest).  'hs' is taken from
# the product so that editing the `hs` list actually changes the grid;
# 'negative' is pinned to 0 for every combination.
paramsList = [
    {
        'dm': dm_value,
        'vector_size': vector_size_value,
        'window': window_value,
        'hs': hs_value,
        'negative': 0,
    }
    for dm_value, vector_size_value, window_value, hs_value
    in itertools.product(dm, vector_size, window, hs)
]

#print(paramsList)

[{'dm': 0, 'vector_size': 500, 'window': 2, 'hs': 1, 'negative': 0}, {'dm': 0, 'vector_size': 500, 'window': 5, 'hs': 1, 'negative': 0}, {'dm': 0, 'vector_size': 1000, 'window': 2, 'hs': 1, 'negative': 0}, {'dm': 0, 'vector_size': 1000, 'window': 5, 'hs': 1, 'negative': 0}, {'dm': 1, 'vector_size': 500, 'window': 2, 'hs': 1, 'negative': 0}, {'dm': 1, 'vector_size': 500, 'window': 5, 'hs': 1, 'negative': 0}, {'dm': 1, 'vector_size': 1000, 'window': 2, 'hs': 1, 'negative': 0}, {'dm': 1, 'vector_size': 1000, 'window': 5, 'hs': 1, 'negative': 0}]


In [14]:
# from gensim.models import doc2vec
# import pickle
# import numpy as np
# import pandas as pd
#
# #%run Doc2Vec.ipynb
#
# file_object = open('clean_data1.p', 'rb')
# clean_data = pickle.load(file_object)
# train_ds = clean_data[0]
# test_ds = clean_data[1]
#
# #pre-process the corpus
# train_comments_df = train_ds['comment_text']
# test_comments_df = test_ds['comment_text']
# train_labels_df = train_ds['toxicity_level']
# test_labels_df = test_ds['toxicity_level']
#
# def evaluateDoc2VecParams():
#     # Tag docs
#     train_tokenized_comments = tokenize_comments(train_ds['comment_text'])
#     train_tagged_documents = get_tagged_documents(train_tokenized_comments)
#     scoreList = []
#     for param in paramsList:
#       print("Evaluating "+str(param))
#       try:
#         d2v_model = doc2vec.Doc2Vec(train_tagged_documents,
#                         dm=param['dm'],
#                         vector_size=param['vector_size'],
#                         window=param['window'],
#                         min_count=1,
#                         epochs=10,
#                         hs=param['hs'],
#                         seed=516)
#         train_vectors = get_doc2vec_vectors(d2v_model,train_comments_df)
#
#         tokenized_test_comments = tokenize_comments(test_comments_df)
#         test_vectors = infer_vectors(d2v_model, tokenized_test_comments, "")
#
#         train_labels = np.array(train_labels_df)
#
#         #classify test data using prototype vectors
#         test_labels = np.array(test_labels_df)
#
#         rocchio_classifier_nearest_centroid(train_vectors, train_labels, test_vectors, test_labels)
#         # generate the prototype vectors
#         #prototype_vectors = rocchio_train(train_vectors, train_labels)
#
#         #classify test data using prototype vectors
#         #rocchio_evaluate(test_vectors, test_labels, prototype_vectors)
#       except Exception as error:
#         print(f'Cannot evaluate model with parameters {param} because of error: {error}')
#         continue
#     return scoreList
#
# evaluateDoc2VecParams()


Evaluating {'dm': 0, 'vector_size': 500, 'window': 2, 'hs': 1, 'negative': 0}
Nearest Centroid classifier - Number of test instances classified correctly:957 Percent Accuracy: 63.12664907651715
Evaluating {'dm': 0, 'vector_size': 500, 'window': 5, 'hs': 1, 'negative': 0}
Nearest Centroid classifier - Number of test instances classified correctly:955 Percent Accuracy: 62.99472295514512
Evaluating {'dm': 0, 'vector_size': 1000, 'window': 2, 'hs': 1, 'negative': 0}
Nearest Centroid classifier - Number of test instances classified correctly:959 Percent Accuracy: 63.25857519788918
Evaluating {'dm': 0, 'vector_size': 1000, 'window': 5, 'hs': 1, 'negative': 0}
Nearest Centroid classifier - Number of test instances classified correctly:958 Percent Accuracy: 63.19261213720316


[]