# Rocchio Classifier

This notebook contains all methods related to creating and tuning the model for the Rocchio Classifier.

A pickle file, `clean_data1.p`, was created to store all of the clean data from our training data set.  Below, the clean data is read and loaded into pandas DataFrames (one for training, one for testing).

In [34]:
import pandas as pd
import numpy as np
import pickle
%run Doc2Vec.ipynb

In [35]:
# Load the pre-cleaned data set from disk. The context manager guarantees the file
# handle is closed even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code while deserializing; only load
# pickle files that were produced by this project.
with open('clean_data1.p', 'rb') as file_object:
    clean_data = pickle.load(file_object)

# The first element of the pickled sequence is used as the training set and the second
# as the test set.
train_ds = clean_data[0]
test_ds = clean_data[1]
#exploring test data
#test_labels_ds = pd.read_csv('test_labels.csv', header=0, na_values=-1)
#test_labels_ds = test_labels_ds[~test_labels_ds.isnull().any(axis=1)]

#print(test_labels_ds.shape)

#print(test_labels_ds.head())

# Sanity check: report (rows, columns) of each set.
print(train_ds.shape)
print(test_ds.shape)

(5329, 6)
(1669, 6)


In [36]:
from gensim.models import doc2vec
%run Doc2Vec.ipynb
# Keep only the comment text column of each set.
train_comments_df = train_ds['comment_text']
test_comments_df = test_ds['comment_text']
# gensim Doc2Vec model configured for 1000-dimensional vectors, a minimum word count
# of 5 and 40 training epochs.
d2v_model = doc2vec.Doc2Vec(vector_size = 1000, min_count = 5, epochs = 40)

# get_doc2vec_vectors is defined in Doc2Vec.ipynb (not visible here); presumably it trains
# d2v_model on the training comments and returns one vector per comment - TODO confirm.
train_vectors = get_doc2vec_vectors(d2v_model, train_comments_df)

# The test comments are tokenized and passed to infer_vectors together with the same model.
# NOTE(review): the meaning of the empty-string third argument is defined in
# Doc2Vec.ipynb - confirm against that notebook.
tokenized_test_comments = tokenize_comments(test_comments_df)
test_vectors = infer_vectors(d2v_model, tokenized_test_comments, "")

# Store the test labels (the 'toxicity_level' column) for the evaluation cells below.
test_labels = test_ds['toxicity_level']

Inferring 5329 comments into doc2vec vectors.
Inferring 5329 comments into doc2vec vectors.
Created 5329 doc2vec vectors.
Inferring 1669 comments into doc2vec vectors.
Created 1669 doc2vec vectors.


### Methods for Custom Rocchio Classifier

Below are methods to use the Rocchio Classifier.

In [37]:
# helper function
def get_number_of_classes(classes):
    """Return how many distinct class labels occur in ``classes``.

    :param classes: a sequence or np array of class labels
    :returns: the number of unique labels
    """
    distinct_labels = np.unique(classes)
    return len(distinct_labels)

def compute_tf_idf_matrix(trainingData):
    """Weight a term-document frequency matrix by inverse document frequency.

    :param trainingData: a 2-D np array or pandas DataFrame of term frequencies in which
    each row is a term and each column is a document
    :returns: the TFxIDF matrix, same shape (and container type) as ``trainingData``

    Terms that appear in no document get an IDF of 0 so their rows stay 0 instead of
    becoming NaN (0 * log2(N / 0)).
    Side effect: adjusts the global numpy and pandas display precision.
    """
    # Document frequency: for each term (row), the number of documents (columns) in which
    # it appears at least once. Kept as a column vector so it broadcasts across documents.
    docFreq = np.asarray((trainingData != 0).sum(axis=1), dtype=float).reshape(-1, 1)

    numDocs = trainingData.shape[1]
    np.set_printoptions(precision=2, suppress=True, linewidth=120)

    # IDF is the log of the inverse document frequency. It is only a function of the term,
    # so every column of the final idf matrix is identical.
    idf_per_term = np.zeros_like(docFreq)
    appears = docFreq > 0  # guard against division by zero for unseen terms
    idf_per_term[appears] = np.log2(numDocs / docFreq[appears])

    # Expand to the full matrix shape so the product below is a plain element-wise one.
    idf = idf_per_term * np.ones(np.shape(trainingData), dtype=float)

    # Finally compute the TFxIDF values for each term-document entry.
    tf_idf = trainingData * idf
    pd.set_option("display.precision", 2)
    return tf_idf

def rocchio_train(train, labels):
    """A Training Function for the Rocchio Classification Algorithm

    Builds one prototype vector per class by summing every training vector that carries
    that class label. The sum is not divided by the class size, so each prototype is an
    unnormalized centroid.

    :param train: an np array of the training data formatted as a doc-term frequency matrix,
    a tf*idf matrix or else a doc2vec vector matrix (one row per document)
    :param labels: an np array (or other sequence) of the corresponding labels for the data
    :returns: a dictionary with the unique class labels as keys and one-dimensional float
    arrays (the prototype vectors) as values
    :raises ValueError: if ``train`` and ``labels`` have different lengths
    """
    # Work on a plain array so positional indexing is safe even when the labels arrive as
    # a pandas Series whose index is not 0..n-1.
    labels = np.asarray(labels)
    if len(train) != len(labels):
        raise ValueError("train and labels must have the same length: "
                         + str(len(train)) + " != " + str(len(labels)))

    unique_labels = np.unique(labels)
    print("The number of classes:" + str(len(unique_labels)))

    prototype = {}
    if len(unique_labels) == 0:
        # nothing to train on
        return prototype

    # Start every class with a zero vector as long as one document vector.
    vector_length = len(train[0])
    for label in unique_labels:
        prototype[label] = np.zeros(vector_length, dtype=float)

    # Accumulate each document vector into the prototype of the class it belongs to.
    for i, label in enumerate(labels):
        prototype[label] = prototype[label] + np.array(train[i])

    return prototype

def rocchio_classifier(prototype, test_instance):
    """A Classifier Function for the Rocchio Classification Algorithm

    Assigns the test instance to the class whose prototype vector has the highest cosine
    similarity with it.

    :param prototype: a dictionary of prototype vectors where the class is a key and the
    value is the vector for the corresponding class
    :param test_instance: the test instance (a one-dimensional vector) to classify
    :returns: the predicted class, or -1 if no prototype yields a defined similarity
    (empty dictionary, all-zero test instance, or only all-zero prototypes)
    """
    best_similarity = -2  # below the smallest possible cosine similarity (-1)
    predicted_class = -1  # initialize the predicted class to -1 by default

    # The norm of the test instance does not depend on the class, so compute it once.
    test_instance_norm = np.linalg.norm(test_instance)

    for classLabel, term_vector in prototype.items():
        # cosine similarity = dot product / (product of the two L2 norms)
        denominator = np.linalg.norm(term_vector) * test_instance_norm
        if denominator == 0:
            # similarity is undefined for a zero vector; such a class can never win
            continue

        similarity = np.dot(term_vector, test_instance) / denominator

        # strict '>' keeps the first class encountered when similarities tie
        if similarity > best_similarity:
            best_similarity = similarity
            predicted_class = classLabel

    return predicted_class

def rocchio_evaluate(test_data, test_labels, prototype, verbose=True):
    """An Evaluation Function for the Rocchio Classification Algorithm

    Classifies every test instance with ``rocchio_classifier`` and reports how many
    predictions match the true labels.

    :param test_data: an np array of the test data formatted as a doc-term frequency matrix,
    a tf*idf matrix or else a doc2vec vector matrix (one row per document)
    :param test_labels: an np array (or other sequence) of the corresponding labels for the data
    :param prototype: a dictionary of prototype vectors to use for classification
    :param verbose: when True (the default) print the predicted and true label of every
    test instance; pass False to print only the summary line
    :returns: the accuracy of the model as a percentage (0.0 when there is no test data)
    """
    # Plain array so positional indexing also works for a pandas Series with a custom index.
    test_labels = np.asarray(test_labels)
    num_correct = 0

    # iterate through the test instances and call the rocchio classifier
    test_data_len = len(test_data)
    for i in range(test_data_len):
        predicted_label = rocchio_classifier(prototype, test_data[i])
        test_label = test_labels[i]
        if verbose:
            print("Predicted Label:" + str(predicted_label) + " " + "Test Label:" + str(test_label))
        if predicted_label == test_label:
            num_correct = num_correct + 1

    # avoid a ZeroDivisionError when there is nothing to evaluate
    accuracy = (num_correct / test_data_len * 100.0) if test_data_len else 0.0
    print("Rocchio classifier - Number of test instances classified correctly:" + str(
        num_correct) + " Percent Accuracy: " + str(accuracy))
    return accuracy

### Running Custom Rocchio Implementation using Doc2Vec vectors

In [38]:
# Convert the training vectors and the 'toxicity_level' labels to np arrays so the
# Rocchio functions can index them by position.
train_vectors = np.array(train_vectors)
train_labels = np.array(train_ds['toxicity_level'])

# Generate one prototype vector per class from the training data.
prototype_vectors = rocchio_train(train_vectors, train_labels)

# Classify the test data using the prototype vectors. The test inputs are converted to
# np arrays first; these names are reused by the nearest-centroid cell further down.
test_vectors = np.array(test_vectors)
test_labels = np.array(test_labels)
rocchio_evaluate(test_vectors, test_labels, prototype_vectors)

The number of classes:4
Predicted Label:4 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:2 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:3 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:2 Test Label:1
Predicted Label:2 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:4 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:4 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:3 Test Label:1
Predicted Label:2 Test Label:1
Predicted Label:4 Test Label:1
Predicted Label:1 Test Label:1
Predicted Label:3 Test Label:1
Predicted Label:4 Test Label:1
Predicted Label

### Running Nearest Centroid from sklearn using Doc2Vec vectors

### Methods for Nearest Centroid

In [39]:
from sklearn.neighbors import NearestCentroid
def rocchio_classifier_nearest_centroid(train, train_labels, test_data, test_labels):
    """A Classifier Function for the Nearest Centroid Algorithm

    Fits scikit-learn's ``NearestCentroid`` (with its default settings) on the training
    data, predicts the test data and reports how many predictions match the true labels.

    :param train: an np array of the train data formatted as a doc-term frequency matrix,
    a tf*idf matrix or else a doc2vec vector matrix
    :param train_labels: an np array of the corresponding labels for the train data
    :param test_data: an np array of the test data formatted as a doc-term frequency matrix,
    a tf*idf matrix or else a doc2vec vector matrix
    :param test_labels: an np array of the corresponding labels for the test data
    :returns: the accuracy of the model as a percentage
    """
    clf = NearestCentroid()
    clf.fit(train, train_labels)
    predictions = clf.predict(test_data)

    # Count matching predictions in one vectorized comparison instead of an index loop.
    test_data_len = len(test_data)
    numCorrect = int(np.sum(predictions == np.asarray(test_labels)))

    accuracy = (numCorrect / test_data_len * 100.0)
    print("Nearest Centroid classifier - Number of test instances classified correctly:" + str(
        numCorrect) + " Percent Accuracy: " + str(accuracy))
    return accuracy


In [40]:
# Evaluate scikit-learn's NearestCentroid on the same Doc2Vec vectors and labels that were
# used for the custom Rocchio classifier above.
rocchio_classifier_nearest_centroid(train_vectors, train_labels, test_vectors, test_labels)

Nearest Centroid classifier - Number of test instances classified correctly:800 Percent Accuracy: 47.93289394847214
