# A Toxic Comment Identifier Application - Doc2Vec Data Transformation

### Public Methods
Below are public methods which can be called to generate vectors for a dataframe that is passed in.

In [23]:
from gensim.models import doc2vec
from nltk.tokenize import word_tokenize
from numpy import savetxt
import pickle
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestCentroid
from sklearn.model_selection import GridSearchCV

In [24]:
def get_doc2vec_vectors(model, data):
    """
    Train the given Doc2Vec model on the comments in ``data`` and return one
    inferred vector per comment.

    :param model : a doc2vec model that has already been initialized and defined
    :param data : the comments to vectorize; each item produced by iterating
                  over ``data`` is tokenized as one document
    :return a list of vectors corresponding to the data passed in
    """
    comment_tokens = tokenize_comments(data)
    training_docs = get_tagged_documents(comment_tokens)

    # The vocabulary is built from a single pass over the tagged documents.
    model.build_vocab(doc for doc in training_docs)

    # Train using the corpus size recorded by build_vocab and the epoch count
    # the model was configured with.
    model.train(
        training_docs,
        total_examples=model.corpus_count,
        epochs=model.epochs,
    )

    # An empty file name tells infer_vectors not to write the vectors to disk.
    return infer_vectors(model, comment_tokens, "")

def infer_vectors(model, tokenized_comments, save_file_name=""):
    """
    Infer one vector per tokenized comment using ``model.infer_vector``.

    :param model : a Doc2Vec model which is already trained with vocab built
    :param tokenized_comments : an iterable of tokenized comments (each a list
                                of tokens) to infer vectors for
    :param save_file_name [OPTIONAL] : if a non-empty file name is provided,
                                the vectors are also written to it with
                                numpy's savetxt; "" or None skips saving
    :return a list with one inferred vector per comment, in input order
    """
    vectors = [model.infer_vector(comment) for comment in tokenized_comments]

    # Save only when a file name was actually supplied; treating None like ""
    # avoids handing None to savetxt.
    if save_file_name:
        print("Saving vectors to file: " + str(save_file_name))
        savetxt(save_file_name, vectors)

    return vectors

### Helper Methods

Below are helper methods used by the public methods above.

In [25]:
def tokenize_comments(dataframe):
    """
    Tokenize every comment produced by iterating over ``dataframe``.

    :param dataframe: an iterable of comments; each item is handed to
                      tokenize_each_comment
    :return: a list with one tokenized comment per item, in iteration order
    """
    # NOTE(review): iterating a pandas DataFrame yields column labels, so this
    # presumably expects a single column (Series) of comment strings - confirm.
    return [tokenize_each_comment(comment) for comment in dataframe]
def tokenize_each_comment(comment):
    """
    Tokenize a single comment with nltk's word_tokenize.

    :param comment: a single comment to tokenize
    :return: a new list holding the tokens in the order word_tokenize yields them
    """
    return list(word_tokenize(comment))

def tagged_document(list_of_tokenized_comments):
    """
    Lazily yield one TaggedDocument per tokenized comment, for training a
    Doc2Vec model.

    :param list_of_tokenized_comments: A list of tokenized comments
    :return: a generator of TaggedDocument objects; each is tagged with the
             position of its comment in the input
    """
    for position, tokens in enumerate(list_of_tokenized_comments):
        yield doc2vec.TaggedDocument(words=tokens, tags=[position])

def get_tagged_documents(list_of_tokenized_comments):
    """
    Materialize the tagged documents for a list of tokenized comments.

    :param list_of_tokenized_comments: A list of tokenized comments
    :return: a list containing every document yielded by tagged_document
    """
    documents = tagged_document(list_of_tokenized_comments)
    return [document for document in documents]


### Hyperparameter Tuning of Doc2Vec Model

To tune the customizable parameters of a Doc2Vec model, I used the NearestCentroid classifier to validate each parameter combination, and I wrote a custom method to run the search.  First, I created a list of parameter dictionaries, one for each combination of the values I wanted to try.



In [26]:
import itertools

# Candidate values for each tunable Doc2Vec hyperparameter.
dm = [0, 1]                # training algorithm (gensim: 0 = PV-DBOW, 1 = PV-DM)
vector_size = [500, 1000]  # dimensionality of the document vectors
window = [2, 5]            # context window size
hs = [1]                   # hierarchical softmax flag

# One configuration dict per combination of the candidate values above.
# 'hs' is taken from the product (instead of being hard-coded to 1) so that
# adding values to the `hs` list really changes the generated configurations.
paramsList = [
    {
        'dm': dm_value,
        'vector_size': size_value,
        'window': window_value,
        'hs': hs_value,
        'negative': 0,
    }
    for dm_value, size_value, window_value, hs_value
    in itertools.product(dm, vector_size, window, hs)
]

print(paramsList)

[{'dm': 0, 'vector_size': 500, 'window': 2, 'hs': 1, 'negative': 0}, {'dm': 0, 'vector_size': 500, 'window': 5, 'hs': 1, 'negative': 0}, {'dm': 0, 'vector_size': 1000, 'window': 2, 'hs': 1, 'negative': 0}, {'dm': 0, 'vector_size': 1000, 'window': 5, 'hs': 1, 'negative': 0}, {'dm': 1, 'vector_size': 500, 'window': 2, 'hs': 1, 'negative': 0}, {'dm': 1, 'vector_size': 500, 'window': 5, 'hs': 1, 'negative': 0}, {'dm': 1, 'vector_size': 1000, 'window': 2, 'hs': 1, 'negative': 0}, {'dm': 1, 'vector_size': 1000, 'window': 5, 'hs': 1, 'negative': 0}]


In [27]:
#%run Toxic_App_Rochhio_Classifier.ipynb

# Load the pre-cleaned data set. The context manager closes the file handle
# even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code from the file - only load
# pickles from a trusted source.
with open('clean_data1.p', 'rb') as file_object:
    clean_data = pickle.load(file_object)

# The first two entries of the pickled object are used as the train and test sets.
train_ds = clean_data[0]
test_ds = clean_data[1]

# Comment text (model input) and toxicity level (label) for each split.
train_comments_df = train_ds['comment_text']
test_comments_df = test_ds['comment_text']
train_labels_df = train_ds['toxicity_level']
test_labels_df = test_ds['toxicity_level']

def evaluateDoc2VecParams():
    """
    Train and evaluate one Doc2Vec model per configuration in ``paramsList``.

    Reads the module-level ``paramsList``, ``train_comments_df``,
    ``test_comments_df``, ``train_labels_df`` and ``test_labels_df``.
    For each configuration a Doc2Vec model is trained on the training
    comments, vectors are inferred for the training and test comments, and
    both are handed to ``rocchio_classifier_nearest_centroid``.

    A configuration that raises is reported with a printed message and skipped,
    so one bad configuration does not stop the remaining ones.

    NOTE(review): ``rocchio_classifier_nearest_centroid`` is not defined in
    this notebook; presumably it comes from Toxic_App_Rochhio_Classifier.ipynb
    (its %run line is commented out). While it is unavailable, every
    configuration fails with a NameError that is reported by the handler below.

    :return: a list of ``(param, result)`` tuples, one per configuration that
             was evaluated without error; ``result`` is whatever
             ``rocchio_classifier_nearest_centroid`` returns (may be None -
             TODO confirm)
    """
    # Tokenizing, tagging and label conversion do not depend on the
    # hyperparameters, so they are done once instead of once per configuration.
    train_tokenized_comments = tokenize_comments(train_comments_df)
    train_tagged_documents = get_tagged_documents(train_tokenized_comments)
    tokenized_test_comments = tokenize_comments(test_comments_df)
    train_labels = np.array(train_labels_df)
    test_labels = np.array(test_labels_df)

    scoreList = []
    for param in paramsList:
        print("Evaluating "+str(param))
        try:
            # Passing the documents to the constructor builds the vocabulary
            # and trains the model, so no second build_vocab/train pass is
            # needed before inferring vectors.
            d2v_model = doc2vec.Doc2Vec(train_tagged_documents,
                                        dm=param['dm'],
                                        vector_size=param['vector_size'],
                                        window=param['window'],
                                        min_count=1,
                                        epochs=10,
                                        hs=param['hs'],
                                        # Without this the 'negative' entry of
                                        # the configuration was silently
                                        # replaced by the library default.
                                        negative=param['negative'],
                                        seed=516)

            # An empty file name means the vectors are not written to disk.
            train_vectors = infer_vectors(d2v_model, train_tokenized_comments, "")
            test_vectors = infer_vectors(d2v_model, tokenized_test_comments, "")

            # Classify the test vectors against the training vectors.
            result = rocchio_classifier_nearest_centroid(train_vectors, train_labels, test_vectors, test_labels)
            scoreList.append((param, result))
        except Exception as error:
            print(f'Cannot evaluate model with parameters {param} because of error: {error}')
            continue
    return scoreList

#evaluateDoc2VecParams()


In [28]:
# Tokenizer settings (only referenced by the commented-out grid entries below)
min_DF = [1, 3, 10]
max_DF = [0.6, 0.8, 0.9]
#norm = ['l1','l2']

# Dimensionality reduction
n_components = [115]

# Classifier
# 'euclidean' was misspelled as 'euclidian', which is not a valid metric name
# and makes every fit that uses it fail.
# NOTE(review): newer scikit-learn releases restrict NearestCentroid to
# 'euclidean' and 'manhattan'; confirm 'cosine' is accepted by the installed
# version.
metric = ['euclidean', 'cosine']
classes = ['1','2','3']

# Parameter grid for GridSearchCV: keys name pipeline steps ('reduce_dim') or
# step parameters ('<step>__<parameter>').
rocchio_params =[
    {
        # 'token_value': [CountVectorizer(), TfidfVectorizer()],
        # 'token_value__min_df': min_DF,
        # 'token_value__max_df': max_DF,
        'reduce_dim': ['passthrough'],
        'clf__metric': metric
    }
    # {
    #     'token_value': [CountVectorizer(), TfidfVectorizer()],
    #     'token_value__min_df': min_DF,
    #     'token_value__max_df': max_DF,
    #     'reduce_dim': [TruncatedSVD()],
    #     'reduce_dim__n_components': n_components,
    #     'clf__metric': metric
    # }
]

In [29]:
# Three-step pipeline: the tokenizing and dimensionality-reduction steps are
# 'passthrough' placeholders, followed by the NearestCentroid classifier.
pipe_rocchio = Pipeline(steps=[
    ('token_value', 'passthrough'),
    ('reduce_dim', 'passthrough'),
    ('clf', NearestCentroid()),
])

In [30]:
def run_on_subset(pipe_params, subset):
    """
    Grid-search a pipeline on the training split, print the best parameters
    and scores, and return the best estimator.

    :param pipe_params: two-item sequence ``(pipe, parameters)`` - the
                        estimator/pipeline to tune and its parameter grid
                        (a dict, or a list of dicts)
    :param subset: four-item sequence
                   ``(train_data, train_labels, test_data, test_labels)``
    :return: ``best_estimator_`` of the fitted GridSearchCV
    """
    pipe, parameters = pipe_params
    train_data, train_labels, test_data, test_labels = subset

    # Cross-validated grid search over the supplied parameter grid.
    grid_search = GridSearchCV(
        estimator=pipe,
        param_grid=parameters,
        n_jobs=2,
    )
    grid_search.fit(train_data, train_labels)

    best_estimator = grid_search.best_estimator_
    best_parameters = best_estimator.get_params()
    print('Pipeline = {}'.format(pipe))

    print('Best parameters = ')
    if isinstance(parameters, list):
        # Several grids may share parameter names; dict keys keep the first-seen
        # order while dropping duplicate report lines.
        report_lines = {}
        for param_dict in parameters:
            if not isinstance(param_dict, dict):
                continue
            for param_name in sorted(param_dict.keys()):
                if param_name in best_parameters:
                    line = str(param_name) + ': ' + str(best_parameters[param_name])
                    report_lines[line] = None
        print(*report_lines, sep='\n')
    else:
        for param_name in sorted(parameters.keys()):
            print(f"{param_name}: {best_parameters[param_name]}")

    test_accuracy = grid_search.score(test_data, test_labels)
    # The message previously said "random search", but this is a GridSearchCV.
    print(
        "Accuracy of the best parameters using the inner CV of "
        f"the grid search: {grid_search.best_score_:.3f}"
    )
    print(f"Accuracy on test set: {test_accuracy:.3f}")
    return best_estimator

In [31]:
#[pipe_rocchio, rocchio_params]
#[train_df_subset1, train_lab_subset1, test_df_subset1, test_lab_subset1]
#train_comments_df = train_ds['comment_text']
#test_comments_df = test_ds['comment_text']
#train_labels_df = train_ds['toxicity_level']
#test_labels_df = test_ds['toxicity_level']

#rocchio_best = run_on_subset([pipe_rocchio, rocchio_params], [train_comments_df, train_labels_df, test_comments_df, test_labels_df])

10 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/lisasaurus01/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lisasaurus01/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/lisasaurus01/opt/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_nearest_centroid.py", line 127, in fit
    X, y = self._validate_data(X, y, accept_sparse=["csr", "csc"])
  File "/Users/lisasaurus01/opt/a

ValueError: could not convert string to float: 'sorry break unsourced film plot much error unsourced interpretation lyric editor would quite within right remove require verifiable source'