# Creates doc2vec models

## Disk Writing Classes

In [None]:
import json
import os
from contextlib import closing

import pandas as pd

# Directory that receives the inferred-vector JSON files written by this notebook.
doc2vec_directory = "doc2vec_outputs/"

# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(doc2vec_directory, exist_ok=True)

    
class JSONStorage:
    """Writer that stores one JSON document per line of a file.

    Instances can be used as context managers so the file is closed even when
    the writing loop raises.

    Args:
        file_name: Path of the output file.
        mode: Mode passed to ``open``. Defaults to ``"a"`` (append), so
            writing to an existing file adds records to it; pass ``"w"`` to
            start from an empty file instead.
    """
    def __init__(self, file_name: str, mode: str = "a"):
        self.__file = open(file_name, mode)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` body.
        return False

    def close(self):
        """Close the underlying file."""
        self.__file.close()

    def add(self, item: dict):
        """Serialise ``item`` with ``json.dumps`` and write it as a single line."""
        self.__file.write(json.dumps(item) + "\n")

## Corpus Iterable Class

These are classes that allow gensim to stream in the json files containing the ngram corpus of both training and testing data.

In [None]:
import json
import gensim

def prepare_ngrams(ngrams_list: list):
    """Lazily yield each ngram of ``ngrams_list`` joined into one token with "_"."""
    yield from map("_".join, ngrams_list)
        
        
class TestingCorpusIterator:
    """Iterator that treats a corpus as testing data.

    Each item is a dictionary with two keys: ``token_list`` (the list of
    ngrams, each joined into a single token) and ``label`` (the author id).

    Args:
        testing_corpus: Object providing ``next()`` (returns one raw JSON
            line), ``get_ngram_key()`` and ``reset()``.
    """
    def __init__(self, testing_corpus):
        self._corpus = testing_corpus

    def __iter__(self):
        # An iterator returns itself so it can be used wherever an iterable is expected.
        return self

    def __next__(self):
        try:
            raw_line = self._corpus.next()
        except (EOFError, StopIteration):
            # An exhausted file object signals the end with StopIteration, not
            # EOFError. Handle both so the corpus is always rewound for the
            # next pass before iteration stops.
            self._corpus.reset()
            raise StopIteration from None
        post = json.loads(raw_line)
        token_list = list(prepare_ngrams(post[self._corpus.get_ngram_key()]))
        return {"token_list": token_list, "label": post['author_id']}
        
        
class TrainingCorpusIterator:
    """Iterator that treats a corpus as training data.

    Each item is a gensim ``TaggedDocument`` whose words are the joined ngrams
    and whose tags are a list holding the post's author id.

    Args:
        training_corpus: Object providing ``next()`` (returns one raw JSON
            line), ``get_ngram_key()`` and ``reset()``.
    """
    def __init__(self, training_corpus):
        self._corpus = training_corpus

    def __iter__(self):
        # An iterator returns itself so it can be used wherever an iterable is expected.
        return self

    def __next__(self):
        try:
            raw_line = self._corpus.next()
        except (EOFError, StopIteration):
            # An exhausted file object signals the end with StopIteration, not
            # EOFError. Without the rewind, every pass after the first one
            # (vocabulary scan, then each training epoch) would be empty.
            self._corpus.reset()
            raise StopIteration from None
        post = json.loads(raw_line)
        token_list = list(prepare_ngrams(post[self._corpus.get_ngram_key()]))
        author_id = post['author_id']
        # gensim documents ``tags`` as a list of tags; a bare string would be
        # iterated character by character, so wrap a single id in a list.
        if isinstance(author_id, (list, tuple)):
            tags = list(author_id)
        else:
            tags = [author_id]
        return gensim.models.doc2vec.TaggedDocument(token_list, tags)
        
        
class JSONCorpus:
    """Re-iterable stream over a file holding one JSON document per line.

    Instances can be used as context managers so the file is closed even when
    the consuming code raises.

    Args:
        file_name: Path of the file to read.
        ngram_key: Key under which each document stores its ngrams.
        token_only: If true, iteration goes through ``TestingCorpusIterator``;
            otherwise it goes through ``TrainingCorpusIterator``.
    """
    def __init__(self, file_name: str, ngram_key: str, token_only: bool):
        self.__file = open(file_name, "r")
        self.__token_only = token_only
        self.__ngram_key = ngram_key

        self.__line_num = 0

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` body.
        return False

    def __iter__(self):
        if self.__token_only:
            return TestingCorpusIterator(self)
        else:
            return TrainingCorpusIterator(self)

    def next(self):
        """Return the next raw line of the file.

        Raises:
            EOFError: If the file is exhausted. The corpus iterators catch
                this exception to rewind the file, whereas the StopIteration
                raised by the file object itself would bypass that handler.
        """
        try:
            line = next(self.__file)
        except StopIteration:
            raise EOFError("end of corpus file reached") from None
        # Report and count only lines that were actually read.
        print("Processing line", self.__line_num)
        self.__line_num += 1
        return line

    def get_ngram_key(self):
        """Return the key under which each document stores its ngrams."""
        return self.__ngram_key

    def reset(self):
        """Rewind the file to its beginning and restart the line counter."""
        print("Finished processing.")
        self.__line_num = 0
        self.__file.seek(0)

    def close(self):
        """Close the underlying file."""
        self.__file.close()

## doc2vec Unigram Characters Model

Creates and trains the doc2vec model using unigram characters.

In [None]:
# closing() releases the corpus file even if vocabulary building or training raises.
with closing(JSONCorpus("jsons/train_unigram_chars.json", 'unigram_chars', False)) as training_corpus:
    unigram_chars_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    print("Building unigram characters vocabulary...")
    unigram_chars_model.build_vocab(training_corpus)
    print("Training unigram characters doc2vec model...")
    unigram_chars_model.train(
        training_corpus,
        total_examples=unigram_chars_model.corpus_count,
        epochs=unigram_chars_model.epochs,
    )

Infer vectors of the unigram characters test corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_testing_unigram_chars.json")) as json_output, \
        closing(JSONCorpus("jsons/test_unigram_chars.json", 'unigram_chars', True)) as inference_corpus:
    for test_post in inference_corpus:
        vector = unigram_chars_model.infer_vector(test_post['token_list'])
        json_output.add({"author_id": test_post["label"], "output": pd.Series(vector).to_json(orient='values')})

Infer vectors of the unigram characters training corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_training_unigram_chars.json")) as json_output, \
        closing(JSONCorpus("jsons/train_unigram_chars.json", 'unigram_chars', True)) as inference_corpus:
    for train_post in inference_corpus:
        vector = unigram_chars_model.infer_vector(train_post['token_list'])
        json_output.add({"author_id": train_post["label"], "output": pd.Series(vector).to_json(orient='values')})

## doc2vec Bigram Characters Model

Creates and trains the doc2vec model using bigram characters.

In [None]:
# closing() releases the corpus file even if vocabulary building or training raises.
with closing(JSONCorpus("jsons/train_bigram_chars.json", 'bigram_chars', False)) as training_corpus:
    bigram_chars_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    print("Building bigram characters vocabulary...")
    bigram_chars_model.build_vocab(training_corpus)
    print("Training bigram characters doc2vec model...")
    bigram_chars_model.train(
        training_corpus,
        total_examples=bigram_chars_model.corpus_count,
        epochs=bigram_chars_model.epochs,
    )

Infer vectors of the bigram characters test corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_testing_bigram_chars.json")) as json_output, \
        closing(JSONCorpus("jsons/test_bigram_chars.json", 'bigram_chars', True)) as inference_corpus:
    for test_post in inference_corpus:
        vector = bigram_chars_model.infer_vector(test_post['token_list'])
        json_output.add({"author_id": test_post["label"], "output": pd.Series(vector).to_json(orient='values')})

Infer vectors of the bigram characters training corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_training_bigram_chars.json")) as json_output, \
        closing(JSONCorpus("jsons/train_bigram_chars.json", 'bigram_chars', True)) as inference_corpus:
    for train_post in inference_corpus:
        vector = bigram_chars_model.infer_vector(train_post['token_list'])
        json_output.add({"author_id": train_post["label"], "output": pd.Series(vector).to_json(orient='values')})

## doc2vec Trigram Characters Model

Creates and trains the doc2vec model using trigram characters.

In [None]:
# closing() releases the corpus file even if vocabulary building or training raises.
with closing(JSONCorpus("jsons/train_trigram_chars.json", 'trigram_chars', False)) as training_corpus:
    trigram_chars_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    print("Building trigram characters vocabulary...")
    trigram_chars_model.build_vocab(training_corpus)
    print("Training trigram characters doc2vec model...")
    trigram_chars_model.train(
        training_corpus,
        total_examples=trigram_chars_model.corpus_count,
        epochs=trigram_chars_model.epochs,
    )

Infer vectors of the trigram characters test corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_testing_trigram_chars.json")) as json_output, \
        closing(JSONCorpus("jsons/test_trigram_chars.json", 'trigram_chars', True)) as inference_corpus:
    for test_post in inference_corpus:
        vector = trigram_chars_model.infer_vector(test_post['token_list'])
        json_output.add({"author_id": test_post["label"], "output": pd.Series(vector).to_json(orient='values')})

Infer vectors of the trigram characters training corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_training_trigram_chars.json")) as json_output, \
        closing(JSONCorpus("jsons/train_trigram_chars.json", 'trigram_chars', True)) as inference_corpus:
    for train_post in inference_corpus:
        vector = trigram_chars_model.infer_vector(train_post['token_list'])
        json_output.add({"author_id": train_post["label"], "output": pd.Series(vector).to_json(orient='values')})

## doc2vec Unigram Words Model

Creates and trains the doc2vec model using unigram words.

In [None]:
# closing() releases the corpus file even if vocabulary building or training raises.
with closing(JSONCorpus("jsons/train_unigram_words.json", 'unigram_words', False)) as training_corpus:
    unigram_words_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    print("Building unigram words vocabulary...")
    unigram_words_model.build_vocab(training_corpus)
    print("Training unigram words doc2vec model...")
    unigram_words_model.train(
        training_corpus,
        total_examples=unigram_words_model.corpus_count,
        epochs=unigram_words_model.epochs,
    )

Infer vectors of the unigram words test corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_testing_unigram_words.json")) as json_output, \
        closing(JSONCorpus("jsons/test_unigram_words.json", 'unigram_words', True)) as inference_corpus:
    for test_post in inference_corpus:
        vector = unigram_words_model.infer_vector(test_post['token_list'])
        json_output.add({"author_id": test_post["label"], "output": pd.Series(vector).to_json(orient='values')})

Infer vectors of the unigram words training corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_training_unigram_words.json")) as json_output, \
        closing(JSONCorpus("jsons/train_unigram_words.json", 'unigram_words', True)) as inference_corpus:
    for train_post in inference_corpus:
        vector = unigram_words_model.infer_vector(train_post['token_list'])
        json_output.add({"author_id": train_post["label"], "output": pd.Series(vector).to_json(orient='values')})

## doc2vec Bigram Words Model

Creates and trains the doc2vec model using bigram words.

In [None]:
# closing() releases the corpus file even if vocabulary building or training raises.
with closing(JSONCorpus("jsons/train_bigram_words.json", 'bigram_words', False)) as training_corpus:
    bigram_words_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    print("Building bigram words vocabulary...")
    bigram_words_model.build_vocab(training_corpus)
    print("Training bigram words doc2vec model...")
    bigram_words_model.train(
        training_corpus,
        total_examples=bigram_words_model.corpus_count,
        epochs=bigram_words_model.epochs,
    )

Infer vectors of the bigram words test corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_testing_bigram_words.json")) as json_output, \
        closing(JSONCorpus("jsons/test_bigram_words.json", 'bigram_words', True)) as inference_corpus:
    for test_post in inference_corpus:
        vector = bigram_words_model.infer_vector(test_post['token_list'])
        json_output.add({"author_id": test_post["label"], "output": pd.Series(vector).to_json(orient='values')})

Infer vectors of the bigram words training corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_training_bigram_words.json")) as json_output, \
        closing(JSONCorpus("jsons/train_bigram_words.json", 'bigram_words', True)) as inference_corpus:
    for train_post in inference_corpus:
        vector = bigram_words_model.infer_vector(train_post['token_list'])
        json_output.add({"author_id": train_post["label"], "output": pd.Series(vector).to_json(orient='values')})

## doc2vec Trigram Words Model

Creates and trains the doc2vec model using trigram words.

In [None]:
# closing() releases the corpus file even if vocabulary building or training raises.
with closing(JSONCorpus("jsons/train_trigram_words.json", 'trigram_words', False)) as training_corpus:
    trigram_words_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    print("Building trigram words vocabulary...")
    trigram_words_model.build_vocab(training_corpus)
    print("Training trigram words doc2vec model...")
    # Take the example count and epoch count from this model, not from the
    # unigram words model trained in an earlier cell.
    trigram_words_model.train(
        training_corpus,
        total_examples=trigram_words_model.corpus_count,
        epochs=trigram_words_model.epochs,
    )

Infer vectors of the trigram words test corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_testing_trigram_words.json")) as json_output, \
        closing(JSONCorpus("jsons/test_trigram_words.json", 'trigram_words', True)) as inference_corpus:
    for test_post in inference_corpus:
        vector = trigram_words_model.infer_vector(test_post['token_list'])
        json_output.add({"author_id": test_post["label"], "output": pd.Series(vector).to_json(orient='values')})

Infer vectors of the trigram words training corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_training_trigram_words.json")) as json_output, \
        closing(JSONCorpus("jsons/train_trigram_words.json", 'trigram_words', True)) as inference_corpus:
    for train_post in inference_corpus:
        vector = trigram_words_model.infer_vector(train_post['token_list'])
        json_output.add({"author_id": train_post["label"], "output": pd.Series(vector).to_json(orient='values')})

## doc2vec Unigram POS Model

Creates and trains the doc2vec model using unigram POS.

In [None]:
# closing() releases the corpus file even if vocabulary building or training raises.
with closing(JSONCorpus("jsons/train_unigram_pos.json", 'unigram_pos', False)) as training_corpus:
    unigram_pos_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    print("Building unigram pos vocabulary...")
    unigram_pos_model.build_vocab(training_corpus)
    print("Training unigram pos doc2vec model...")
    unigram_pos_model.train(
        training_corpus,
        total_examples=unigram_pos_model.corpus_count,
        epochs=unigram_pos_model.epochs,
    )

Infer vectors of the unigram POS test corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_testing_unigram_pos.json")) as json_output, \
        closing(JSONCorpus("jsons/test_unigram_pos.json", 'unigram_pos', True)) as inference_corpus:
    for test_post in inference_corpus:
        vector = unigram_pos_model.infer_vector(test_post['token_list'])
        json_output.add({"author_id": test_post["label"], "output": pd.Series(vector).to_json(orient='values')})

Infer vectors of the unigram POS training corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_training_unigram_pos.json")) as json_output, \
        closing(JSONCorpus("jsons/train_unigram_pos.json", 'unigram_pos', True)) as inference_corpus:
    for train_post in inference_corpus:
        vector = unigram_pos_model.infer_vector(train_post['token_list'])
        json_output.add({"author_id": train_post["label"], "output": pd.Series(vector).to_json(orient='values')})

## doc2vec Bigram POS Model

Creates and trains the doc2vec model using bigram POS.

In [None]:
# closing() releases the corpus file even if vocabulary building or training raises.
with closing(JSONCorpus("jsons/train_bigram_pos.json", 'bigram_pos', False)) as training_corpus:
    bigram_pos_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    print("Building bigram pos vocabulary...")
    bigram_pos_model.build_vocab(training_corpus)
    print("Training bigram pos doc2vec model...")
    bigram_pos_model.train(
        training_corpus,
        total_examples=bigram_pos_model.corpus_count,
        epochs=bigram_pos_model.epochs,
    )

Infer vectors of the bigram POS test corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_testing_bigram_pos.json")) as json_output, \
        closing(JSONCorpus("jsons/test_bigram_pos.json", 'bigram_pos', True)) as inference_corpus:
    for test_post in inference_corpus:
        vector = bigram_pos_model.infer_vector(test_post['token_list'])
        json_output.add({"author_id": test_post["label"], "output": pd.Series(vector).to_json(orient='values')})

Infer vectors of the bigram POS training corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_training_bigram_pos.json")) as json_output, \
        closing(JSONCorpus("jsons/train_bigram_pos.json", 'bigram_pos', True)) as inference_corpus:
    for train_post in inference_corpus:
        vector = bigram_pos_model.infer_vector(train_post['token_list'])
        json_output.add({"author_id": train_post["label"], "output": pd.Series(vector).to_json(orient='values')})

## doc2vec Trigram POS Model

Creates and trains the doc2vec model using trigram POS.

In [None]:
# closing() releases the corpus file even if vocabulary building or training raises.
with closing(JSONCorpus("jsons/train_trigram_pos.json", 'trigram_pos', False)) as training_corpus:
    trigram_pos_model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    print("Building trigram pos vocabulary...")
    trigram_pos_model.build_vocab(training_corpus)
    print("Training trigram pos doc2vec model...")
    trigram_pos_model.train(
        training_corpus,
        total_examples=trigram_pos_model.corpus_count,
        epochs=trigram_pos_model.epochs,
    )

Infer vectors of the trigram POS test corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_testing_trigram_pos.json")) as json_output, \
        closing(JSONCorpus("jsons/test_trigram_pos.json", 'trigram_pos', True)) as inference_corpus:
    for test_post in inference_corpus:
        vector = trigram_pos_model.infer_vector(test_post['token_list'])
        json_output.add({"author_id": test_post["label"], "output": pd.Series(vector).to_json(orient='values')})

Infer vectors of the trigram POS training corpus using the trained model.

In [None]:
# closing() guarantees that both the output file and the corpus file are
# closed, even if inference fails part-way through.
with closing(JSONStorage(doc2vec_directory + "inferred_training_trigram_pos.json")) as json_output, \
        closing(JSONCorpus("jsons/train_trigram_pos.json", 'trigram_pos', True)) as inference_corpus:
    for train_post in inference_corpus:
        vector = trigram_pos_model.infer_vector(train_post['token_list'])
        json_output.add({"author_id": train_post["label"], "output": pd.Series(vector).to_json(orient='values')})