In [1]:
import itertools
import csv
import os, sys
import gensim
import json
import bz2
import gensim.models.keyedvectors as kv
from gensim.models import KeyedVectors

In [2]:
# Load the Harvard Inquirer table: tab-separated, first line is the header,
# every following line is one lexicon entry.
# newline='' is the csv-module recommendation so the reader handles line
# endings itself.
with open('../lexicons/Harvard_Inquirer-inqtabs.txt', 'rt', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    # An empty file yields an empty header instead of leaving `header` unbound.
    header = next(reader, [])
    # csv.reader returns [] for blank lines; they carry no entry, and keeping
    # them would make the row[0] lookup below raise IndexError.
    rows = [row for row in reader if row]

# extract words of interest
# categories = ["Pleasur","Pain","Feel","Arousal", "Polit@", "Relig"]
inquirer = dict()

# enumerate() pairs each column name with its own position; the previous
# header.index(c) re-scanned the header on every iteration and always
# returned the first match for a repeated column name.
for idx, c in enumerate(header):

    # Entries (first column) whose cell in this column is non-empty,
    # lowercased, with the "#..." alternate-meaning suffix removed.
    # Rows shorter than the header are treated as having an empty cell.
    tagged = (
        w[0].lower().split("#")[0]
        for w in rows
        if len(w) > idx and w[idx] != ''
    )

    # Deduplicate, then sort: iteration order of a set of strings changes
    # between interpreter runs, which made every downstream output order
    # differ from one kernel restart to the next.
    inquirer[c] = sorted(set(tagged))

In [11]:
# Build a term -> category lookup from the per-category word lists.
# The "Othtags" and "Defined" columns are skipped. A term that appears
# under several categories ends up labelled with the last one visited.
all_terms = dict()
for category, terms in inquirer.items():
    if category in ["Othtags", "Defined"]:
        continue
    all_terms.update(dict.fromkeys(terms, category))

In [None]:
# Load each bz2-compressed word2vec file and collect [model_name, model] pairs.
genre_models = list()
input_data = ["../models/drama-17412.w2v.bz2",
             "../models/poetry-56817.w2v.bz2"]

for f in input_data:
    # The model name is the file name up to its first dot,
    # e.g. "drama-17412.w2v.bz2" -> "drama-17412".
    model_name = os.path.basename(f).split(".")[0]
    print("starting: {0}".format(model_name))
    # Use a context manager so the decompression stream is closed once the
    # vectors are loaded; previously the handle from bz2.open() was never
    # closed and stayed open for the life of the kernel.
    with bz2.open(f) as vector_stream:
        model = kv.KeyedVectors.load_word2vec_format(vector_stream)
    genre_models.append([model_name, model])

In [None]:
# For every loaded model, write two TSV files restricted to the lexicon terms
# the model knows (one vector per line, and a matching Term/Category metadata
# file), and record an entry describing them in `json_data`.
outfilejson = "../tensors/tensors_labeled.json"
tensor_base_url = 'https://raw.githubusercontent.com/jeddobson/htrc-vector-project/master/tensors/'

json_data = dict()
json_data['embeddings'] = list()

for m in genre_models:
    print("starting: {0}".format(m[0]))

    # gensim >= 4 exposes the vocabulary as `key_to_index`; earlier releases
    # use `vocab`. Try the new attribute first and fall back to the old one.
    vocab_source = getattr(m[1], 'key_to_index', None)
    if vocab_source is None:
        vocab_source = m[1].vocab
    model_vocab = list(vocab_source)
    # Membership tests against a list scan the whole vocabulary for every
    # lexicon term; a set makes each test constant time.
    vocab_lookup = set(model_vocab)

    tensor_filename = "../tensors/" + m[0]
    outfiletsv = tensor_filename + '_labels_tensor.tsv'
    outfiletsvmeta = tensor_filename + '_labels_metadata.tsv'

    # Track what is actually written so the recorded shape matches the file.
    n_rows = 0
    n_dims = 0

    with open(outfiletsv, 'wt') as file_vector:
        with open(outfiletsvmeta, 'wt', encoding='utf-8') as file_metadata:
            file_metadata.write("Term\tCategory\n")
            for word, category in all_terms.items():
                if word in vocab_lookup:
                    vector = m[1][word]
                    file_metadata.write(word + "\t" + category + '\n')
                    vector_row = '\t'.join(str(x) for x in vector)
                    file_vector.write(vector_row + '\n')
                    n_rows += 1
                    n_dims = len(vector)

    # [rows written, values per row]. This used to be hard-coded to
    # [1000, 50], which only matches the output by coincidence.
    shape = [n_rows, n_dims]
    json_data['embeddings'].append({
        'tensorName': str(m[0]),
        'tensorPath': tensor_base_url + m[0] + '_labels_tensor.tsv',
        'metadataPath': tensor_base_url + m[0] + '_labels_metadata.tsv',
        'tensorShape' : shape
    })

In [None]:
# Serialize `json_data` as JSON text and write it to the path in `outfilejson`.
with open(outfilejson, 'wt') as config_file:
    config_file.write(json.dumps(json_data))