In [3]:
import itertools
import csv
import os, sys
import gensim
import json
import bz2
import gensim.models.keyedvectors as kv
from gensim.models import KeyedVectors

In [4]:
# Load the Harvard Inquirer lexicon: a tab-delimited table whose first row
# holds the column names and whose remaining rows are the entries.
# newline='' hands line-ending handling to the csv module, as the csv
# documentation recommends for any file object given to csv.reader.
with open('../lexicons/Harvard_Inquirer-inqtabs.txt', 'rt', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    header = next(reader, None)
    rows = list(reader)

# Fail with a clear message instead of a NameError/AttributeError further down.
if header is None:
    raise ValueError("Harvard Inquirer file is empty: no header row found")

# extract words of interest
categories = ["Pleasur","Pain","Feel","Arousal", "Polit@", "Relig"]
inquirer = dict()
for c in categories:

    # extract index from header (raises ValueError if the category column is missing)
    idx = header.index(c)

    # extract words (first field of each row) and drop to lowercase, keeping
    # only rows with a non-empty value in this category's column; rows too
    # short to contain the column (e.g. blank lines) are skipped
    tagged_words = [w[0].lower() for w in rows if len(w) > idx and w[idx] != '']

    # remove alternate meanings (the part after "#") and reduce duplicates;
    # sorted so the resulting list has the same order on every run
    inquirer[c] = sorted({w.split("#")[0] for w in tagged_words})

In [7]:
# Unpack the per-category word lists from the inquirer lookup in one step.
(pleasure_terms, pain_terms, feel_terms,
 arousal_terms, political_terms, religious_terms) = (
    inquirer[category]
    for category in ('Pleasur', 'Pain', 'Feel', 'Arousal', 'Polit@', 'Relig')
)

# Flatten the six lists, in the order above, into a single list of terms
# (duplicates across categories are kept).
all_terms = [
    term
    for group in (pleasure_terms, pain_terms, feel_terms,
                  arousal_terms, political_terms, religious_terms)
    for term in group
]

In [8]:
# Load every bz2-compressed word2vec-format model listed below and store it
# as a [model_name, model] pair in eebo_models.
eebo_models=list()
input_data = ["../models/drama-17412.w2v.bz2",
             "../models/poetry-56817.w2v.bz2"]

for f in input_data:
    # model name is the file name up to its first "." (e.g. "drama-17412")
    model_name = os.path.basename(f).split(".")[0]
    print("starting: {0}".format(model_name))
    # Open the compressed stream in a context manager so the handle is
    # closed as soon as loading finishes (or fails) instead of leaking.
    with bz2.open(f) as model_file:
        model = kv.KeyedVectors.load_word2vec_format(model_file)
    eebo_models.append([model_name,model])

starting: drama-17412
starting: poetry-56817


In [9]:
# Export, for every loaded model, the vectors of the words most similar to a
# set of key terms as two TSV files (vectors + one word per line), and collect
# one config entry per model in json_data['embeddings'].
outfilejson = "../tensors/tensors_all.json"

# Base URL under which the exported TSV files are referenced in the config.
tensor_base_url = 'https://raw.githubusercontent.com/jeddobson/htrc-vector-project/master/tensors/'

json_data = dict()
json_data['embeddings'] = list()

# limit model
key_terms = ["religion","church","self","fate","reformation",
             "elect","protestant","chosen","faith",
             "predestination"]

for m in eebo_models:
    # m is a [model_name, model] pair
    print(m[0])

    tensor_filename = "../tensors/" + m[0]
    outfiletsv = tensor_filename + '_tensor.tsv'
    outfiletsvmeta = tensor_filename + '_metadata.tsv'

    # Gather up to 100 most-similar words for each key term the model knows;
    # a set removes words that are neighbours of more than one key term.
    vocab = set()
    for t in key_terms:
        if t in m[1].vocab:
            vocab.update(x[0] for x in m[1].most_similar(t,topn=100))
    # Sorted so both output files come out identical from run to run.
    vocab = sorted(vocab)

    # Track what is actually written so the declared shape matches the file.
    n_rows = 0
    n_dims = 0
    with open(outfiletsv, 'wt') as file_vector, \
         open(outfiletsvmeta, 'wt',encoding='utf-8') as file_metadata:
        for word in vocab:
            vector = m[1][word]
            # line i of the metadata file labels line i of the vector file
            file_metadata.write(word + '\n')
            vector_row = '\t'.join(str(x) for x in vector)
            file_vector.write(vector_row + '\n')
            n_rows += 1
            n_dims = len(vector)

    # [number of exported words, vector dimensionality] -- previously a
    # hard-coded [1000,50], which is wrong whenever neighbour lists overlap,
    # a key term is missing from the model, or the vectors are not 50-d.
    shape = [n_rows, n_dims]
    json_data['embeddings'].append({
        'tensorName': str(m[0]),
        'tensorPath': tensor_base_url + m[0] + '_tensor.tsv',
        'metadataPath': tensor_base_url + m[0] + '_metadata.tsv',
        'tensorShape' : shape
    })

drama-17412
poetry-56817


In [10]:
# Write the collected embedding entries to the JSON config file.
with open(outfilejson, 'wt') as config_file:
    json.dump(json_data, fp=config_file)