In [1]:
import numpy as np
from collections import defaultdict
import math
import random
import pickle
import argparse

import sys
sys.path.append('../')
from utils import commons
from utils import store
from utils import vector_utils

# Progress messages are printed once every `print_every` processed items.
print_every = 100000
# Master switch for the status/progress messages printed by the functions in this file.
print_status = True

In [2]:
def compute_idf(data, min_count):
    """
    Compute the IDF weight that is used to scale each feature's term vector.

    The document frequency of a feature is the number of lines it occurs in
    (repeats inside one line count once). Features that occur in fewer than
    min_count lines are dropped; each remaining feature is weighted by
    sqrt(len(data) / document frequency).

    Returns a defaultdict(float) mapping feature -> weight.
    """
    if print_status:
        print('Computing IDF')

    document_frequency = defaultdict(float)
    for line in data:
        # A set so that a feature repeated on one line is counted only once.
        for feature in set(line.split()):
            document_frequency[feature] += 1

    # Walk a snapshot of the keys so entries can be removed or rewritten in place.
    for feature in list(document_frequency):
        frequency = document_frequency[feature]
        if frequency < min_count:
            del document_frequency[feature]
        else:
            document_frequency[feature] = math.sqrt(len(data) / frequency)
    return document_frequency

In [3]:
def initialize_vectors(features, idf, dim, seeds):
    """
    Build the starting random-projection vector for every feature.

    Each feature starts as a zero vector of length dim (the original author
    recommends a dim in the range 500-1000). `seeds` distinct positions are
    drawn at random and each one is set to +1 or -1, also at random. The
    resulting vector is then scaled by the feature's IDF weight.

    Returns a dict mapping feature -> weighted vector.
    """
    projections = {}
    signs = [-1.0, 1.0]

    for position, feature in enumerate(features):
        if print_status and position % print_every == 0:
            print('Initializing ' + str(position))
        projection = np.zeros(dim)
        # Draw the slots that carry the random projection, then give each a random sign.
        for slot in random.sample(range(dim), seeds):
            projection[slot] = random.choice(signs)
        # Weight the projection by the feature's IDF.
        projections[feature] = projection * idf[feature]
    return projections

In [4]:
def train_vectors(data, vectors):
    """
    Train the feature vectors from the co-occurrences found in data.

    For every line, the initial vector of each known feature is added to the
    vector of every other (different) feature on that line. Conceptually, each
    co-occurrence of two features moves the two features closer together. After
    all lines have been processed, every accumulated vector is passed through
    vector_utils.normalize_vector.

    data: iterable of whitespace-delimited feature strings, one per line.
    vectors: dict mapping feature -> initial numeric vector. It is not modified.

    Returns a new dict mapping feature -> trained vector as returned by
    vector_utils.normalize_vector.
    """
    # Copy each array, not only the dict: `+=` on a numpy array works in place,
    # so a shallow dict copy would alter the caller's initial vectors and feed
    # already-updated vectors back into later additions.
    trained_vectors = {feature: np.array(vector, dtype=float)
                       for feature, vector in vectors.items()}

    for i, raw_line in enumerate(data):
        if print_status and i % print_every == 0:
            print('Processed ' + str(i))
        # Features without an initial vector (e.g. filtered out earlier) are ignored.
        line = [feature for feature in raw_line.split() if feature in vectors]
        for feature_1 in line:
            for feature_2 in line:
                if feature_1 != feature_2:
                    trained_vectors[feature_2] += vectors[feature_1]

    # Normalize once, after the whole data set has been accumulated. Doing this
    # (and returning) inside the loop would stop training after the first line.
    for feature in trained_vectors:
        trained_vectors[feature] = vector_utils.normalize_vector(trained_vectors[feature])
    return trained_vectors