In [1]:
import os
import re

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.spatial.distance import mahalanobis
from scipy.stats import entropy

In [2]:
import sys
sys.path.append('../../')

In [3]:
from liwc_utils.distances import Euclidean, Cosine, Mahalanobis, JSDivergence

In [4]:
np.set_printoptions(suppress=True)

In [5]:
# Directory the notebook reads its LIWC .npz files from. The original location
# stays as the default; set the LIWC_NPZ_DIR environment variable to run the
# notebook on another machine without editing this cell.
LIWC_NPZ_DIR = os.environ.get(
    'LIWC_NPZ_DIR',
    '/home/sjb/Projects/Research/LinkedIn_OB/data/word_features/company_level_liwc/raw_vectors/',
)

In [6]:
SAMPLE_FILE = 'jpmorgan-chase_raw_liwc.npz'

In [7]:
# Load the sample file from LIWC_NPZ_DIR; later cells read arrays from the
# result by key.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code from the file. Only point this at trusted files, and
# confirm the archive actually needs pickling.
raw_liwc_data = np.load(os.path.join(LIWC_NPZ_DIR, SAMPLE_FILE), allow_pickle=True)

In [8]:
# Unpack the two arrays that the distance computations below take as input.
person_vectors, company_vector = (
    raw_liwc_data['person_vectors'],
    raw_liwc_data['company_vector'],
)

In [9]:
def calc_euclidean_dist(array_of_vectors, compare_vector):
    """Return the L2 distance from every row of `array_of_vectors` to `compare_vector`.

    The reference vector is broadcast against the rows, so the result holds
    one distance per row.
    """
    offsets = array_of_vectors - compare_vector
    return np.linalg.norm(offsets, axis=1)

def calc_cosine_dist(array_of_vectors, compare_vector):
    """Cosine distance (1 - cosine similarity) of each row to `compare_vector`."""
    row_norms = np.linalg.norm(array_of_vectors, axis=1)
    reference_norm = np.linalg.norm(compare_vector)
    dot_products = array_of_vectors.dot(compare_vector)

    # Similarity is the dot product scaled by the product of both norms.
    return 1 - dot_products / (row_norms * reference_norm)
    
def calc_mahalanobis_dist(array_of_vectors, compare_vector):
    """Mahalanobis distance from each row of `array_of_vectors` to `compare_vector`.

    The covariance is estimated from `array_of_vectors` itself (columns are
    the variables) and inverted once up front; `np.linalg.inv` raises
    `LinAlgError` when that covariance is singular.
    """
    inv_cov = np.linalg.inv(np.cov(array_of_vectors, rowvar=False))

    per_row = [
        mahalanobis(row, compare_vector, inv_cov)
        for row in array_of_vectors
    ]
    return np.array(per_row, dtype=float)

# https://stats.stackexchange.com/questions/7630/clustering-should-i-use-the-jensen-shannon-divergence-or-its-square
def calc_js_divergence(array_of_vectors, compare_vector):
    """Jensen-Shannon divergence between each row and `compare_vector`.

    Rows and the reference vector are first rescaled to sum to one. For every
    row the divergence is the mean of the two KL divergences taken against
    the midpoint distribution M = (P + Q) / 2.
    """
    row_totals = array_of_vectors.sum(axis=1).reshape(-1, 1)
    P = array_of_vectors / row_totals
    Q = compare_vector / compare_vector.sum()
    midpoints = 0.5 * (P + Q)

    def _row_divergence(p_row, m_row):
        # Logs are taken in base 2, following the convention in the "Trajectories" paper.
        kl_row = entropy(p_row, m_row, base=2)
        kl_ref = entropy(Q, m_row, base=2)
        return 0.5 * (kl_row + kl_ref)

    return np.array(
        [_row_divergence(p_row, m_row) for p_row, m_row in zip(P, midpoints)],
        dtype=float,
    )


In [19]:
list(raw_liwc_data.keys())

['company_person_ids', 'person_vectors', 'company_vector']

In [10]:
# Notebook-local distance functions; their order fixes the column order of the stacked result.
dist_fns = [calc_euclidean_dist, calc_cosine_dist, calc_mahalanobis_dist, calc_js_divergence]

In [11]:
# One (n, 1) column of person-to-company distances per notebook-local function.
distances = [
    dist_fn(person_vectors, company_vector).reshape(-1, 1)
    for dist_fn in dist_fns
]
    

In [12]:
# Place the per-metric columns side by side: one row per person, one column per metric.
distances_1 = np.concatenate(distances, axis=1)

In [13]:
# Distance classes imported from liwc_utils.distances, listed in the same order as dist_fns.
dist_fns_2 = [Euclidean, Cosine, Mahalanobis, JSDivergence]

In [14]:
# Same computation through the liwc_utils classes (standardize=False), again as (n, 1) columns.
distances_new = [
    metric_cls(person_vectors, company_vector, standardize=False)
    .calc_distance()
    .reshape(-1, 1)
    for metric_cls in dist_fns_2
]

In [16]:
# Place the class-based columns side by side, mirroring distances_1.
distances_2 = np.concatenate(distances_new, axis=1)

In [18]:
# Largest element-wise disagreement between the two sets of distances.
np.max(np.abs(distances_1 - distances_2))

0.0

In [None]:
temp = calc_js_divergence(person_vectors, company_vector)

In [None]:
temp.min()

In [None]:
temp = entropy(person_vectors[0], company_vector, base=2)

In [None]:
temp

In [None]:
# Add a small constant to each vector, then rescale it to sum to one.
numerator = person_vectors[0] + 1e-4
numerator = numerator / numerator.sum()

denom = company_vector + 1e-4
denom = denom / denom.sum()

In [None]:
# Hand-computed KL divergence of `numerator` from `denom`: sum of p * ln(p / q),
# with each term divided by ln(2) to express the result in base 2.
(numerator * np.log(numerator / denom) / np.log(2)).sum()

In [None]:
entropy(person_vectors[0], company_vector, 2)

In [None]:
np.log((person_vectors[0]+1e-6) / (company_vector + 1e-6)) #/ np.log(2)

In [None]:
(person_vectors / person_vectors.sum(axis=1).reshape(-1, 1)).sum(axis=1)

In [None]:
(company_vector / company_vector.sum()).sum()

In [None]:
# NOTE(review): calc_js_divergence, as defined in this notebook, returns a single
# 1-D array with one entry per row, so this two-name unpacking only succeeds when
# person_vectors has exactly two rows — confirm what `a` and `b` were meant to be.
a, b = calc_js_divergence(person_vectors, company_vector)

In [None]:
# NOTE(review): array_1 and array_2 are not defined anywhere in the visible
# notebook — presumably they came from a cell that no longer exists; confirm
# before relying on this cell after a kernel restart.
# Estimate a covariance from the two stacked arrays, invert it, and print the
# Mahalanobis distance between them.
V = np.cov(np.array([array_1, array_2]).T)
IV = np.linalg.inv(V)
print(mahalanobis(array_1, array_2, IV))

In [None]:
euclidean_distances = calc_euclidean_dist(person_vectors, company_vector)

In [None]:
# Covariance across the columns of person_vectors (rowvar=False treats each
# column as a variable), and its inverse for use with scipy's mahalanobis.
V = np.cov(person_vectors, rowvar=False)
IV = np.linalg.inv(V)

In [None]:
mahalanobis(person_vectors[0], company_vector, IV)

In [None]:
temp = calc_mahalanobis_dist(person_vectors, company_vector)

In [None]:
temp1 = calc_euclidean_dist(person_vectors, company_vector)

In [None]:
temp1

In [None]:
temp

In [None]:
# Reference covariance from numpy, treating each column of person_vectors as a variable.
cov = np.cov(person_vectors, rowvar=False)

In [None]:
cov.shape

In [None]:
N = person_vectors.shape[0]

In [None]:
mean = person_vectors.mean(axis=0)

In [None]:
demeaned = person_vectors - mean

In [None]:
# Hand-rolled covariance: the product of the transposed de-meaned matrix with
# itself, scaled by 1 / (N - 1).
cov2 = (1 / (N - 1)) * (demeaned.T).dot(demeaned)

In [None]:
cov2.shape

In [None]:
# Largest absolute element-wise gap between numpy's covariance and the manual one.
# The signed minimum used previously only exposes the most negative difference,
# so a positive discrepancy could go unnoticed.
np.abs(cov - cov2).max()

In [None]:
cov2