In [13]:
import sys
import time
import requests
from datetime import datetime, timedelta
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
import statsmodels.api as sm
import gensim.models.keyedvectors as word2vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy.stats import pearsonr
import re

In [14]:
# CSV mapping original category labels to cleaned names; get_brand_categories
# reads its 'Original', 'Corrected' and 'Concise' columns.
CLEANED_NAMES_FILE = "../Data/category_cleaned_names.csv"

# Initialize Model

In [15]:
# Load the pretrained GoogleNews word2vec vectors (binary format) from disk.
w2v_model = word2vec.KeyedVectors.load_word2vec_format('../Data/GoogleNews-vectors-negative300.bin', binary=True)
# Lowercased copy of the vocabulary keys for case-insensitive lookups: later
# cells call w2v_vocab.index(word) and pass the position to
# w2v_model.index2entity to recover the original-case key.
# NOTE(review): this assumes vocab.keys() iterates in the same order as
# index2entity — confirm for the gensim version in use. Each .index() call is
# a linear scan over the whole list.
w2v_vocab = [key.lower() for key in w2v_model.vocab.keys()]

I0525 12:09:45.862390  8912 utils_any2vec.py:341] loading projection weights from ../Data/GoogleNews-vectors-negative300.bin
W0525 12:09:45.883334  8912 smart_open_lib.py:385] this function is deprecated, use smart_open.open instead
I0525 12:11:02.130850  8912 utils_any2vec.py:405] loaded (3000000, 300) matrix from ../Data/GoogleNews-vectors-negative300.bin


# Initialize She-He Axis

In [32]:
# Gender direction used throughout the notebook: the L2-normalised difference
# "she" - "he". Both word vectors are kept as (1, n) row vectors so they can be
# passed straight to cosine_similarity / normalize.
she_vec = w2v_model.get_vector('she').reshape(1, -1)
he_vec = w2v_model.get_vector('he').reshape(1, -1)
he_she_axis = normalize(she_vec - he_vec)

In [33]:
# Cosine similarity between the "she" and "he" word vectors.
cosine_similarity(she_vec, he_vec)

array([[0.612995]], dtype=float32)

# Get brands without duplicates

In [18]:
# Load the coded brand table and keep only the first row for each BRAND value.
df_brands = pd.read_csv('../Data/brand_names_coded.csv')
df_brands_no_dup = df_brands.drop_duplicates(subset='BRAND')
# Plain Python lists of brand names, with and without repeated brands.
all_brands = df_brands['BRAND'].tolist()
no_dup = df_brands_no_dup['BRAND'].tolist()
print("LEN ALL BRANDS", len(all_brands))
print("LEN ALL BRANDS NO DUPLICATED", len(no_dup))

# df_brands = df_brands[df_brands['BRAND'].isin(model_vocab)]
# print(df_brands.shape)
# df_brands['ALIGNMENT'] = df_brands['BRAND'].apply(lambda x: get_simple_alignment(x, he_she_axis))
# df_brands = df_brands.reset_index()
# df_brands = df_brands[['BRAND', 'CATEGORY', 'ALIGNMENT']]

LEN ALL BRANDS 2458
LEN ALL BRANDS NO DUPLICATED 2308


In [19]:
# Number of distinct CATEGORY values among the de-duplicated brands.
len(set(df_brands_no_dup['CATEGORY'].tolist()))

74

In [34]:
# Collect one word2vec vector per brand that can be found in the vocabulary.
# Lookup order: the capitalised brand name first, then a case-insensitive
# match via the lowercased vocabulary list. Brands found by neither are skipped.
vecs, valid_brands = [], []
count = 0  # brands resolved directly through their capitalised form
for brand in no_dup:
    try:
        word_vec = w2v_model.get_vector(brand.capitalize())
        count += 1
    except KeyError:
        try:
            vocab_pos = w2v_vocab.index(brand.lower())
        except ValueError:
            # not in the vocabulary under any casing
            continue
        word_vec = w2v_model.get_vector(w2v_model.index2entity[vocab_pos])
    vecs.append(word_vec)
    valid_brands.append(brand)
print("TOTAL NUMBER OF VECTORS", len(vecs))
print("TOTAL WITH CAPITALS", count)

TOTAL NUMBER OF VECTORS 1471
TOTAL WITH CAPITALS 1298


In [35]:
def get_simple_alignment(vectors, gender_subspace):
    """Project row-normalised vectors onto a gender direction.

    Each row of ``vectors`` is rescaled with ``normalize`` and then dotted with
    ``gender_subspace``. When ``gender_subspace`` is itself unit length, the
    result is the cosine between each row and that direction.

    Parameters
    ----------
    vectors : 2-D array-like, one vector per row.
    gender_subspace : 1-D array whose length matches the row width.

    Returns
    -------
    numpy.ndarray
        One projection value per row of ``vectors``.

    The shapes of both operands are printed as a debugging aid.
    """
    unit_rows = normalize(vectors)
    print(unit_rows.shape)
    print(gender_subspace.shape)
    return unit_rows.dot(gender_subspace)

In [36]:
# Projection of every brand vector onto the she-he axis; he_she_axis has shape
# (1, n), so [0] extracts the 1-D direction vector.
alignment = get_simple_alignment(vecs, he_she_axis[0])


(1471, 300)
(300,)


In [23]:
# Sanity check: after normalize(), a row dotted with itself should be ~1.
v = normalize([[1, 0, 1], [5, 4, 3]])
np.dot(v[1], v[1])

0.9999999999999998

In [11]:
# Persist the per-brand alignment scores, sorted from lowest to highest.
# NOTE(review): this cell's execution counter is lower than those of the cells
# that define valid_brands and alignment, so the saved file may come from an
# earlier run — re-run top to bottom to confirm.
df_alignment = pd.DataFrame({'brand': valid_brands, 'alignment': alignment})
df_alignment = df_alignment.sort_values(by='alignment')
df_alignment.to_csv('../Data/alignment_scores.csv')

# Correlation of normalized and unnormalized gender subspace results

In [37]:
# Two variants of the gender axis, for comparison:
#   * he_she_axis_unnorm: difference of the individually L2-normalised
#     "she"/"he" vectors; the difference itself is NOT re-normalised.
#   * he_she_axis: difference of the raw vectors, normalised afterwards.
# NOTE(review): despite the "_unnorm" suffix, she_vec_unnorm / he_vec_unnorm
# hold the normalised word vectors; the suffix describes the resulting axis.
she_vec_unnorm = normalize(she_vec.reshape(1, -1))
he_vec_unnorm = normalize(he_vec.reshape(1, -1))
he_she_axis_unnorm = np.subtract(she_vec_unnorm, he_vec_unnorm)
# Recompute the default axis so this cell is self-contained.
she_vec = she_vec.reshape(1, -1)
he_vec = he_vec.reshape(1, -1)
he_she_axis = normalize(np.subtract(she_vec, he_vec))

In [38]:
# Score the brands against both axis variants and report the Pearson
# correlation (and p-value) between the two sets of scores.
alignment_unnorm = get_simple_alignment(vecs, he_she_axis_unnorm[0])
alignment = get_simple_alignment(vecs, he_she_axis[0])
pearsonr(alignment_unnorm, alignment)

(1471, 300)
(300,)
(1471, 300)
(300,)


(0.9812711299009028, 0.0)

# Category Embeddings

In [39]:
def preprocess(category):
    """Turn a category label into a lowercase, single-space-separated phrase.

    The first five characters of ``category`` are discarded
    (NOTE(review): presumably a fixed-width code prefix — confirm against the
    cleaned-names CSV). Commas, possessive ``'s``, ``"& "`` and ``/`` are then
    replaced by spaces.

    Every run of whitespace is collapsed to one space and the result is
    stripped. The previous single pass of ``"  " -> " "`` left a double space
    behind whenever three or more spaces were produced (e.g. ``", & "``), and
    trailing spaces survived (e.g. a label ending in ``'s``). Both cases
    yielded empty tokens when the phrase was later split on ``" "``.

    Parameters
    ----------
    category : str
        Category label including its five-character prefix.

    Returns
    -------
    str
        Lowercased phrase with no leading, trailing or repeated whitespace.
    """
    category = category[5:]
    category = re.sub(",|'s|& |/", ' ', category)
    category = re.sub(r"\s+", " ", category).strip()
    return category.lower()

In [40]:
def get_brand_categories(brands, concise=False):
    """Map each brand to its cleaned, preprocessed category phrase.

    Parameters
    ----------
    brands : iterable of str
        Brands to keep; matched against the ``BRAND`` column of the
        module-level ``df_brands_no_dup``.
    concise : bool, default False
        Use the ``'Concise'`` column of ``CLEANED_NAMES_FILE`` instead of the
        ``'Corrected'`` column as the cleaned category name.

    Returns
    -------
    (dict, set)
        ``{brand: preprocessed category}`` and the set of distinct
        preprocessed categories.

    Raises
    ------
    KeyError
        If a brand's ``CATEGORY`` has no ``'Original'`` entry in the
        cleaned-names file.
    """
    name_column = 'Concise' if concise else 'Corrected'
    cleaned_name_map = pd.read_csv(CLEANED_NAMES_FILE).set_index('Original')[name_column].to_dict()

    selected = df_brands_no_dup[df_brands_no_dup['BRAND'].isin(brands)]
    brand_to_raw_category = selected.set_index('BRAND')['CATEGORY'].to_dict()

    brand_category_cleaned = {}
    for brand, raw_category in brand_to_raw_category.items():
        brand_category_cleaned[brand] = preprocess(cleaned_name_map[raw_category])
    return brand_category_cleaned, set(brand_category_cleaned.values())


    
    

In [41]:
def get_category_vectors(category):
    """Embed a category phrase as the mean of its word vectors.

    The phrase is split on single spaces. Each non-empty token is located in
    the lowercased vocabulary list and the vector of the matching
    original-case entry is fetched. Empty tokens are reported on stdout and
    ignored.

    Parameters
    ----------
    category : str
        Space-separated, lowercase category phrase.

    Returns
    -------
    numpy.ndarray
        Mean of the token vectors, taken over rows of width 300.

    Raises
    ------
    ValueError
        If a token is not present in ``w2v_vocab``.
    """
    token_vectors = []
    for token in category.split(" "):
        if token == '':
            print(category)
            print("IT SLIPPED THROUGH")
            continue
        vocab_pos = w2v_vocab.index(token)
        token_vectors.append(w2v_model.get_vector(w2v_model.index2entity[vocab_pos]))
    stacked = np.array(token_vectors).reshape(-1, 300)
    return np.mean(stacked, axis=0)



# Project Categories onto Gender Subspace

In [52]:
def get_category_projections(valid_brands, concise=False):
    """Embed every brand category and project it onto the she-he axis.

    Side effects: writes ``../Data/word2vec_category_alignment.csv`` (category,
    rounded alignment, and 'F'/'M' label for alignment >= 0 / < 0) and prints
    summary counts. The printed brand-level figures read the module-level
    ``alignment`` array, which must already exist.

    Parameters
    ----------
    valid_brands : iterable of str
        Brands whose categories should be embedded.
    concise : bool, default False
        Forwarded to ``get_brand_categories`` to choose the cleaned-name column.

    Returns
    -------
    (numpy.ndarray, list)
        Matrix with one category vector per row, and the category phrases in
        the same row order.
    """
    brand_category_mapping, categories = get_brand_categories(valid_brands, concise)
    categories = list(categories)
    category_vec_map = {category: get_category_vectors(category) for category in categories}

    # Stack once so the matrix is always 2-D. The previous incremental build
    # left a 1-D vector when there was exactly one category, which the
    # row-wise normalisation in get_simple_alignment rejects.
    category_vectors = np.vstack([category_vec_map[category] for category in categories])

    category_alignment = get_simple_alignment(category_vectors, he_she_axis[0])
    cat_al = pd.DataFrame({'category': list(categories), 'alignment': list(map(np.round, category_alignment, [3]*len(category_alignment)))})
    cat_al['gender'] = cat_al['alignment'].apply(lambda x: 'F' if x >=0 else 'M')
    cat_al.sort_values(by='category').to_csv('../Data/word2vec_category_alignment.csv')
    print("TOTAL NUMBER OF VECTORS", len(category_vectors))
    # Brand-level count of positive alignments (from the global `alignment`).
    print("LENGTH OF ALIGNMENT", len(alignment[alignment>0]))
    # NOTE(review): the label says "RESIDUAL" but the value is the number of
    # categories with positive alignment.
    print("LENGTH OF RESIDUAL ALIGNMENT", len(category_alignment[category_alignment>0]))
    print(len(category_alignment))
    # Share of positive alignments: brands first, then categories.
    print(len(alignment[alignment>0])/len(alignment))
    print(len(category_alignment[category_alignment>0])/len(category_alignment))
    return category_vectors, categories
    
    
    
# Category embeddings for the brands that have vectors, using the 'Corrected'
# category names (concise=False).
category_vectors, categories = get_category_projections(valid_brands, False)

(72, 300)
(300,)
TOTAL NUMBER OF VECTORS 72
LENGTH OF ALIGNMENT 1089
LENGTH OF RESIDUAL ALIGNMENT 63
72
0.7403127124405167
0.875


# Intuition Test to Ensure Similarity

In [18]:
def get_k_most_similar(cat_vecs, categories, index, k):
    """Return the ``k`` categories most cosine-similar to the one at ``index``.

    The query category is excluded by position. The previous version dropped
    whichever entry sorted last and assumed it was the query; with tied or
    duplicate vectors the query could then be returned as its own neighbour
    while a genuine neighbour was discarded.

    Parameters
    ----------
    cat_vecs : numpy.ndarray
        One category vector per row, aligned with ``categories``.
    categories : sequence of str
        Category phrases, in the same order as the rows of ``cat_vecs``.
    index : int
        Position of the query category.
    k : int
        Number of neighbours to return (fewer if not enough categories exist).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Neighbour names and their similarities, most similar first.

    The query and its neighbours are also printed.
    """
    categories = np.array(categories)
    query = categories[index]
    query_vector = cat_vecs[index]
    print('CATEGORY: ',  query)
    all_sims = cosine_similarity(cat_vecs, query_vector.reshape(1, -1)).ravel()
    # Resolve a possibly negative index to its absolute position.
    query_pos = np.arange(len(categories))[index]
    ranked = np.argsort(all_sims)[::-1]
    neighbours = ranked[ranked != query_pos][:k]
    print(categories[neighbours])
    return categories[neighbours], all_sims[neighbours]

# Build three parallel arrays describing each category's k nearest neighbours:
# the query category (repeated k times), the neighbour names, and the matching
# cosine similarities.
cat_index = np.array([])
sim_index = np.array([])
sim_values = np.array([])
k = 3
for i in range(len(categories)):
    k_closest_cats, k_closest_sims = get_k_most_similar(category_vectors, categories, i, k)
    # np.append returns a new array each time, so the accumulators are rebound.
    cat_index = np.append(cat_index, np.repeat(categories[i], k))
    sim_index = np.append(sim_index, k_closest_cats)
    sim_values = np.append(sim_values, k_closest_sims)
    

CATEGORY:  sunglasses
['sneakers' 'women shoes boots' 'handbags']
CATEGORY:  women shaving supplies
['mens shaving equipment supplies' 'deodorants antiperspirants'
 'skin care products']
CATEGORY:  religious charitable humanitarian
['sporting goods' 'women shaving supplies' 'employment agencies']
CATEGORY:  employment agencies
['insurance agencies brokers' 'investment products services'
 'internet service providers']
CATEGORY:  face makeup
['paint' 'skin care products' 'sunglasses']
CATEGORY:  artificial sweeteners
['cocoa milk additives' 'ice cream frozen novelties sherbet'
 'nutritional supplements']
CATEGORY:  cooking oils
['pasta pasta dishes' 'dishwashing detergents' 'cocoa milk additives']
CATEGORY:  credit cards
['loan credit products services'
 'stationery greeting cards miscellaneous paper goods'
 'household paper products']
CATEGORY:  electronic video games software
['desktop computers' 'cable television providers systems'
 'web design domain hosting services']
CATEGORY:  spo

 'snow skis boards blades accessories']
CATEGORY:  sneakers
['women shoes boots' 'sunglasses' 'sportswear athleticwear']


In [19]:
# Save the neighbour similarities with a two-level index:
# (query category, neighbour category).
df_cat_sim = pd.DataFrame(sim_values, index=[cat_index, sim_index])
df_cat_sim.to_csv("../Data/category_similarities.csv")

# Subtract Category Embeddings from Brands and Repeat Gender Analysis

In [20]:
def main(concise=False):
    """Re-run the gender alignment on brand vectors minus their category vector.

    For every brand in the module-level ``no_dup`` that has a word vector
    (capitalised form first, then a case-insensitive vocabulary match), the
    embedding of its category is subtracted and the residual is projected onto
    ``he_she_axis``. Summary counts are printed and the per-brand scores are
    written to ``../Data/residual_alignment_scores.csv`` next to the original
    scores from the module-level ``alignment`` array.

    The category-vector lookup is deliberately kept outside the
    ``try/except KeyError`` around the word-vector lookup. Previously a brand
    with no category mapping was misread as "not in the vocabulary" and could
    be skipped silently, leaving the residual scores out of step with
    ``alignment``. Such a brand now raises ``KeyError`` where the problem is.

    Parameters
    ----------
    concise : bool, default False
        Forwarded to ``get_brand_categories`` to choose the cleaned-name column.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a brand with a word vector has no category mapping.
    """
    brand_category_mapping, categories = get_brand_categories(valid_brands, concise)
    category_vec_map = {category: get_category_vectors(category) for category in categories}

    residual_vecs = []
    residual_count = 0  # brands resolved through their capitalised form
    residual_valid_brands = []
    for brand in no_dup:
        try:
            brand_vec = w2v_model.get_vector(brand.capitalize())
            residual_count += 1
        except KeyError:
            try:
                residual_index = w2v_vocab.index(brand.lower())
            except ValueError:
                # not in the vocabulary under any casing
                continue
            brand_vec = w2v_model.get_vector(w2v_model.index2entity[residual_index])
        category_vec = category_vec_map[brand_category_mapping[brand]]
        residual_vecs.append(np.subtract(brand_vec, category_vec))
        residual_valid_brands.append(brand)

    residual_alignment = get_simple_alignment(residual_vecs, he_she_axis[0])
    print("TOTAL NUMBER OF VECTORS", len(residual_vecs))
    print("TOTAL WITH CAPITALS", residual_count)
    print("LENGTH OF ALIGNMENT", len(alignment[alignment>0]))
    print("LENGTH OF RESIDUAL ALIGNMENT", len(residual_alignment[residual_alignment>0]))
    print(len(residual_alignment))
    # Share of positive alignments: original first, then residual.
    print(len(alignment[alignment>0])/len(alignment))
    print(len(residual_alignment[residual_alignment>0])/len(residual_alignment))
    # `alignment` is paired with the residuals by position, so both must come
    # from the same brand order (the same walk over no_dup).
    residual_df_alignment = pd.DataFrame({'brand': residual_valid_brands, 'residual_alignment': residual_alignment, 'alignment': alignment})
    residual_df_alignment = residual_df_alignment.sort_values(by='residual_alignment')
    residual_df_alignment.to_csv('../Data/residual_alignment_scores.csv')


In [21]:
# Residual analysis using the 'Corrected' category names.
main()

TOTAL NUMBER OF VECTORS 1471
TOTAL WITH CAPITALS 1298
LENGTH OF ALIGNMENT 1089
LENGTH OF RESIDUAL ALIGNMENT 683
1471
0.7403127124405167
0.46430999320190347


In [24]:
# Residual analysis using the 'Concise' category names; this overwrites the
# CSV written by the previous main() call, since both use the same path.
main(True)

TOTAL NUMBER OF VECTORS 1471
TOTAL WITH CAPITALS 1298
LENGTH OF ALIGNMENT 1089
LENGTH OF RESIDUAL ALIGNMENT 729
1471
0.7403127124405167
0.495581237253569


In [25]:
# NOTE(review): residual_vecs and residual_valid_brands are local variables of
# main(), so at notebook level this cell raises NameError (see the recorded
# error below). main() already builds a DataFrame with the same columns and
# writes it to the same path.
residual_alignment = get_simple_alignment(residual_vecs, he_she_axis[0])
residual_df_alignment = pd.DataFrame({'brand': residual_valid_brands, 'alignment': alignment, 'residual_alignment': residual_alignment})
residual_df_alignment = residual_df_alignment.sort_values(by='residual_alignment')
residual_df_alignment.to_csv('../Data/residual_alignment_scores.csv')

NameError: name 'residual_vecs' is not defined

In [None]:
# Counts and shares of positive scores, original vs. residual alignment.
# NOTE(review): at notebook level residual_alignment is only assigned by the
# preceding cell, which fails; main() prints the same figures itself.
print("LENGTH OF ALIGNMENT", len(alignment[alignment>0]))
print("LENGTH OF RESIDUAL ALIGNMENT", len(residual_alignment[residual_alignment>0]))
print(len(residual_alignment))
print(len(alignment[alignment>0])/len(alignment))
print(len(residual_alignment[residual_alignment>0])/len(residual_alignment))

In [None]:
# Scratch inspection: rows for brands that have vectors (x), a brand -> raw
# CATEGORY dict (y), and the distinct CATEGORY values among those rows.
x = df_brands_no_dup[df_brands_no_dup['BRAND'].isin(valid_brands)]
y = x.set_index('BRAND').to_dict()['CATEGORY']
x.drop_duplicates(subset='CATEGORY')['CATEGORY']

In [301]:
# Compare the average of the "toy" and "stores" vectors (a) with the vector of
# the vocabulary entry whose lowercased key is "toy_store" (b).
a = np.mean([w2v_model.get_vector("toy"), w2v_model.get_vector("stores")], axis=0)
b = w2v_model.get_vector(w2v_model.index2entity[w2v_vocab.index("toy_store")])
cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))

array([[0.54730767]], dtype=float32)

In [384]:
def get_poss(start, i):
    """List lowercased vocabulary entries that begin with ``start`` and contain ``i``.

    Parameters
    ----------
    start : str
        Required prefix.
    i : str
        Substring that must occur somewhere in the entry.

    Returns
    -------
    list of str
        Matching entries of ``w2v_vocab``, in vocabulary order.
    """
    matches = []
    for word in w2v_vocab:
        if word.startswith(start) and i in word:
            matches.append(word)
    return matches

# (prefix, substring) pairs to look up in the vocabulary; one result list is
# printed per pair.
pairs = [
    ["host", "prov"]

        ]
for pair in pairs:
    print(get_poss(pair[0], pair[1]))

['hosting_provider', 'hosting_providers', 'hosting_service_providers', 'hostname_provider']


In [50]:
# Every brand alignment rounded to 3 decimals.
# NOTE(review): this displays the full list; consider slicing for readability.
list(map(np.round, alignment, [3]*len(alignment)))

[-0.076,
 0.068,
 -0.004,
 0.091,
 -0.027,
 0.07,
 0.058,
 0.017,
 0.05,
 0.036,
 0.073,
 -0.031,
 -0.042,
 0.183,
 -0.019,
 0.074,
 0.068,
 -0.007,
 -0.095,
 0.031,
 0.015,
 0.031,
 0.09,
 -0.013,
 -0.06,
 -0.077,
 0.201,
 -0.079,
 0.131,
 -0.071,
 0.044,
 -0.1,
 -0.042,
 0.016,
 0.054,
 -0.118,
 0.012,
 0.287,
 0.017,
 -0.074,
 -0.097,
 0.032,
 0.026,
 -0.016,
 0.062,
 0.099,
 -0.037,
 -0.012,
 0.051,
 0.023,
 0.173,
 -0.053,
 0.016,
 0.033,
 -0.009,
 0.076,
 -0.021,
 0.029,
 0.076,
 0.049,
 0.132,
 -0.071,
 0.066,
 0.094,
 0.015,
 0.129,
 0.114,
 0.066,
 0.028,
 0.012,
 0.085,
 0.114,
 -0.011,
 -0.049,
 0.111,
 0.013,
 0.082,
 0.026,
 0.019,
 0.135,
 0.065,
 0.085,
 -0.005,
 0.024,
 -0.031,
 0.089,
 -0.057,
 -0.019,
 -0.014,
 0.108,
 0.088,
 0.035,
 0.08,
 0.046,
 0.191,
 0.146,
 -0.052,
 -0.074,
 0.263,
 0.025,
 0.089,
 0.272,
 0.095,
 -0.047,
 -0.059,
 0.083,
 -0.014,
 -0.043,
 0.157,
 0.073,
 0.314,
 0.037,
 0.044,
 0.147,
 0.046,
 0.062,
 0.133,
 0.002,
 0.129,
 -0.002,
 0.065,
