In [1]:
import gensim.models.keyedvectors as word2vec
from gensim.scripts.glove2word2vec import glove2word2vec

import time
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
import os
import matplotlib.pyplot as plt
from pylab import *

In [2]:
#glove2word2vec(glove_input_file="glove.840B.300d.txt", word2vec_output_file="gensim_glove_vectors.txt")
model = word2vec.KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)


In [3]:
model_vocab = [key.lower() for key in model.vocab.keys()]
vex = {}

In [4]:
def get_simple_alignment(brand, gender_subspace):
    try:
        word_vec = model.get_vector(brand.capitalize())
    except KeyError:
        
        index = model_vocab.index(brand)
        word_vec = model.get_vector(model.index2entity[index])
    word_vec = normalize(word_vec.reshape(1, -1))
    gen_comp = np.vdot(word_vec, gender_subspace)
#     if gen_comp < 0:
#         return 'M'
#     else:
#         return 'F'
    return gen_comp

In [5]:
female = []
male = []
with open('pairs_paper.txt', 'r') as file:
    data = file.readlines()
    data = [line.strip() for line in data]
    for line in data:
        sub = line.split(', ')
        fem_vec = model.get_vector(sub[0])
        male_vec = model.get_vector(sub[1])
        female.append(fem_vec)
        male.append(male_vec)
        vec = np.subtract(fem_vec, male_vec)
        vex[line] = vec


In [6]:
she_vec = normalize(female[0].reshape(1, -1))
he_vec = normalize(male[0].reshape(1, -1))
he_she_axis = np.subtract(she_vec, he_vec)

In [7]:
df_brands = pd.read_csv('brand_names_coded_no_duplicates.csv')
df_brands = df_brands[df_brands['BRAND'].isin(model_vocab)]
print(df_brands.shape)
df_brands['ALIGNMENT'] = df_brands['BRAND'].apply(lambda x: get_simple_alignment(x, he_she_axis))
df_brands = df_brands.reset_index()
df_brands = df_brands[['BRAND', 'CATEGORY', 'ALIGNMENT']]
df_brands.sort_values(by='ALIGNMENT').to_csv('glove_he_she_alignment.csv')

(1644, 20)


In [8]:
df_brands['gender'] = df_brands['ALIGNMENT'].apply(lambda x: 'M' if x < 0 else 'F')
df_brands['gender'].value_counts()
df_brands.drop_duplicates(subset='BRAND').groupby('CATEGORY')['gender'].value_counts().to_csv('glove_he_she_category_value_counts.csv')


In [9]:
#Filter out brands that are names
df_k = pd.read_csv('check_kantrowitz.csv')
df_ip = pd.read_csv('check_ipums.csv')
df_napp = pd.read_csv('check_napp.csv')
df_ssa = pd.read_csv('check_ssa.csv')

df_k = df_k.transpose()
k = df_k[df_k[1].isin(['male', 'female', 'either'])][0].tolist()
ip = df_ip.loc[0].tolist()[1:]
napp = df_napp.loc[0].tolist()[1:]
ssa = df_ssa.loc[0].tolist()[1:]

In [10]:
all_gendered_brands = list(set(k).union(set(ip)).union(set(napp)).union(set(ssa)))
df_no_names = df_brands[~df_brands['BRAND'].isin(all_gendered_brands)]
df_no_names['gender'] = df_no_names['ALIGNMENT'].apply(lambda x: 'M' if x < 0 else 'F')
df_no_names.drop_duplicates(subset='BRAND').sort_values(by='ALIGNMENT').to_csv('glove_he_she_alignment_just_brands_no_names.csv')
df_no_names.drop_duplicates(subset='BRAND').groupby('CATEGORY')['gender'].value_counts().to_csv('glove_he_she_category_value_counts_no_names.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(2458, 19)
