In [1]:
import gensim.models.keyedvectors as word2vec
import time
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
import os
import matplotlib.pyplot as plt
from pylab import *

In [None]:
# Load the word2vec-format binary embedding file into a KeyedVectors model.
model = word2vec.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)
# Lowercased copy of every vocabulary key, in the vocabulary's iteration order.
model_vocab = [entry.lower() for entry in model.vocab.keys()]

In [None]:
def do_PCA(female, male, n_components=10, output_path='principal_components_1.txt'):
    """Fit a PCA on centred female/male vector pairs and return the first component.

    For each (female, male) pair the midpoint of the two vectors is subtracted
    from both of them; the PCA is fitted on the stacked results.

    Parameters
    ----------
    female, male : sequence of numpy.ndarray
        Paired vectors. Pairs are formed with ``zip``, so surplus items in the
        longer sequence are ignored.
    n_components : int, optional
        Number of principal components to fit. Defaults to 10, the value that
        used to be hard-coded.
    output_path : str or None, optional
        File that receives the explained variance ratio followed by one
        component per line. Pass ``None`` to skip writing the file.

    Returns
    -------
    numpy.ndarray
        The first fitted component (``components_[0]``).
    """
    centred = []
    for f, m in zip(female, male):
        center = (f + m) / 2
        centred.extend((f - center, m - center))
    matrix = np.array(centred)

    pca = PCA(n_components=n_components)
    pca.fit(matrix)
    print("No transform explained variance ratio ", pca.explained_variance_ratio_)

    if output_path is not None:
        with open(output_path, "w") as file:
            file.write("Explained variance ratio: {}\n".format(pca.explained_variance_ratio_))
            for component in pca.components_:
                file.write(str(component) + "\n")
    return pca.components_[0]


In [None]:
def get_simple_alignment(brand, gender_subspace):
    """Project a word's unit-normalised embedding onto a gender direction.

    The capitalised form of ``brand`` is looked up in the model first. If the
    model has no such key, the lookup falls back to the vocabulary entry whose
    lowercased key equals ``brand.lower()``.

    Parameters
    ----------
    brand : str
        Word to score.
    gender_subspace : numpy.ndarray
        Direction to project onto; it is flattened by ``np.vdot``.

    Returns
    -------
    float
        ``np.vdot`` of the normalised word vector and ``gender_subspace``.

    Raises
    ------
    ValueError
        If neither the capitalised key nor a lowercased match exists.
    """
    try:
        word_vec = model.get_vector(brand.capitalize())
    except KeyError:
        # model_vocab holds lowercased keys, so the query has to be lowercased
        # too; otherwise mixed-case input could never match.
        # NOTE(review): assumes model_vocab is in the same order as
        # model.index2entity - confirm for the gensim version in use.
        index = model_vocab.index(brand.lower())
        word_vec = model.get_vector(model.index2entity[index])
    word_vec = normalize(word_vec.reshape(1, -1))
    return np.vdot(word_vec, gender_subspace)

In [None]:
# Embeddings of the female and male word of every pair, in file order.
female = []
male = []
# Difference vector (female - male) per pair, keyed by the pair's line text.
# This dict was previously never created, so the loop raised NameError.
vex = {}
with open('pairs_paper.txt', 'r') as file:
    data = [line.strip() for line in file]

for line in data:
    if not line:
        # Skip blank lines; they have no pair to split.
        continue
    sub = line.split(', ')
    fem_vec = model.get_vector(sub[0])
    male_vec = model.get_vector(sub[1])
    female.append(fem_vec)
    male.append(male_vec)
    vec = np.subtract(fem_vec, male_vec)
    vex[line] = vec


In [None]:
# gender_subspace = do_PCA(female, male).reshape(1, -1)
# print(gender_subspace.shape)

# Unit-normalise the first female/male pair and use "female minus male" as the axis.
# NOTE(review): presumably the first line of pairs_paper.txt is "she, he" - confirm.
she_vec = normalize(np.reshape(female[0], (1, -1)))
he_vec = normalize(np.reshape(male[0], (1, -1)))
he_she_axis = she_vec - he_vec


In [None]:
# Read both word lists, stripping surrounding whitespace from every line.
with open('short_paper.txt', 'r') as file:
    short_paper = [line.strip() for line in file]

with open('long_paper.txt', 'r') as file:
    long_paper = [line.strip() for line in file]

In [None]:
# Score every word of the short list against the he-she axis and tag it as baseline.
# Disabled alternative columns, kept for reference:
#short_df['alignment_gs'] = short_df['words'].apply(lambda x: get_gender_alignment(x, she_norm, he_norm, gender_subspace))
#short_df['simple_gs'] = short_df['words'].apply(lambda x: get_simple_alignment(x, gender_subspace))
#short_df['alignment_hsa'] = short_df['words'].apply(lambda x: get_gender_alignment(x, she_norm, he_norm, he_she_axis))
short_df = pd.DataFrame({'words': short_paper}).assign(
    simple_hsa=lambda frame: frame['words'].apply(
        lambda word: get_simple_alignment(word, he_she_axis)
    ),
    category='BASELINE',
)

In [None]:
# Score every word of the long list against the he-she axis and tag it as baseline.
# Disabled alternative columns, kept for reference:
#long_df['alignment_gs'] = long_df['words'].apply(lambda x: get_gender_alignment(x, she_norm, he_norm, gender_subspace))
#long_df['simple_gs'] = long_df['words'].apply(lambda x: get_simple_alignment(x, gender_subspace))
#long_df['alignment_hsa'] = long_df['words'].apply(lambda x: get_gender_alignment(x, she_norm, he_norm, he_she_axis))
long_df = pd.DataFrame({'words': long_paper}).assign(
    simple_hsa=lambda frame: frame['words'].apply(
        lambda word: get_simple_alignment(word, he_she_axis)
    ),
    category='BASELINE',
)

In [None]:
# short_hs_order = short_df.sort_values(by=['simple_hsa'])['words'].tolist()
# short_gs_order = short_df.sort_values(by=['simple_gs'])['words'].tolist()
# long_gs_order = long_df.sort_values(by=['simple_gs'])['words'].tolist()

# Words of the long list, ordered by ascending he-she alignment.
long_hs_order = long_df.sort_values('simple_hsa')['words'].tolist()

In [None]:
# Display the long list ordered by ascending he-she alignment.
long_df.sort_values('simple_hsa')

In [None]:
# Load the coded brand list and keep only brands that match a lowercased vocabulary key.
df_brands = pd.read_csv('brand_names_coded_no_duplicates.csv')
# .copy() detaches the filtered rows from the original frame, so adding the
# ALIGNMENT column below does not raise SettingWithCopyWarning.
df_brands = df_brands[df_brands['BRAND'].isin(model_vocab)].copy()
print(df_brands.shape)
df_brands['ALIGNMENT'] = df_brands['BRAND'].apply(lambda x: get_simple_alignment(x, he_she_axis))
# pd.DataFrame(df).to_csv('gender_projection_results.csv')
# Keep the three columns used downstream and renumber the rows from 0.
df_brands = df_brands[['BRAND', 'CATEGORY', 'ALIGNMENT']].reset_index(drop=True)

In [None]:
# Stack the baseline words on top of the brands under shared column names.
# The original row labels of both frames are kept (no ignore_index), so the
# index column of the CSV is unchanged.
df_final = pd.concat(
    [
        long_df[['words', 'category', 'simple_hsa']].rename(
            columns={'words': 'name', 'simple_hsa': 'alignment'}
        ),
        df_brands[['BRAND', 'CATEGORY', 'ALIGNMENT']].rename(
            columns={'BRAND': 'name', 'CATEGORY': 'category', 'ALIGNMENT': 'alignment'}
        ),
    ]
)
df_final.sort_values(by='alignment').to_csv('he_she_alignment.csv')

In [None]:
# Restrict the combined table to brand names plus the two anchor words,
# ordered by alignment, and export it.
keep = [*df_brands['BRAND'].tolist(), 'he', 'she']
df_final = df_final[df_final['name'].isin(keep)].sort_values(by='alignment')
df_final.sort_values(by='alignment').to_csv('he_she_alignment_just_brands.csv')

In [None]:
# Negative alignment is labelled 'M', everything else 'F'.
df_final['gender'] = np.where(df_final['alignment'] < 0, 'M', 'F')

# Per-category label counts over unique names, written to CSV.
(
    df_final
    .drop_duplicates(subset='name')
    .groupby('category')['gender']
    .value_counts()
    .to_csv('he_she_category_value_counts.csv')
)


In [None]:
# df_plot = pd.read_csv('he_she_alignment_just_brands.csv')
# df_plot = df_plot[df_plot['keep'] == 1]
# df_plot['jitter'] = df_plot['keep'].apply(lambda x: np.random.randint(-20, 20))

In [None]:
# rcParams['font.family'] = 'sans-serif'
# fig = plt.figure(figsize=(15, 12))
# ax = fig.add_subplot(111)
# for index, row in df_plot.iterrows():
#     plt.scatter(row['alignment'], row['jitter'], c='black', s=1000, marker=r"$ {} $".format(row['name'])) 
# left,right = ax.get_xlim()
# low,high = ax.get_ylim()
# arrow( left, 0, right -left, 0, length_includes_head = False)
# arrow( 0, low, 0, high-low, length_includes_head = False) 


# plt.show()

In [None]:
# Load the name-check tables used to filter out brands that are also names.
df_k = pd.read_csv('check_kantrowitz.csv').transpose()
df_ip = pd.read_csv('check_ipums.csv')
df_napp = pd.read_csv('check_napp.csv')
df_ssa = pd.read_csv('check_ssa.csv')

# Transposed table: keep column 0 for rows whose column 1 is one of the three labels.
k = df_k.loc[df_k[1].isin(['male', 'female', 'either']), 0].tolist()
# Other tables: take the row labelled 0 and drop its first entry.
ip, napp, ssa = (
    frame.loc[0].tolist()[1:] for frame in (df_ip, df_napp, df_ssa)
)

In [None]:
# Union of every name flagged by any of the four name lists.
all_gendered_brands = list(set().union(k, ip, napp, ssa))

# .copy() detaches the filtered rows from df_final, so assigning the gender
# column below does not raise SettingWithCopyWarning.
df_no_names = df_final[~df_final['name'].isin(all_gendered_brands)].copy()
df_no_names['gender'] = df_no_names['alignment'].apply(lambda x: 'M' if x < 0 else 'F')

# De-duplicate once and reuse the result for both exports.
unique_no_names = df_no_names.drop_duplicates(subset='name')
unique_no_names.sort_values(by='alignment').to_csv('he_she_alignment_just_brands_no_names.csv')
unique_no_names.groupby('category')['gender'].value_counts().to_csv('he_she_category_value_counts_no_names.csv')

In [None]:
# Value counts recorded from earlier runs:

# F    1128
# M     485
# Name: gender, dtype: int64

# F    1025
# M     448
# Name: gender, dtype: int64

# F    1054
# M     556
# Name: UNNORMALIZED WEAT GENDER, dtype: int64

# F    854
# M    474
# Name: UNNORMALIZED WEAT GENDER, dtype: int64

# NOTE: `df_plot` is only assigned in commented-out code earlier in the notebook,
# so evaluating it here raised NameError on a fresh kernel. Re-enable together
# with the plotting cells.
# df_plot