In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Uncomment to send everything the notebook prints to post-hoc.txt instead of the cell output.
#sys.stdout = open('post-hoc.txt', 'w')
# Number of rows kept in each of the k_most_* CSV exports written further down.
k = 20

In [3]:
# Load the all-agreed predictions, discard the exported 'Old Index' column and
# keep the fresh row numbers as an explicit 'index' column.
df_all_agreed = (
    pd.read_csv('../Brand-Name-Gender-Prediction/all_agreed.csv')
    .drop(columns=['Old Index'])
    .reset_index()
)

In [4]:
# Load the majority-vote predictions, discard the exported 'Old Index' column and
# keep the fresh row numbers as an explicit 'index' column.
df_majority = (
    pd.read_csv('../Brand-Name-Gender-Prediction/majority_brand_predictions.csv')
    .drop(columns=['Old Index'])
    .reset_index()
)


In [6]:
# Load the WEAT scores; 'Old Index' is dropped only if the file happens to contain it.
df_weat = (
    pd.read_csv('unnormalized_weat.csv')
    .drop(columns=['Old Index'], errors='ignore')
    .reset_index()
)

The section below explores the data in the WEAT table and shows how many brands fall into each category of the WEAT model.

In [7]:
# Every brand name in the WEAT table; later cells filter the other tables with this list.
brands_weat = df_weat['BRAND'].tolist()
# Non-null counts per category; the trimming cell that follows reads `count`.
count = df_weat.groupby('CATEGORY').count()

weat_summary = (
    'Number of brands in WEAT: {}'.format(len(brands_weat)),
    'Number of categories in WEAT: {}'.format(df_weat['CATEGORY'].nunique()),
    'Brands per category:',
    count['BRAND'].to_string(),
    '\n',
)
for line in weat_summary:
    print(line)

Number of brands in WEAT: 1610
Number of categories in WEAT: 72
Brands per category:
CATEGORY
A141 Sportswear & Athleticwear                                                     107
A311 Womens Shoes & Boots                                                           49
A321 Sneakers                                                                       34
A411 Handbags                                                                       39
A430 Sunglasses                                                                     31
B111 Credit Cards                                                                   28
B122 Investment Products & Services (Excluding Retirement)                          82
B124 Loan/Credit Products & Services (Consumer)                                     43
B225 Internet Service Providers (ISP)                                               53
B241 Cable TV Providers & Systems                                                   15
B322 Religious, Charitable & Humanit

In [8]:
print('Getting rid of categories with fewer than 15 entries, we are left with:')

# Keep only categories whose brand count (from the previous cell's `count`) is at least 15.
large_enough = count['BRAND'] >= 15
categories_trimmed_weat = count.loc[large_enough].index.tolist()
keep_rows = df_weat['CATEGORY'].isin(categories_trimmed_weat)
df_weat_trimmed = df_weat.loc[keep_rows]
brands_weat_trimmed = df_weat_trimmed['BRAND'].tolist()
count_trimmed = df_weat_trimmed.groupby('CATEGORY').count()

print('Number of brands in Trimmed WEAT: {}'.format(len(brands_weat_trimmed)))
print('Number of categories in Trimmed WEAT: {}'.format(len(categories_trimmed_weat)))
print('Brands per category:')
print(count_trimmed['BRAND'].to_string())
# Persist the per-category brand counts of the trimmed table.
count_trimmed['BRAND'].to_csv("trimmed_weat.csv")
print('\n')

Getting rid of categories with fewer than 15 entries, we are left with:
Number of brands in Trimmed WEAT: 1328
Number of categories in Trimmed WEAT: 32
Brands per category:
CATEGORY
A141 Sportswear & Athleticwear                                                     107
A311 Womens Shoes & Boots                                                           49
A321 Sneakers                                                                       34
A411 Handbags                                                                       39
A430 Sunglasses                                                                     31
B111 Credit Cards                                                                   28
B122 Investment Products & Services (Excluding Retirement)                          82
B124 Loan/Credit Products & Services (Consumer)                                     43
B225 Internet Service Providers (ISP)                                               53
B241 Cable TV Providers & Systems  

In [9]:
# Restrict the all-agreed table to brands that also occur in the WEAT table.
in_weat = df_all_agreed['Name'].isin(brands_weat)
df_all_agreed = df_all_agreed.loc[in_weat]
brands_all_and_weat = df_all_agreed['Name'].tolist()
# Non-null counts per category; the trimming cell that follows reads `count`.
count = df_all_agreed.groupby('Category').count()

all_agreed_summary = (
    'Number of brands in All Agreed and WEAT: {}'.format(len(brands_all_and_weat)),
    'Number of categories in All Agreed and WEAT: {}'.format(df_all_agreed['Category'].nunique()),
    'Brands per category:',
    count['Name'].to_string(),
    '\n',
)
for line in all_agreed_summary:
    print(line)

Number of brands in All Agreed and WEAT: 1109
Number of categories in All Agreed and WEAT: 72
Brands per category:
Category
A141 Sportswear & Athleticwear                                                     81
A311 Womens Shoes & Boots                                                          30
A321 Sneakers                                                                      23
A411 Handbags                                                                      26
A430 Sunglasses                                                                    24
B111 Credit Cards                                                                  18
B122 Investment Products & Services (Excluding Retirement)                         54
B124 Loan/Credit Products & Services (Consumer)                                    33
B225 Internet Service Providers (ISP)                                              34
B241 Cable TV Providers & Systems                                                  13
B322 Religious, 

In [10]:
print('Getting rid of categories with fewer than 15 entries, we are left with:')

# Keep only categories whose brand count (from the previous cell's `count`) is at least 15.
large_enough = count['Name'] >= 15
categories_trimmed_aaw = count.loc[large_enough].index.tolist()
keep_rows = df_all_agreed['Category'].isin(categories_trimmed_aaw)
df_all_trimmed = df_all_agreed.loc[keep_rows]
brands_aaw_trimmed = df_all_trimmed['Name'].tolist()
count_trimmed = df_all_trimmed.groupby('Category').count()

print('Number of brands in Trimmed All Agreed and WEAT: {}'.format(len(brands_aaw_trimmed)))
print('Number of categories in Trimmed All Agreed and WEAT: {}'.format(len(categories_trimmed_aaw)))
print('Brands per category:')
print(count_trimmed['Name'].to_string())
# Persist the per-category brand counts of the trimmed table.
count_trimmed['Name'].to_csv("trimmed_all_agree_weat.csv")
print('\n')

Getting rid of categories with fewer than 15 entries, we are left with:
Number of brands in Trimmed All Agreed and WEAT: 843
Number of categories in Trimmed All Agreed and WEAT: 26
Brands per category:
Category
A141 Sportswear & Athleticwear                                                     81
A311 Womens Shoes & Boots                                                          30
A321 Sneakers                                                                      23
A411 Handbags                                                                      26
A430 Sunglasses                                                                    24
B111 Credit Cards                                                                  18
B122 Investment Products & Services (Excluding Retirement)                         54
B124 Loan/Credit Products & Services (Consumer)                                    33
B225 Internet Service Providers (ISP)                                              34
B641 TV Network

In [11]:
# Restrict the majority-vote table to brands that also occur in the WEAT table.
in_weat = df_majority['Name'].isin(brands_weat)
df_majority = df_majority.loc[in_weat]
brands_maj_and_weat = df_majority['Name'].tolist()
# Non-null counts per category; the trimming cell that follows reads `count`.
count = df_majority.groupby('Category').count()

majority_summary = (
    'Number of brands in Majority Agreed and WEAT: {}'.format(len(brands_maj_and_weat)),
    'Number of categories in Majority Agreed and WEAT: {}'.format(df_majority['Category'].nunique()),
    'Brands per category:',
    count['Name'].to_string(),
    '\n',
)
for line in majority_summary:
    print(line)

Number of brands in Majority Agreed and WEAT: 1610
Number of categories in Majority Agreed and WEAT: 72
Brands per category:
Category
A141 Sportswear & Athleticwear                                                     107
A311 Womens Shoes & Boots                                                           49
A321 Sneakers                                                                       34
A411 Handbags                                                                       39
A430 Sunglasses                                                                     31
B111 Credit Cards                                                                   28
B122 Investment Products & Services (Excluding Retirement)                          82
B124 Loan/Credit Products & Services (Consumer)                                     43
B225 Internet Service Providers (ISP)                                               53
B241 Cable TV Providers & Systems                                                  

In [12]:
print('Getting rid of categories with fewer than 15 entries, we are left with:')

# Keep only categories whose brand count (from the previous cell's `count`) is at least 15.
large_enough = count['Name'] >= 15
categories_trimmed_maw = count.loc[large_enough].index.tolist()
keep_rows = df_majority['Category'].isin(categories_trimmed_maw)
df_maj_trimmed = df_majority.loc[keep_rows]
brands_maw_trimmed = df_maj_trimmed['Name'].tolist()
count_trimmed = df_maj_trimmed.groupby('Category').count()

print('Number of brands in Trimmed Majority Agreed and WEAT: {}'.format(len(brands_maw_trimmed)))
print('Number of categories in Trimmed Majority Agreed and WEAT: {}'.format(len(categories_trimmed_maw)))
print('Brands per category:')
print(count_trimmed['Name'].to_string())
# Persist the per-category brand counts of the trimmed table.
count_trimmed['Name'].to_csv("trimmed_maj_agree_weat.csv")
print('\n')

Getting rid of categories with fewer than 15 entries, we are left with:
Number of brands in Trimmed Majority Agreed and WEAT: 1328
Number of categories in Trimmed Majority Agreed and WEAT: 32
Brands per category:
Category
A141 Sportswear & Athleticwear                                                     107
A311 Womens Shoes & Boots                                                           49
A321 Sneakers                                                                       34
A411 Handbags                                                                       39
A430 Sunglasses                                                                     31
B111 Credit Cards                                                                   28
B122 Investment Products & Services (Excluding Retirement)                          82
B124 Loan/Credit Products & Services (Consumer)                                     43
B225 Internet Service Providers (ISP)                                             

In [14]:
# Per-category counts of each gender label for the three trimmed tables.
# Each result is a Series keyed by (category, gender label); the print and
# get_alignment* cells below read x, y and z.
# NOTE: x and y are reassigned by the logistic-regression cell later in the
# notebook, so the cells that read these distributions must run before it.
x = df_weat_trimmed.groupby('CATEGORY')['UNNORMALIZED WEAT GENDER'].value_counts()
y = df_all_trimmed.groupby('Category')['Gender'].value_counts()
z = df_maj_trimmed.groupby('Category')['Gender'].value_counts()

# Optional CSV exports of the three distributions (disabled).
# pd.DataFrame(x).to_csv("trimmed_weat_overall_gender_dist.csv")
# pd.DataFrame(y).to_csv("trimmed_all_agree_weat_overall_gender_dist.csv")
# pd.DataFrame(z).to_csv("trimmed_maj_agree_weat_overall_gender_dist.csv")

In [18]:
# Overall number of brands per WEAT gender label across the trimmed categories.
df_weat_trimmed['UNNORMALIZED WEAT GENDER'].value_counts()

F    854
M    474
Name: UNNORMALIZED WEAT GENDER, dtype: int64

In [None]:
# (heading, optional legend line, distribution) for each of the three models.
distribution_reports = (
    ("Gender Distribution of WEAT", None, x),
    ("Gender Distribution of All Agreed and WEAT", "For reference, 0 is male, and 1 is female", y),
    ("Gender Distribution of Majority Agreed and WEAT", None, z),
)
for heading, legend, distribution in distribution_reports:
    print(heading)
    if legend is not None:
        print(legend)
    print(distribution.to_string())
    print('\n')

In [None]:
def _dominant_label(counts):
    """Return the level-1 index label holding the strictly largest count, or None on a tie."""
    leaders = counts[counts == counts.max()]
    return leaders.index[0][1] if len(leaders) == 1 else None


def get_alignment(df):
    """Print the dominant gender of every category as ``<category>: <M|F|E>``.

    Parameters
    ----------
    df : pandas.Series
        Counts keyed by a two-level index ``(category, gender code)``, e.g. the
        result of ``groupby(category)[gender].value_counts()``. Code 0 is
        reported as 'M' and any other code as 'F'.

    Rows are grouped by category instead of being paired by position, so a
    category that contains only one gender label is reported with that label
    and no longer shifts the pairing of the categories that follow it.
    'E' is printed when the largest count is shared by more than one label.
    """
    for category, counts in df.groupby(level=0, sort=False):
        label = _dominant_label(counts)
        if label is None:
            g = 'E'
        else:
            g = 'M' if label == 0 else 'F'
        print("{}: {}".format(category, g))


def get_alignment_weat(df):
    """Print the dominant gender label of every category as ``<category>: <label>``.

    Same contract as ``get_alignment`` except that the second index level is
    printed as-is (the WEAT table already stores letter labels). 'E' is printed
    when the largest count is shared by more than one label.
    """
    for category, counts in df.groupby(level=0, sort=False):
        label = _dominant_label(counts)
        print("{}: {}".format(category, 'E' if label is None else label))

In [None]:
# (heading, reporting function, distribution) for each of the three models.
dominance_reports = (
    ("Dominant Gender of WEAT", get_alignment_weat, x),
    ("Dominant Gender of All Agreed", get_alignment, y),
    ("Dominant Gender of Majority Agreed", get_alignment, z),
)
for position, (heading, report, distribution) in enumerate(dominance_reports):
    # A blank separator goes between reports, not after the last one.
    if position > 0:
        print('\n')
    print(heading)
    report(distribution)

In [None]:
# (heading, optional legend line, gender column) for each trimmed table.
total_reports = (
    ("WEAT Total Gender Distribution For Trimmed Categories",
     None,
     df_weat_trimmed['UNNORMALIZED WEAT GENDER']),
    ("Sound Symbolism - All Agreed Total Gender Distribution For Trimmed Categories",
     None,
     df_all_trimmed['Gender']),
    ("Sound Symbolism - Majority Agreed Total Gender Distribution For Trimmed Categories",
     "For reference, 0 is male, and 1 is female",
     df_maj_trimmed['Gender']),
)
for heading, legend, genders in total_reports:
    print(heading)
    if legend is not None:
        print(legend)
    print(genders.value_counts())
    print('\n')

The following commented-out cells were used to find the most neutral and most extreme brands in the overlapping cases. The analysis is rewritten below to do this for each model separately.

In [None]:
# print("\nTwenty most gender neutral brands in WEAT based on Logistic Regression")
# k_most_weat = df_weat_trimmed.assign(abs_v = abs(df_weat["UNNORMALIZED WEAT"])).sort_values('abs_v').drop('abs_v', axis=1)
# print("See k_most_neutral_weat.csv")
# pd.DataFrame(k_most_weat.iloc[:k]).to_csv("k_most_neutral_weat.csv")

# df_human = pd.read_csv('../Brand-Name-Gender-Prediction/human_names_coded.csv')
# df_human = df_human.drop(["dataID"], axis=1)
# df_human = df_human.drop(["Name"], axis=1)
# x = df_human.drop(['Gender', 'is_unique'], axis=1).as_matrix()[:, 1:]
# y = df_human[['Gender']].as_matrix().ravel()
# model_LR = LogisticRegression()
# model_LR.fit(x, y)



In [None]:
# df_brands = pd.read_csv('../Brand-Name-Gender-Prediction/brand_names_coded.csv')
# df_brands_aaw = df_brands[df_brands['BRAND'].isin(brands_aaw_trimmed)]
# brands_prob_aaw = []
# for i in range(len(df_brands_aaw.index)):
#     feature_vec = df_brands_aaw.iloc[i][3:].as_matrix().ravel().reshape((1, -1))
#     x = model_LR.predict_proba(feature_vec)
#     brands_prob_aaw.append((df_brands_aaw.iloc[i][1], (abs(x[0][0] - x[0][1])), "M" if x[0][0] > x[0][1] else "F"))
# brands_prob_aaw.sort(key=lambda x: x[1])
# brands_prob_aaw = np.asarray(brands_prob_aaw)
# print("\nTwenty most gender neutral brands in WEAT and all agreed based on Logistic Regression")
# print("Negative indicates more female, positive indicates more male")
# print("See k_most_neutral_all_and_weat.csv")
# k_most_all = pd.DataFrame(data={'BRAND': brands_prob_aaw[:k, 0], 'Neutrality Score': brands_prob_aaw[:k, 1], "Alignment":brands_prob_aaw[:k, 2] })
# pd.DataFrame(k_most_all).to_csv("k_most_neutral_all_and_weat.csv")


In [None]:
# df_brands_maw = df_brands[df_brands['BRAND'].isin(brands_maw_trimmed)]
# brands_prob_maw = []
# for i in range(len(df_brands_maw.index)):
#     feature_vec = df_brands_maw.iloc[i][3:].as_matrix().ravel().reshape((1, -1))
#     x = model_LR.predict_proba(feature_vec)
#     brands_prob_maw.append((df_brands_maw.iloc[i][1], (abs(x[0][0] - x[0][1])), "M" if x[0][0] > x[0][1] else "F"))
# brands_prob_maw.sort(key=lambda x: x[1])
# brands_prob_maw = np.asarray(brands_prob_maw)
# print("\nTwenty most gender neutral brands in WEAT and majority agreed based on Logistic Regression")
# print("Negative indicates more female, positive indicates more male")
# k_most_maj = pd.DataFrame(data={'BRAND': brands_prob_maw[:k, 0], 'Neutrality Score': brands_prob_maw[:k, 1], "Alignment":brands_prob_maw[:k, 2] })
# print("See k_most_neutral_maj_and_weat.csv")
# pd.DataFrame(k_most_maj).to_csv("k_most_neutral_maj_and_weat.csv")

In [None]:
# pd.DataFrame(k_most_weat.iloc[-1:-(k//2 + 2):-1]).to_csv("k_most_extreme_weat.csv")

In [None]:
# flipped_aaw = np.flip(brands_prob_aaw, axis=0)
# flipped_maw = np.flip(brands_prob_maw, axis=0)
# k_extreme_aaw = pd.DataFrame(data={'BRAND': flipped_aaw[:k//2, 0], 'Neutrality Score': flipped_aaw[:k//2, 1], "Alignment":flipped_aaw[:k//2, 2] })
# k_extreme_maw = pd.DataFrame(data={'BRAND': flipped_maw[:k//2, 0], 'Neutrality Score': flipped_maw[:k//2, 1], "Alignment":flipped_maw[:k//2, 2] })
# pd.DataFrame(k_extreme_aaw).to_csv("k_most_extreme_all_and_weat.csv")
# pd.DataFrame(k_extreme_maw).to_csv("k_most_extreme_maj_and_weat.csv")

In [None]:
# Renumber the majority table, drop leftover index columns, map its numeric
# labels onto the letters used by the WEAT table (0 -> 'M', 1 -> 'F') and
# order it by brand name.
df_majority = (
    df_majority
    .reset_index(drop=True)
    .drop(columns=['index', 'level_0'], errors='ignore')
    .replace({'Gender': {0: 'M', 1: 'F'}})
    .sort_values('Name')
    .reset_index(drop=True)
)
# Same ordering and renumbering for the WEAT table.
df_weat = (
    df_weat
    .sort_values('BRAND')
    .reset_index(drop=True)
    .drop(columns=['index', 'level_0'], errors='ignore')
)

# One row per WEAT brand; MATCH is True where both models give the same label.
# NOTE(review): the comparison pairs rows by position after sorting, so it assumes
# both tables list exactly the same brands in the same order — confirm.
df_comp = df_weat[['BRAND', 'CATEGORY']].copy()
df_comp['MATCH'] = df_weat['UNNORMALIZED WEAT GENDER'].eq(df_majority['Gender'])

In [None]:
# GENDER carries the WEAT label where the two models agree and 'N' where they do not.
weat_gender = df_weat['UNNORMALIZED WEAT GENDER']
df_comp['GENDER'] = weat_gender.where(df_comp['MATCH'], 'N')
df_comp.to_csv('gender_prediction_match.csv')
print("\nSee gender_prediction_match.csv for how the majority agreed sound model and WEAT model matched on gender prediction. ")

# Number and share of brands on which the two models agree.
match_count = df_comp['MATCH'].sum()
total_brands = df_comp.shape[0]
print("We see that {} out of {} match, or {}".format(match_count, total_brands, match_count/total_brands))

In [None]:
print("\nTwenty most gender neutral brands in WEAT")
# Rank brands by the magnitude of their WEAT score; the helper column is removed again.
weat_scored = df_weat.assign(abs_v=df_weat["UNNORMALIZED WEAT"].abs())

# Smallest magnitude first: the k most neutral brands.
k_neutral_weat = weat_scored.sort_values('abs_v').drop(columns='abs_v')
print("See k_most_neutral_weat.csv")
k_neutral_weat.iloc[:k].to_csv("k_most_neutral_weat.csv")

print("See k_most_extreme_male_weat.csv")
print("See k_most_extreme_female_weat.csv")
# Largest magnitude first, then split by WEAT label to get the k most extreme of each.
k_extreme_weat = weat_scored.sort_values('abs_v', ascending=False).drop(columns='abs_v')
is_male = k_extreme_weat['UNNORMALIZED WEAT GENDER'] == 'M'
is_female = k_extreme_weat['UNNORMALIZED WEAT GENDER'] == 'F'
k_most_extreme_male = k_extreme_weat[is_male]
k_most_extreme_female = k_extreme_weat[is_female]
k_most_extreme_male.iloc[:k].to_csv("k_most_extreme_male_weat.csv")
k_most_extreme_female.iloc[:k].to_csv("k_most_extreme_female_weat.csv")


In [None]:
# Fit a logistic-regression gender classifier on the coded human names.
df_human = pd.read_csv('../Brand-Name-Gender-Prediction/human_names_coded.csv')
df_human = df_human.drop(["dataID"], axis=1)
df_human = df_human.drop(["Name"], axis=1)
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` returns the same
# array and works on old and new pandas alike.
# [:, 1:] skips the first remaining column, exactly as before.
x = df_human.drop(['Gender', 'is_unique'], axis=1).values[:, 1:]
y = df_human[['Gender']].values.ravel()
model_LR = LogisticRegression()
model_LR.fit(x, y)
# Coded brand names, scored with the fitted model in the next cell.
df_brands = pd.read_csv('../Brand-Name-Gender-Prediction/brand_names_coded.csv')

In [None]:
# Reload the majority-vote predictions from disk (this replaces the filtered,
# M/F-mapped df_majority built by earlier cells) and drop the bookkeeping columns.
df_majority = pd.read_csv('../Brand-Name-Gender-Prediction/majority_brand_predictions.csv')
df_majority = df_majority.reset_index()
df_majority.drop(labels=['Old Index', 'index'], axis=1, inplace=True)

# Coded brand rows whose brand name appears in the majority table.
df_brands_maw = df_brands[df_brands['BRAND'].isin(df_majority['Name'].tolist())]

# Score all brands in one call. Columns from position 3 onwards are the model
# features (the same slice the former per-row loop used); positions 1 and 2 are
# reported as brand and category. `.values` replaces DataFrame.as_matrix(),
# which was removed in pandas 1.0.
proba = model_LR.predict_proba(df_brands_maw.iloc[:, 3:].values)

print("\nTwenty most gender neutral brands in WEAT and majority agreed based on Logistic Regression")
# NOTE: the score below is an absolute difference, so it is never negative; the
# direction is carried by the 'Alignment' column.
print("Negative indicates more female, positive indicates more male")

# Build the table straight from typed arrays. Routing the records through
# np.asarray turned every value into a string, which left 'Neutrality Score'
# as text and made later sorts on it lexicographic.
k_neutral_maj = pd.DataFrame(data={
    'BRAND': df_brands_maw.iloc[:, 1].values,
    'CATEGORY': df_brands_maw.iloc[:, 2].values,
    'Neutrality Score': np.abs(proba[:, 0] - proba[:, 1]),
    "Alignment": np.where(proba[:, 0] > proba[:, 1], "M", "F"),
})
# Most neutral first; a stable sort keeps tied brands in their original order,
# matching the behaviour of the former list.sort().
k_neutral_maj = k_neutral_maj.sort_values('Neutrality Score', kind='mergesort').reset_index(drop=True)
# Kept for cells that still index the raw record array (values keep their native types).
brands_prob_maw = k_neutral_maj.values
print(k_neutral_maj)


In [None]:
# k_neutral_maj is ordered from most to least neutral, so its first k rows are
# the k most neutral brands.
print("See k_most_neutral_maj.csv")
k_neutral_maj.iloc[:k].to_csv("k_most_neutral_maj.csv")

print("See k_most_extreme_male_maj.csv")
print("See k_most_extreme_female_maj.csv")

# Rank from most to least extreme. The score is coerced to a number first: when
# the column holds text, sort_values orders it lexicographically, which ranks a
# value such as '9.1e-05' above '0.99'.
neutrality = pd.to_numeric(k_neutral_maj['Neutrality Score'])
k_extreme_maj = (
    k_neutral_maj
    .assign(**{'Neutrality Score': neutrality})
    .sort_values('Neutrality Score', ascending=False)
)
k_extreme_maj_male = k_extreme_maj[k_extreme_maj['Alignment'] == 'M']
k_extreme_maj_female = k_extreme_maj[k_extreme_maj['Alignment'] == 'F']
print(k_extreme_maj_male)


k_extreme_maj_male.iloc[:k].to_csv("k_most_extreme_male_maj.csv")
k_extreme_maj_female.iloc[:k].to_csv("k_most_extreme_female_maj.csv")

In [None]:
# Spot-check where three specific brand names land in the ranked majority-model table.
k_extreme_maj[k_extreme_maj['BRAND'].isin(['nina', 'celine', 'buffy'])]