In [None]:
# Use Welch's t-test (unequal variances) to find features whose means differ significantly between men and women

In [None]:
from scipy.stats import ttest_ind
import pandas as pd
import math

In [None]:
# Input formats
# 
# GROUND_TRUTH_FILE is assumed to have ground truth gender data for each user.
# Specifically, this code looks for the columns 'Sex' (user gender; 1 is treated as male, 2 as female) and 'id' (user id).
#
# FEATURE_FILE is assumed to have various features for each user.
# Specifically, this code looks for the column 'userid' (user id).

In [None]:
#Change these variables for your setup
home = 'data2016/'  # data root; used as a plain string prefix, so keep the trailing slash
FEATURE_FILE = home+"text_features/all_captions_unigrams_normalized.csv" #file with features to analyze
GROUND_TRUTH_FILE = home+"data_analytics_cleaned.csv" #ground truth data (men and women annotated)

#which features should we analyze?
# Every column of FEATURE_FILE except the last one. The names are read from the
# CSV header only (nrows=0 loads no data rows), so this cell runs on a fresh
# kernel instead of depending on `allFeatures`, which is only created further down.
# 'Unnamed: 0' is excluded because it is dropped from `allFeatures` after loading.
# NOTE(review): `[:-1]` presumably skips the user-id column ('userid') — confirm
# that it really is the last column of the file.
features = (
    pd.read_csv(FEATURE_FILE, nrows=0)
    .columns
    .drop('Unnamed: 0', errors='ignore')
    .values[:-1]
)

In [None]:
# Read the feature table and discard the 'Unnamed: 0' column
# (presumably the index written out when the CSV was saved — verify).
allFeatures = pd.read_csv(FEATURE_FILE).drop(columns=['Unnamed: 0'])

In [None]:
#If necessary, average features together
# Collapses the table to one row per 'id' by taking the mean of every remaining
# column, after removing 'imageNum' so it is not averaged.
#
# Written as one chained assignment so `allFeatures` is only rebound when every
# step succeeds, and with errors='ignore' so re-running the cell is harmless
# (on a second run 'imageNum' is already gone and each 'id' has a single row).
#
# NOTE(review): this groups on 'id', while the merge step joins the feature
# table on 'userid' — confirm which key column the feature file actually has.
allFeatures = (
    allFeatures
    .drop(columns='imageNum', errors='ignore')
    .groupby('id')
    .mean()
    .reset_index()
)

In [None]:
# Read the ground-truth table and discard the 'Unnamed: 0' column
# (presumably the index written out when the CSV was saved — verify).
groundTruth = pd.read_csv(GROUND_TRUTH_FILE).drop(columns="Unnamed: 0")

In [None]:
# Attach the 'Sex' label to every feature row by matching the feature table's
# 'userid' against the ground truth's 'id' (default inner join), then remove
# 'userid' so that 'id' is the only user-id column left.
merged = (
    allFeatures
    .merge(groundTruth[['id', 'Sex']], left_on='userid', right_on='id')
    .drop(columns='userid')
)

In [None]:
# Welch's t-test (equal_var=False) for every feature, comparing the rows with
# Sex == 1 (male group) against the rows with Sex == 2 (female group).
# statistics[k] and pValues[k] correspond to features[k].
statistics, pValues = [], []
for feature in features:
    # Keep only rows where both the label and this feature are present.
    sex_and_value = merged[['Sex', feature]].dropna()
    is_male = sex_and_value['Sex'] == 1
    is_female = sex_and_value['Sex'] == 2
    t_stat, p_val = ttest_ind(
        sex_and_value.loc[is_male, feature],
        sex_and_value.loc[is_female, feature],
        equal_var=False,
    )
    statistics.append(t_stat)
    pValues.append(p_val)

In [None]:
#Effect sizes formatted for LaTeX (not sorted)
# For every feature whose p-value is below .05, compute the standardized mean
# difference (female mean - male mean) / sqrt((sd_female^2 + sd_male^2) / 2),
# remember it, and print it as a LaTeX table row: "<feature> & <value>\\".
effect_sizes = []
feature_names = []

for idx, name in enumerate(features):
    # Skip non-significant features (a NaN p-value also fails this comparison).
    if not (pValues[idx] < .05):
        continue

    women = merged.loc[merged['Sex'] == 2, name]
    men = merged.loc[merged['Sex'] == 1, name]

    sd_women = women.std()
    sd_men = men.std()
    pooled_sd = math.sqrt((sd_women*sd_women + sd_men*sd_men)/2.0)
    effect_size = (women.mean() - men.mean())/pooled_sd

    feature_names.append(name)
    effect_sizes.append(effect_size)

    print(name + ' & ' + str(effect_size) + '\\\\')

In [None]:
#Sorted effect sizes
# Rank the significant features by absolute effect size (ascending) and print
# the last 13 entries, i.e. the largest magnitudes, strongest one last.
effect_sizes_mag = list(map(abs, effect_sizes))
indices = sorted(range(len(effect_sizes_mag)), key=effect_sizes_mag.__getitem__)
for pos in indices[-13:]: #Only look at top 13 highest effect sizes
    print(feature_names[pos], effect_sizes[pos])