In [1]:
import pandas as pd

In [None]:
# Load the manual-tag sample (presumably the hand-labelled rows — confirm); the first CSV column becomes the index.
df = pd.read_csv('../data/sample_manual_tags.csv', index_col=0)
# Second sample file; it is left-merged into `df` in the following cell.
df_all = pd.read_csv('../data/sample_df_gender_all.csv', index_col=0)
df.head()

In [3]:
# Left join: every row of `df` is kept and the matching `df_all` columns are attached.
# No `on=` is given, so pandas joins on the columns the two frames have in common.
df = df.merge(df_all, how='left')

In [None]:
df.head()

In [5]:
# Expand the manual tag codes to the 'male' / 'female' labels used by the predictions.
# 'male' is accepted alongside 'm' so that re-running this cell does not flip
# already-expanded 'male' rows to 'female'.
# NOTE(review): every other value (NaN and typos included) is still labelled 'female' —
# confirm the raw column only ever holds 'm' / 'f'.
df['GENDER_MANUAL'] = df['GENDER_MANUAL'].apply(
    lambda tag: 'male' if tag in ('m', 'male') else 'female'
)

In [None]:
df.head()

In [7]:
# Distribution of the `gender` predictions. value_counts() skips NaN by default,
# so rows without a prediction are not included in the counts shown.
df.gender.value_counts()

female    186
male       91
Name: gender, dtype: int64

In [8]:
df.GENDER_MANUAL.value_counts()

female    238
male      121
Name: GENDER_MANUAL, dtype: int64

## Precision for API 1 (genderize.io)

In [33]:
from sklearn.metrics import classification_report, confusion_matrix

# Label rows that have no API 1 prediction as 'unknown'. The result is assigned back
# rather than calling fillna(inplace=True) on the attribute-accessed column: that form
# is a chained in-place update, which pandas warns about and which no longer modifies
# `df` once Copy-on-Write is active.
df['gender'] = df['gender'].fillna('unknown')
# 'unknown' never occurs in the manual labels, so its row in the report has zero support.
print(classification_report(df['GENDER_MANUAL'], df['gender']))

              precision    recall  f1-score   support

      female       0.98      0.76      0.86       238
        male       0.90      0.68      0.77       121
     unknown       0.00      0.00      0.00         0

    accuracy                           0.74       359
   macro avg       0.63      0.48      0.54       359
weighted avg       0.95      0.74      0.83       359



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Rows are the manual labels, columns the API 1 predictions, both ordered as in `label_order`.
label_order = ["unknown", "male", "female"]
api1_matrix = confusion_matrix(df['GENDER_MANUAL'], df['gender'], labels=label_order)
print(api1_matrix)

[[  0   0   0]
 [ 35  82   4]
 [ 47   9 182]]


## Precision for API 2 (gender-api)

In [12]:
# Per-class precision / recall / F1 of the API 2 predictions against the manual labels.
api2_report = classification_report(df['GENDER_MANUAL'], df['gender_2'])
print(api2_report)

              precision    recall  f1-score   support

      female       0.99      0.86      0.92       238
        male       0.88      0.92      0.90       121
     unknown       0.00      0.00      0.00         0

    accuracy                           0.88       359
   macro avg       0.62      0.59      0.61       359
weighted avg       0.95      0.88      0.91       359



In [13]:
# Rows are the manual labels, columns the API 2 predictions, both ordered as in `label_order`.
label_order = ["unknown", "male", "female"]
api2_matrix = confusion_matrix(df['GENDER_MANUAL'], df['gender_2'], labels=label_order)
print(api2_matrix)

[[  0   0   0]
 [  7 111   3]
 [ 18  15 205]]


## Precision for images (deepface)

In [14]:
# Label rows that have no image-based prediction as 'NONE'. The result is assigned back
# rather than calling fillna(inplace=True) on the attribute-accessed column: that form
# is a chained in-place update, which pandas warns about and which no longer modifies
# `df` once Copy-on-Write is active.
df['gender_img'] = df['gender_img'].fillna('NONE')
print(classification_report(df['GENDER_MANUAL'], df['gender_img']))

              precision    recall  f1-score   support

        NONE       0.00      0.00      0.00         0
      female       0.68      0.44      0.53       238
        male       0.35      0.60      0.44       121

    accuracy                           0.49       359
   macro avg       0.34      0.34      0.32       359
weighted avg       0.57      0.49      0.50       359



In [15]:
# Rows are the manual labels, columns the image-based predictions, both ordered as in `label_order`.
label_order = ["NONE", "male", "female"]
image_matrix = confusion_matrix(df['GENDER_MANUAL'], df['gender_img'], labels=label_order)
print(image_matrix)

[[  0   0   0]
 [  0  72  49]
 [  1 133 104]]


## Analyzing probabilities

In [16]:
# Mean of the API 1 `probability` column (the value shown below is on a 0-1 scale).
df.probability.mean()

0.7305849582172701

In [17]:
# Mean of the API 2 `probability_2` column (the value shown below is on a 0-100 scale,
# unlike `probability`, so the two means are not directly comparable).
df.probability_2.mean()

88.62116991643454

In [18]:
df.gender.value_counts()

female     186
male        91
unknown     82
Name: gender, dtype: int64

In [19]:
df.gender_2.value_counts()

female     208
male       126
unknown     25
Name: gender_2, dtype: int64

### Analyzing precision for low probabilities API 1

In [20]:
# Keep the rows where API 1 reports a probability below 0.7 or a `count` below 30.
low_probability = df['probability'] < 0.7
low_count = df['count'] < 30
df_low_prob = df[low_probability | low_count]

In [None]:
df_low_prob

In [22]:
# API 1 predictions scored only on the low-probability / low-count subset.
low_prob_api1_report = classification_report(
    df_low_prob['GENDER_MANUAL'],
    df_low_prob['gender'],
)
print(low_prob_api1_report)

              precision    recall  f1-score   support

      female       0.93      0.49      0.64       107
        male       0.76      0.40      0.53        65
     unknown       0.00      0.00      0.00         0

    accuracy                           0.45       172
   macro avg       0.56      0.30      0.39       172
weighted avg       0.87      0.45      0.60       172



  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
# Image-based predictions scored on the same subset where API 1 is uncertain.
low_prob_image_report = classification_report(
    df_low_prob['GENDER_MANUAL'],
    df_low_prob['gender_img'],
)
print(low_prob_image_report)

              precision    recall  f1-score   support

      female       0.63      0.43      0.51       107
        male       0.38      0.58      0.46        65

    accuracy                           0.49       172
   macro avg       0.51      0.51      0.49       172
weighted avg       0.54      0.49      0.49       172



### Analyzing precision for low probabilities API 2

In [24]:
# Keep the rows where API 2 reports a probability below 70 or a `count_2` below 30.
# This rebinds `df_low_prob`, replacing the API 1 subset built earlier in the notebook.
low_probability_2 = df['probability_2'] < 70
low_count_2 = df['count_2'] < 30
df_low_prob = df[low_probability_2 | low_count_2]

In [None]:
df_low_prob

In [26]:
# API 2 predictions scored only on the low-probability / low-count subset.
low_prob_api2_report = classification_report(
    df_low_prob['GENDER_MANUAL'],
    df_low_prob['gender_2'],
)
print(low_prob_api2_report)

              precision    recall  f1-score   support

      female       0.96      0.65      0.78        81
        male       0.74      0.76      0.75        38
     unknown       0.00      0.00      0.00         0

    accuracy                           0.69       119
   macro avg       0.57      0.47      0.51       119
weighted avg       0.89      0.69      0.77       119



  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# Image-based predictions scored on the same subset where API 2 is uncertain.
low_prob_image_report = classification_report(
    df_low_prob['GENDER_MANUAL'],
    df_low_prob['gender_img'],
)
print(low_prob_image_report)

              precision    recall  f1-score   support

      female       0.65      0.35      0.45        81
        male       0.30      0.61      0.40        38

    accuracy                           0.43       119
   macro avg       0.48      0.48      0.43       119
weighted avg       0.54      0.43      0.44       119



## Changing the male/female decision threshold for API 2

In [None]:
df.head()

In [21]:
# Rows that API 2 labelled as male.
is_male_api2 = df['gender_2'] == 'male'
males = df[is_male_api2]

In [None]:
males.head(50)

In [23]:
# Derive one probability per gender from API 2's single `probability_2` value, which is
# on a 0-100 scale: the predicted gender keeps `probability_2`, and the other gender is
# assumed to be 100 - probability_2.
# Rows predicted as neither 'male' nor 'female' (e.g. 'unknown') therefore get
# 100 - probability_2 in both columns.
# Series.where is vectorized and gives the same values as the earlier row-wise apply.
complement_2 = 100 - df['probability_2']
df['probability_male'] = df['probability_2'].where(df['gender_2'] == 'male', complement_2)
df['probability_female'] = df['probability_2'].where(df['gender_2'] == 'female', complement_2)

In [None]:
df.head()

In [46]:
def define_new_gender(x, male_threshold=80):
    """Re-label one row of API 2 output using a stricter cut-off for 'male'.

    Args:
        x: Row-like mapping (for example a DataFrame row) providing the keys
            ``'gender_2'`` and ``'probability_male'``.
        male_threshold: ``probability_male`` must be strictly greater than
            this value for the row to be labelled 'male'. Defaults to 80, the
            value that was previously hard-coded.

    Returns:
        'unknown' when ``gender_2`` is 'unknown'; otherwise 'male' if
        ``probability_male`` exceeds ``male_threshold``, else 'female'.
    """
    # Rows the API could not classify stay 'unknown' regardless of probability.
    if x['gender_2'] == 'unknown':
        return 'unknown'
    if x['probability_male'] > male_threshold:
        return 'male'
    # Everything at or below the threshold falls back to 'female'.
    return 'female'

In [47]:
# Apply the re-labelling rule row by row (axis=1 hands each row to the function).
df['new_gender'] = df.apply(define_new_gender, axis=1)

In [None]:
df.head()

In [49]:
# Baseline for comparison: the original API 2 labels against the manual labels.
baseline_report = classification_report(df['GENDER_MANUAL'], df['gender_2'])
print(baseline_report)

              precision    recall  f1-score   support

      female       0.99      0.86      0.92       238
        male       0.88      0.92      0.90       121
     unknown       0.00      0.00      0.00         0

    accuracy                           0.88       359
   macro avg       0.62      0.59      0.61       359
weighted avg       0.95      0.88      0.91       359



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
# Same report for the re-thresholded `new_gender` labels.
new_gender_report = classification_report(df['GENDER_MANUAL'], df['new_gender'])
print(new_gender_report)

              precision    recall  f1-score   support

      female       0.95      0.91      0.93       238
        male       0.96      0.85      0.90       121
     unknown       0.00      0.00      0.00         0

    accuracy                           0.89       359
   macro avg       0.64      0.59      0.61       359
weighted avg       0.96      0.89      0.92       359



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
