In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Identity terms whose word associations are compared. These are placeholders:
# eventually they should be supplied by the drop-down in the UI.
# NOTE(review): the cells visible here only compare subgroup1 and subgroup2, and
# "non-binary" contains a hyphen, which the single-word token pattern used for
# counting would split into two tokens -- confirm before relying on subgroup3.
subgroup1, subgroup2, subgroup3 = (
    "woman",
    "man",
    "non-binary",
)

In [3]:
# Open the English ("en") configuration of C4, training split, in streaming mode
# so examples are read as they are iterated rather than downloaded up front.
# NOTE(review): recent `datasets` releases may expect the namespaced dataset id
# "allenai/c4" instead of "c4" -- confirm against the installed version.
data = load_dataset("c4", "en", split= "train", streaming = True)

In [None]:
# Number of leading examples to pull from the stream.
grab_n = 10000
# For streaming data: restrict the stream to its first `grab_n` examples.
data_head = data.take(grab_n)
print('Note: Just taking the first %s instances.' % grab_n)
# Only the 'text' field of each example is kept.
df = pd.DataFrame(data_head, columns=["text"])
# If the dataset is not streamed, build the frame with this instead:
#   df = pd.json_normalize(data)
# For a quick offline smoke test, a tiny in-memory sample can stand in for `data_head`, e.g.:
#   data_head = [["there is a woman with a hairbrush"], ["there is a man with a dog"]]

Note: Just taking the first 10000 instances.


In [None]:
def count_vocab_frequencies(df):
    """
    Count how often each vocabulary word occurs in the 'text' column of `df`.

    The text is lower-cased and split into single-word tokens, and the
    occurrences of every token are summed over all rows. No frequency cutoff
    is applied: every token that occurs at least once is kept.

    Note that `df` is modified in place: its 'text' column is lower-cased and
    a 'tokenized' column (the list of tokens of each row) is added.

    Args:
        df: pandas DataFrame with a string 'text' column.

    Returns:
        A tuple (sorted_term_freq_df, df):
        sorted_term_freq_df is a DataFrame indexed by 'word' with a single
        'count' column, sorted by descending count; df is the same (mutated)
        DataFrame that was passed in.
    """
    # Lower-case explicitly: the tokenizer obtained from build_tokenizer()
    # below is applied to the raw column and does not lower-case by itself.
    df['text'] = df['text'].str.lower()
    # Regex for pulling out single words
    cvec = CountVectorizer(token_pattern=r"(?u)\b\w+\b", lowercase=True)

    # Keep the tokenization per document as well, so that co-occurrences of
    # words within a document can be looked at later for the nPMI calculation.
    sent_tokenizer = cvec.build_tokenizer()
    df['tokenized'] = df.text.apply(sent_tokenizer)

    # Fast calculation of single word counts: sum the sparse document-term
    # matrix column-wise directly instead of densifying it batch by batch.
    document_matrix = cvec.fit_transform(df.text)
    word_counts = np.asarray(document_matrix.sum(axis=0)).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back on versions that predate it.
    if hasattr(cvec, 'get_feature_names_out'):
        vocabulary = cvec.get_feature_names_out()
    else:
        vocabulary = cvec.get_feature_names()

    # Now organize everything into the dataframes
    term_freq_df = pd.DataFrame(
        {'count': word_counts},
        index=pd.Index(vocabulary, name='word'),
    )
    sorted_term_freq_df = term_freq_df.sort_values(by='count', ascending=False)
    return sorted_term_freq_df, df

In [None]:
term_df, df = count_vocab_frequencies(df)
# p(word): each word's share of all counted tokens. Repeated occurrences of a
# word within one document all count towards its probability; we may want to
# revisit that.
term_df['proportion'] = term_df['count'] / float(term_df['count'].sum())
# Sanity check: most and least frequent words.
print(term_df.head())
print(term_df.tail())

In [None]:
def get_PMI(df_coo, subgroup):
    """
    Compute PMI between `subgroup` and every vocabulary word.

        PMI(subgroup; word) = log( p(subgroup | word) / p(subgroup) )
        p(subgroup | word)  = count(subgroup, word) / count(word)

    Relies on two module-level objects: `term_df` (per-word 'count' and
    'proportion', indexed by word) and `mlb` (whose `classes_` array maps the
    positional index of `df_coo` back to the word).

    Args:
        df_coo: DataFrame of co-occurrence counts with `subgroup`; its index
            holds positions into `mlb.classes_`.
        subgroup: the subgroup word; must be present in `term_df`.

    Returns:
        DataFrame indexed by word with a single 'pmi' column. Until it is
        normalized, this tends to track raw word frequency.
    """
    # p(subgroup)
    p_subgroup = term_df.loc[subgroup]['proportion']

    def _log_ratio(cooccurrence_col):
        # Translate positions into words, then look up each word's total count.
        vocab_words = mlb.classes_[cooccurrence_col.index]
        word_counts = term_df.loc[vocab_words]['count']
        # p(subgroup | word) = count(subgroup, word) / count(word)
        p_subgroup_given_word = cooccurrence_col.values / word_counts
        return np.log(p_subgroup_given_word / p_subgroup)

    # apply() calls the helper once per column of df_coo.
    pmi_df = pd.DataFrame(df_coo.apply(_log_ratio))
    pmi_df.columns = ['pmi']
    return pmi_df

In [None]:
def get_nPMI(pmi_df, df_coo):
    """
    Normalize PMI scores: nPMI = PMI / -log p(subgroup, word).

    The joint probability is computed as p(subgroup | word) * p(word), i.e.
    (count(subgroup, word) / count(word)) * proportion(word), using the
    module-level `term_df` and `mlb` objects.

    Args:
        pmi_df: DataFrame with a 'pmi' column indexed by word.
        df_coo: DataFrame of co-occurrence counts whose index holds positions
            into `mlb.classes_` and whose counts are in the column labelled 0.

    Returns:
        DataFrame indexed by word with a single 'npmi' column.
    """
    def _neg_log_joint(cooccurrence_col):
        # Per-word statistics, in the same order as the co-occurrence counts.
        word_stats = term_df.loc[mlb.classes_[cooccurrence_col.index]]
        p_joint = cooccurrence_col.values / word_stats['count'] * word_stats['proportion']
        return -np.log(p_joint)

    normalize_df = pd.DataFrame(df_coo.apply(_neg_log_joint))
    npmi_df = (pmi_df['pmi'] / normalize_df[0]).to_frame('npmi')
    return npmi_df

In [None]:
def get_count(df_coo, subgroup):
    """
    Re-label the co-occurrence counts by word.

    `df_coo` is indexed by positions into the module-level `mlb.classes_`.
    DataFrame.apply hands each column to the helper as a Series; the helper
    rebuilds it with the corresponding words as its index, and apply
    reassembles the relabelled columns into a DataFrame.

    Args:
        df_coo: DataFrame of co-occurrence counts indexed by vocabulary position.
        subgroup: not used by this function.

    Returns:
        DataFrame indexed by word with a single 'count' column.
    """
    def _relabel_by_word(cooccurrence_col):
        return pd.Series(cooccurrence_col.values, mlb.classes_[cooccurrence_col.index])

    count_df = pd.DataFrame(df_coo.apply(_relabel_by_word))
    count_df.columns = ['count']
    return count_df

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Builds a binary document-by-word indicator matrix (shape: # documents x # words):
# an entry is 1 if the word occurs in the document at all and 0 otherwise, so
# repeated occurrences within one document are not counted.
# NOTE(review): with the default settings this matrix is dense, so memory grows
# with documents x vocabulary; MultiLabelBinarizer(sparse_output=True) may be
# worth considering for larger samples.
mlb = MultiLabelBinarizer()
df_mlb = pd.DataFrame(mlb.fit_transform(df['tokenized']))

# Calculates PMI metrics
paired_results = pd.DataFrame()
# NOTE(review): not used by the cells visible here -- confirm before removing.
results_dict = {}
for subgroup in (subgroup1, subgroup2):
    # Index of the subgroup word in the indicator matrix. Fail with a clear
    # message when the term never occurs as a single token (for example because
    # it is absent from the sample, or contains punctuation the tokenizer splits on).
    subgroup_matches = np.where(mlb.classes_ == subgroup)[0]
    if len(subgroup_matches) == 0:
        raise ValueError(
            "Subgroup term %r does not occur as a single token in the data; "
            "cannot compute its co-occurrences." % subgroup
        )
    subgroup_idx = subgroup_matches[0]
    # Indicator column for the subgroup (1 where the document contains it)
    df_subgroup = df_mlb.iloc[:, subgroup_idx]
    # Create cooccurence matrix for the given subgroup and all other words:
    # for every word, the number of documents containing both it and the subgroup.
    # Note it also includes the word itself, so that count should maybe be subtracted
    # (the word will always co-occur with itself)
    print('Calculating co-occurrences')
    df_coo = pd.DataFrame(df_mlb.T.dot(df_subgroup))
    print('Getting counts for subgroup...')
    count_df = get_count(df_coo, subgroup)
    print(count_df)
    print('Calculating PMI...')
    pmi_df = get_PMI(df_coo, subgroup)
    print(pmi_df)
    print('Calculating nPMI...')
    npmi_df = get_nPMI(pmi_df, df_coo)
    print(npmi_df)
    # One column per metric and subgroup, aligned on the word index.
    paired_results[subgroup + '-pmi']  = pmi_df['pmi']
    paired_results[subgroup + '-npmi'] = npmi_df['npmi']
    paired_results[subgroup + '-count'] = count_df['count']

In [None]:
# Show only the words that have a value in every column; rows containing NaN
# are left out of the printout (paired_results itself is not modified here).
print(paired_results.dropna())

In [None]:
# Bias score per word: nPMI with subgroup1 minus nPMI with subgroup2 (woman - man).
# A negative score means the word is man-biased; a positive score means it is woman-biased.
npmi_bias = paired_results[subgroup1 + '-npmi'].sub(paired_results[subgroup2 + '-npmi'])
paired_results['npmi_bias'] = npmi_bias.dropna()
# Keep only the words that have a complete set of scores.
paired_results = paired_results.dropna()

In [None]:
# Number of top-ranked words used in the bias summaries and listings below.
n = 20

In [None]:
# Rank by bias score before summing: the n most negative scores belong to the
# most subgroup2-leaning words and the n most positive to the most
# subgroup1-leaning ones. paired_results is not sorted by bias, so slicing it
# directly would only pick the first/last n rows in index order.
print("@%s, the %s bias is:\t%.2f" % (n, subgroup2, np.abs(paired_results.npmi_bias.nsmallest(n).sum())))
print("@%s, the %s bias is:\t%.2f" % (n, subgroup1, paired_results.npmi_bias.nlargest(n).sum()))

In [None]:
print("Top %s most %s-biased words" % (n, subgroup2))
# Most negative bias scores first; the cell displays the first n of them.
paired_results.npmi_bias.sort_values(ascending=True).iloc[:n]

In [None]:
print("Top %s most %s-biased words" % (n,subgroup1))
# Take the n largest bias scores from the ascending ranking, then show them
# from most to least biased.
paired_results.npmi_bias.sort_values(ascending=True).iloc[-n:].sort_values(ascending=False)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
paired_results.info()

In [None]:
# Preview the first rows of the per-word results table.
print(paired_results.head())