In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# This needs to be defined by the drop down
subgroup = "woman"

In [3]:
data = load_dataset("c4", "en", split= "train", streaming = True)

In [4]:
grab_n = 5000
# For streaming data
print('Note: Just taking the first %s instances.' % grab_n)
data_head = data.take(grab_n)
df = pd.DataFrame(data_head)
# If not streaming, use:
#df = pd.json_normalize(data)

Note: Just taking the first 1000 instances.


In [5]:
def count_vocab_frequencies(df):
    """
    Based on an input pandas DataFrame with a 'text' column, 
    this function will count the occurrences of all words
    with a frequency higher than 'cutoff' and will return another DataFrame
    with the rows corresponding to the different vocabulary words
    and the column to the count count of that word.
    """
    # Move this up as a constant in larger code.
    batch_size = 10
    
    # We do this to calculate per-word statistics
    df['text'] = df['text'].str.lower()
    # Regex for pulling out single words
    cvec = CountVectorizer(token_pattern=u"(?u)\\b\\w+\\b", lowercase=True)
    
    # We also do this because we need to have the tokenization per sentence 
    # so that we can look at co-occurrences of words across sentences for nPMI calculation
    sent_tokenizer = cvec.build_tokenizer()
    df['tokenized'] = df.text.apply(sent_tokenizer)
    
    # Fast calculation of single word counts
    cvec.fit(df.text)
    document_matrix = cvec.transform(df.text)
    batches = np.linspace(0, df.shape[0], batch_size).astype(int)
    i = 0
    tf = []
    while i < len(batches) - 1:
        batch_result = np.sum(document_matrix[batches[i]:batches[i+1]].toarray(), axis=0)
        tf.append(batch_result)
        i += 1
    term_freq_df = pd.DataFrame([np.sum(tf, axis=0)], columns=cvec.get_feature_names()).transpose()
    
    # Now organize everything into the dataframes
    term_freq_df.columns = ['count']
    term_freq_df.index.name = 'word'
    sorted_term_freq_df = pd.DataFrame(term_freq_df.sort_values(by='count', ascending=False)['count'])
    return sorted_term_freq_df, df

In [6]:
term_df, df = count_vocab_frequencies(df)
# p(word).  Note that multiple occurrences of a word in a sentence increases its probability.
term_df['proportion'] = term_df['count']/float(sum(term_df['count']))
# Sanity check
print(term_df.head())
print(term_df.tail())

      count  id  proportion  rank
word                             
the   19063   0    0.050159     1
to    11027   1    0.029015     2
and   10996   2    0.028933     3
of     8792   3    0.023134     4
a      8496   4    0.022355     5
             count     id  proportion  rank
word                                       
elmo             1  27227    0.000003   343
ellinor          1  27228    0.000003   343
peloponnese      1  27229    0.000003   343
pem              1  27230    0.000003   343
赵维山              1  27231    0.000003   343


In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

# Makes a sparse vector (shape: # sentences x # words),
# with the count of each word per sentence.
mlb = MultiLabelBinarizer()
df_mlb = pd.DataFrame(mlb.fit_transform(df['tokenized']))
# Index of the subgroup word in the sparse vector
subgroup_idx = np.where(mlb.classes_ == subgroup)[0][0]
# Dataframe for the subgroup (with counts)
df_subgroup = df_mlb.iloc[:, subgroup_idx]
# Create cooccurence matrix for the given subgroup and all other words.
# Note it also includes the word itself, so that count should be subtracted 
# (the word will always co-occur with itself)
df_coo = pd.DataFrame(df_mlb.T.dot(df_subgroup))

In [9]:
# PMI(x;y) = h(y) - h(y|x)
#          = h(subgroup) - h(subgroup|word)
#          = log p(subgroup|word) - log p(subgroup))

# log p(subgroup)
subgroup_prob = np.log(term_df.loc[subgroup]['proportion'])
# Apply a function to all words to calculate log p(subgroup|word)
# The word is indexed by mlb.classes_ ; 
# we pull out the word using the index and then get its count using our main term_df
# x[1] is the count of the word, given the subgroup
pmi_df = pd.DataFrame(df_coo.apply(lambda x: np.log(x[1]/term_df.loc[mlb.classes_[x.index]]['count']) - subgroup_prob))

In [24]:
# If all went well, this will be correlated with high/low frequency words
# Until normalizing
print(pmi_df.sort_values(by=[0])[:50])

              0
word           
the   -0.671011
to    -0.123608
and   -0.120793
of     0.102896
a      0.137143
in     0.400485
is     0.758101
for    0.814873
you    0.828174
that   0.951522
it     1.055909
i      1.113900
with   1.127750
on     1.244979
s      1.381876
are    1.415961
be     1.446006
as     1.446442
your   1.495123
this   1.513133
we     1.694523
have   1.699002
or     1.702938
at     1.724579
can    1.748466
was    1.774147
from   1.780215
will   1.833336
by     1.889437
not    1.970726
an     2.045627
all    2.099430
but    2.127457
they   2.135239
if     2.175989
has    2.223198
our    2.223198
more   2.249123
my     2.266788
one    2.281751
so     2.290837
their  2.322782
t      2.391028
about  2.432223
what   2.463068
up     2.482533
he     2.525200
there  2.525200
time   2.540704
also   2.544618
