In [49]:
from collections import Counter
import gc
import json
import numpy as np
import os
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
import spacy

In [14]:
def load_glove(dim, glove_dir):
    """Load glove vectors into a dictionary mapping word to vector.
    
    Parameters
    -----------
    dim: int
        Size of embedding. One of (50, 100, 200, 300).
    glove_dir: str
        Path to directory containing glove files. The file read is
        `glove.6B.{dim}d.txt` inside this directory.
        
    Returns
    --------
    Dictionary where keys are words and values are {dim}-dimensional ndarrays
    of dtype float.
    """
    w2vec = dict()
    path = os.path.join(glove_dir, f'glove.6B.{dim}d.txt')
    # Be explicit about the encoding so the result does not depend on the
    # platform's default (the vocabulary contains non-ASCII words).
    with open(path, 'r', encoding='utf-8') as f:
        for row in f:
            items = row.split()
            # Skip blank lines (e.g. a trailing newline at the end of file)
            # instead of failing on items[0].
            if not items:
                continue
            # First field is the word, the remaining fields are the vector.
            w2vec[items[0]] = np.array(items[1:], dtype=float)
    return w2vec

In [15]:
def save_pickle(obj, fname, dir_name='data'):
    """Wrapper to quickly save a pickled object.

    Parameters
    -----------
    obj: any picklable object
        Object to serialize.
    fname: str
        File name without extension; '.pkl' is appended.
    dir_name: str
        Directory to write to. Created (including any missing parent
        directories) if it does not exist yet.

    Returns
    --------
    None. Prints the path that was written.
    """
    # makedirs with exist_ok handles nested paths and avoids the race
    # between an existence check and the directory creation.
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    path = os.path.join(dir_name, f'{fname}.pkl')
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
    print(f'Data written to {path}.')

In [16]:
def load_pickle(fname, dir_name='data'):
    """Read back the object stored in `{dir_name}/{fname}.pkl`.

    Parameters
    -----------
    fname: str
        File name without the '.pkl' extension.
    dir_name: str
        Directory containing the pickle file.

    Returns
    --------
    The unpickled object.
    """
    # Unpickling can execute arbitrary code; only load files you trust.
    pkl_path = os.path.join(dir_name, f'{fname}.pkl')
    with open(pkl_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)

In [17]:
def train_val_test_split(x, y, train_p, val_p, state=1, shuffle=True):
    """Wrapper to split data into train, validation, and test sets.
    
    Parameters
    -----------
    x: pd.DataFrame, np.ndarray
        Features
    y: pd.DataFrame, np.ndarray
        Labels
    train_p: float
        Fraction (between 0 and 1) of the data to assign to the train set.
    val_p: float
        Fraction (between 0 and 1) of the full data to assign to the
        validation set. Whatever remains after train and validation goes to
        the test set.
    state: int or None
        Int will make the split repeatable. None will give a different random
        split each time.
    shuffle: bool
        If True, randomly shuffle the data before splitting. If False, both
        splits keep the original row order.

    Returns
    --------
    Tuple (x_train, x_val, x_test, y_train, y_val, y_test).
    """
    # Share of the held-out (non-train) rows that should end up in the test
    # set, so that validation receives val_p of the *full* data.
    test_p = 1 - val_p/(1-train_p)
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x,
        y,
        train_size=train_p,
        shuffle=shuffle,
        random_state=state,
    )
    # Forward `shuffle` here as well: train_test_split shuffles by default,
    # so omitting it would reorder val/test even when shuffle=False.
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout,
        y_holdout,
        test_size=test_p,
        shuffle=shuffle,
        random_state=state,
    )
    return x_train, x_val, x_test, y_train, y_val, y_test

In [46]:
# Load only the columns we need, using compact dtypes for each of them.
dtypes = {'text': object, 'sex': 'category', 'age': np.int8}
df = pd.read_csv('data/sentences.csv', usecols=list(dtypes), dtype=dtypes)
df.shape

(7106744, 3)

In [22]:
# Print the index range, column dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8730028 entries, 0 to 8730027
Data columns (total 3 columns):
text    object
sex     category
age     int8
dtypes: category(1), int8(1), object(1)
memory usage: 83.3+ MB


In [24]:
# Number of whitespace-separated words in each row of the `text` column.
lengths = df.text.str.split().str.len()

In [25]:
# Inspect the distribution of words per row: minimum (0), selected
# percentiles, and maximum (1).
lengths.quantile([0, .05, .1, .2, .5, .75, .9, .95, .98, .99, 1])

0.00        1.0
0.05        2.0
0.10        3.0
0.20        5.0
0.50       12.0
0.75       20.0
0.90       29.0
0.95       37.0
0.98       51.0
0.99       67.0
1.00    10276.0
Name: text, dtype: float64

In [28]:
# Keep only rows whose word count is between 5 and 50, both ends inclusive.
df = df[lengths.between(5, 50)]
df.shape

(7106744, 3)

In [31]:
# Print the index, column dtypes and memory usage of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7106744 entries, 0 to 8730027
Data columns (total 3 columns):
text    object
sex     category
age     int8
dtypes: category(1), int8(1), object(1)
memory usage: 122.0+ MB


In [50]:
# Note: tokenizer is faster but leaves punctuation attached to words
# (NOTE(review): presumably a plain whitespace tokenizer is meant - confirm).
# NLTK tokenizer has issues with ellipses. Must disable parser, tagger, and
# ner in nlp() when working with the whole dataset to avoid memory issues.
nlp = spacy.load('en_core_web_sm')
# Raise the maximum text length nlp() accepts: the token-building code in
# this notebook passes the entire joined training corpus as one string.
nlp.max_length = 600_000_000

In [56]:
# Split data into train, validation, and test sets (96% / 2% / 2%).
data = train_val_test_split(
    df.text,
    df[['sex', 'age']],
    train_p=.96,
    val_p=.02,
    shuffle=True,
    state=1,
)
x_train, x_val, x_test, y_train, y_val, y_test = data

# One line per split (train, val, test): feature shape then label shape.
print('\n'.join(f'{feats.shape} {labels.shape}'
                for feats, labels in zip(data[:3], data[3:])))

(6822474,) (6822474, 2)
(142134,) (142134, 2)
(142136,) (142136, 2)


In [52]:
# Persist the 6-tuple (x_train, x_val, x_test, y_train, y_val, y_test).
save_pickle(data, 'split_data')

Data written to data/split_data.pkl.


In [54]:
def build_word_mappings(x_train, nlp, glove_dir, min_count=5, dim=300):
    """Generate word to count, word to index, and word to vector mappings.

    Every mapping is pickled with save_pickle (as 'w2count', 'w2idx', 'i2w'
    and 'w2vec') and also returned, so callers do not have to reload them
    from disk.

    Parameters
    -----------
    x_train: pd.Series
        Training texts. All of them are joined with spaces and tokenized as
        a single document.
    nlp: spacy language pipeline
        Called with the parser, tagger and ner components disabled.
    glove_dir: str
        Path to directory containing glove files.
    min_count: int
        Minimum number of occurrences a token needs to be kept in the
        vocabulary.
    dim: int
        Size of the glove embeddings to load. One of (50, 100, 200, 300).

    Returns
    --------
    Tuple (w2count, w2idx, i2w, w2vec). Note that these objects are large:
    in a notebook, assign the result (or end the cell with `;`) so it is not
    rendered as cell output.
    """
    # Map each token to the # of times it appears in the corpus. Count
    # straight from a generator so the full token list is never held in
    # memory; whitespace-only tokens are stripped to '' and skipped.
    doc = nlp(' '.join(x_train.values), disable=['parser', 'tagger', 'ner'])
    counts = Counter(text for text in (t.text.strip() for t in doc) if text)
    del doc
    w2count = {word: n for word, n in counts.items() if n >= min_count}
    save_pickle(w2count, 'w2count')

    # Construct w2idx dict and i2w list. Words are numbered from 2 upwards in
    # order of increasing count; 0 and 1 are reserved for <PAD> and <UNK>.
    w2idx = {k: i for i, (k, v) in
             enumerate(sorted(w2count.items(), key=lambda x: x[1]), 2)}
    w2idx['<PAD>'] = 0
    w2idx['<UNK>'] = 1
    i2w = [k for k, v in sorted(w2idx.items(), key=lambda x: x[1])]
    save_pickle(w2idx, 'w2idx')
    save_pickle(i2w, 'i2w')

    # Load word vectors and filter to include words in our vocab.
    w2vec = load_glove(dim, glove_dir)
    w2vec = {k: v for k, v in w2vec.items() if k in w2idx}
    save_pickle(w2vec, 'w2vec')
    return w2count, w2idx, i2w, w2vec

In [36]:
# Tokenize the whole training corpus as a single document and keep the
# whitespace-stripped text of every token that is not empty afterwards.
tokens = [
    stripped
    for stripped in (
        tok.text.strip()
        for tok in nlp(' '.join(x_train.values),
                       disable=['parser', 'tagger', 'ner'])
    )
    if stripped
]

In [37]:
# Count every token and keep only those seen more than 4 times.
w2count = {word: n for word, n in Counter(tokens).items() if n > 4}
len(w2count)

190295

In [38]:
# Number the words from 2 upwards in order of increasing count; indices 0
# and 1 are reserved for the padding and unknown-word markers.
w2idx = {word: idx
         for idx, word in enumerate(sorted(w2count, key=w2count.get), 2)}
w2idx.update({'<PAD>': 0, '<UNK>': 1})

In [39]:
# Inverse lookup: the words of w2idx ordered by their index.
i2w = sorted(w2idx, key=w2idx.get)

In [40]:
# Directory holding the GloVe text files. Set the GLOVE_DIR environment
# variable to override the machine-specific default below.
glove_dir = os.environ.get('GLOVE_DIR', '/Users/hmamin/data/glove/')

# Load word vectors and filter to include words in our vocab.
w2vec = load_glove(300, glove_dir)
w2vec = {k: v for k, v in w2vec.items() if k in w2idx}
len(w2vec)

73985

In [41]:
# Persist the token list and the word mappings to the default `data` dir.
# NOTE(review): i2w is not saved here although build_word_mappings pickles
# it - confirm whether it should be written as well.
save_pickle(tokens, 'tokens')
save_pickle(w2count, 'w2count')
save_pickle(w2idx, 'w2idx')
save_pickle(w2vec, 'w2vec')

Data written to data/tokens.pkl.
Data written to data/w2count.pkl.
Data written to data/w2idx.pkl.
Data written to data/w2vec.pkl.


## Re-load word mappings

In [42]:
def encode(text, w2idx, nlp):
    """Map each word in a post to its index in the embedding matrix. Posts
    retain their original lengths for now (no padding or truncation).

    Parameters
    -----------
    text: str
        Post to encode.
    w2idx: dict
        Maps word to index. Must contain the key '<UNK>', whose index is
        used for every word missing from the mapping.
    nlp: spacy language pipeline
        Called with the parser, tagger and ner components disabled.

    Returns
    --------
    List of int indices, one per non-whitespace token.
    """
    unk = w2idx['<UNK>']
    doc = nlp(text, disable=['parser', 'tagger', 'ner'])
    # Strip tokens and drop the empty ones, exactly as the vocabulary was
    # built; otherwise whitespace-only tokens would be encoded as <UNK>.
    words = (token.text.strip() for token in doc)
    return [w2idx.get(word, unk) for word in words if word]

In [43]:
# Reload the previously pickled objects from the default `data` directory.
# Unpickling can execute arbitrary code; only load files you created.
tokens = load_pickle('tokens')
w2count = load_pickle('w2count')
w2idx = load_pickle('w2idx')
w2vec = load_pickle('w2vec')

In [44]:
# Sanity check: encode the first training text as a list of word indices.
encode(x_train.values[0], w2idx, nlp)

[179660,
 190256,
 190281,
 190146,
 189840,
 91279,
 190254,
 190290,
 181001,
 187235,
 190296]

In [45]:
# Only have pre-trained vectors for 38.9% of the unique vocab for our corpus.
# However, this makes up 86.7% of the total words in our corpus.
# The first figure is the share of w2idx keys (which include <PAD> and <UNK>)
# found in w2vec; the second is the share of all corpus tokens found in it.
# NOTE(review): the glove.6B vocabulary is presumably lowercase while tokens
# here keep their original casing - lowercasing before the lookup may raise
# coverage; confirm.
pretrained_pct_unique = np.mean([w1 in w2vec for w1 in w2idx])
pretrained_pct_total = np.mean([t in w2vec for t in tokens])

print(round(pretrained_pct_unique, 3))
print(round(pretrained_pct_total, 3))

0.389
0.867


## TODO: Decide how to handle words that are in w2idx but not in w2vec. We could simply ignore them, but that is not ideal. One option is to initialize their rows to zero in the embedding matrix and make the embedding trainable, while zeroing the gradients for the pre-trained vectors so those stay frozen (see Stack Overflow: https://stackoverflow.com/questions/54924582/is-it-possible-to-freeze-only-certain-embedding-weights-in-the-embedding-layer-i/54952825#54952825).