# Preprocessing

We preprocessed the dataset once, before training, to improve the quality of the data and to store it in a ready-to-train format. This included removing HTML markup, replacing accented letters such as à/á/â with their unaccented counterparts, and replacing individual words with integer indices. Finally, we serialized and exported the dataset in TensorFlow's binary TFRecord file format (.tfrecords) for ease of use during training.

## Table of Contents
1. [Preprocessing](#Preprocessing)
2. [Serializing and exporting](#Serializing-and-exporting)

In [1]:
from packages import *
from nltk.tokenize import word_tokenize
import re
import unicodedata
import pickle
import concurrent.futures
import tensorflow as tf
from tensorflow.data.experimental import AUTOTUNE
%load_ext autoreload
%autoreload 2

## Preprocessing

In [7]:
# Dimensionality of the GloVe pickle to load (selects the 'glove-<dim>D.pkl' file).
glove_dim = 50
# Vocabulary: the keys of the pickled GloVe object (presumably a token -> vector
# mapping -- confirm against the pickle's contents).
tokens_to_keep = set(load_pickle(os.path.join(GLOVE_DIR, 'glove-{}D.pkl'.format(glove_dim))).keys())
# Raw review texts.
data = load_data({'review': ['text']})['review']['text']
# Peek at one raw review. The texts are bound to `data`; the name `review`
# used here previously is not defined by any cell above.
data[12]

"Wow. So surprised at the one and two star reviews!  We started with the most tender calamari. Although the marinara sauce was a bit bland, but a touch of salt made it just right. My husband had the veal with peppers and said it was so delicious and tender. The mashed potatoes were perfect. I had the salmon Diablo which was also delicious. Our salad was beautiful! Dressing was served on the salad and it was a nice amount. We ended our delicious meal with a piece of tiramisu. Our server Matt was right on!! Very pleasant and knowledgeable about the menu. Our appetizer, salad and entrees were timed perfectly. I love salad and did not mind that my entree was served while I was still eating it! No problem it let my dinner cool to just the right temp for me to eat it comfortably. \nI wonder sometimes if people just don't appreciate relaxing and taking time to eat a wonderful and beautifully prepared meal.  A wonderful atmosphere. So relaxing. The chairs are super comfortable too!!! We will c

In [5]:
# token_lengths = [(len(token), token) for token in tokens_to_keep]
# token_lengths.sort()
# token_lengths = token_lengths[::-1]

In [6]:
# Every review is padded or cut to exactly this many tokens
# (see upsample / downsample).
review_length = 300
# Tokens longer than this are replaced by UNK_KEY in clean_token.
max_token_length = 16
# NumPy fixed-width byte-string dtype sized to hold the longest allowed token.
dtype = '|S{}'.format(max_token_length)  # itemsize = 16

In [7]:
# Check that the two reserved keys do not collide with real vocabulary tokens.
tuple(key in tokens_to_keep for key in (NULL_KEY, UNK_KEY))

(False, False)

In [8]:
# A token that starts with exactly one character that is neither a word
# character nor an apostrophe, followed by at least one word character.
# Group 1 is that run of word characters; anything after it is discarded.
_LEADING_SYMBOL_RE = re.compile(r"^[^\w'](\w+).*")


def clean_token(t):
    """Normalize a single token, or map it to UNK_KEY.

    Steps, in order:
      1. Strip accents: NFKD-decompose, then drop every non-ASCII code point.
      2. Return UNK_KEY if the result is longer than ``max_token_length``.
      3. Lower-case, and return UNK_KEY if the token is not in
         ``tokens_to_keep``.
      4. If the token starts with a single stray symbol followed by word
         characters, keep only that run of word characters.

    NOTE(review): step 4 runs after the vocabulary lookup, so the stripped
    token is not re-checked against ``tokens_to_keep`` -- confirm that this
    ordering is intended.

    Args:
        t: token string as produced by the tokenizer.

    Returns:
        The cleaned token string, or UNK_KEY.
    """
    # NFKD can make the string longer before the non-ASCII marks are dropped,
    # which is why the length is checked only after normalization.
    t = unicodedata.normalize('NFKD', t).encode('ascii', 'ignore').decode()
    if len(t) > max_token_length:
        return UNK_KEY
    t = t.lower()
    if t not in tokens_to_keep:
        return UNK_KEY
    m = _LEADING_SYMBOL_RE.match(t)
    if m is not None:
        t = m.group(1)
    return t

def upsample(tokens):
    """Right-pad ``tokens`` with NULL_KEY until it holds ``review_length`` items.

    The padding is added with ``+=``, so a list argument is extended in place;
    the same object is also returned. Nothing is added when ``tokens`` is
    already long enough.
    """
    missing = review_length - len(tokens)
    padding = [NULL_KEY] * missing  # empty when missing <= 0
    tokens += padding
    return tokens

def downsample(tokens):
    """Shorten ``tokens`` to exactly ``review_length`` entries.

    Strategy: drop UNK_KEY tokens from the front of the sequence first. If all
    UNK_KEY tokens are gone and the sequence is still too long, trim the
    remainder evenly from both ends (the extra one comes off the end when the
    excess is odd).

    Args:
        tokens: list of token strings.

    Returns:
        A new list. Input that is already at or below ``review_length`` is
        returned as an unmodified copy.
    """
    excess = len(tokens) - review_length
    if excess <= 0:
        # Nothing to remove. Without this guard the loop below would strip
        # UNK_KEY tokens from a sequence that is not too long.
        return list(tokens)

    kept = []
    for i, t in enumerate(tokens):
        if excess == 0:
            # Enough UNK_KEY tokens removed: keep the rest untouched.
            kept.extend(tokens[i:])
            return kept
        if t == UNK_KEY:
            excess -= 1
        else:
            kept.append(t)
    if excess == 0:
        # The last UNK_KEY removed was the final token.
        return kept

    # Removing every UNK_KEY was not enough: trim both ends.
    beg = excess // 2
    end = excess - beg
    return kept[beg:len(kept) - end]

def tokenize(review):
    """Turn one review text into a fixed-length array of cleaned tokens.

    The text is split with ``word_tokenize``, every token goes through
    ``clean_token``, and the result is padded (``upsample``) or shortened
    (``downsample``) to ``review_length`` before being packed into an array
    of the module-level ``dtype``.
    """
    tokens = list(map(clean_token, word_tokenize(review)))
    count = len(tokens)
    if count > review_length:
        tokens = downsample(tokens)
    elif count < review_length:
        tokens = upsample(tokens)
    return np.array(tokens, dtype=dtype)

def preprocess(review):
    """Preprocess one raw review; currently this only delegates to ``tokenize``.

    This is the function handed to the process pool's ``map`` call.
    """
    return tokenize(review)

In [None]:
# Preprocess all reviews in parallel worker processes; map() yields results
# in input order.
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
    # chunksize sends reviews to the workers in batches instead of one task
    # per review, which cuts inter-process overhead on a long input.
    data_preprocessed = list(executor.map(preprocess, data, chunksize=1024))

In [33]:
# Number of .npy files the preprocessed reviews are spread over.
splits = 100
# Total number of preprocessed reviews.
N = len(data_preprocessed)
# Reviews per file, rounded down.
per_split = N // splits

In [None]:
# Write the preprocessed reviews as `splits` .npy files. They go into the
# 'npy' sub-directory because that is where the serialization step looks
# for them.
npy_dir = os.path.join(DATASET_DIR, 'preprocessed', 'npy')
os.makedirs(npy_dir, exist_ok=True)
for s in range(splits):
    start = s * per_split
    # per_split is rounded down, so the last file also takes the N % splits
    # leftover reviews; otherwise they would be silently dropped.
    stop = N if s == splits - 1 else start + per_split
    x = np.array(data_preprocessed[start:stop])
    np.save(os.path.join(npy_dir, 'review-text-{:02d}.npy'.format(s)), x,
            allow_pickle=False)

## Serializing and exporting

In [2]:
# Numeric settings passed to the Keras backend below.
epsilon = 1e-7
fp = 'float32'
# Limit how much GPU memory this process takes.
# NOTE(review): the tf.config.gpu.* functions look like the TensorFlow 2.0
# alpha API; later releases expose memory growth under
# tf.config.experimental -- confirm against the installed version.
tf.config.gpu.set_per_process_memory_growth(True)
tf.config.gpu.set_per_process_memory_fraction(.05)
# Keras backend defaults: float type, fuzz factor and image layout.
tf.keras.backend.set_floatx(fp)
tf.keras.backend.set_epsilon(epsilon)
tf.keras.backend.set_image_data_format('channels_last')

In [22]:
# When True, reviews are exported as integer indices rather than byte strings.
to_ix = True
if to_ix:
    all_key_to_ix = load_pickle(os.path.join(GLOVE_DIR, 'glove-byte-float32_to_ix.pkl'))
    # Reserved indices, ASSUMING ix 0 is for NULL and 1 is for UNK.
    keys_to_ix = {NULL_KEY.encode(): 0, UNK_KEY.encode(): 1}
    # Next free index, boxed in a list so that _to_ix can advance it in place.
    j = [2]

In [23]:
# Helper functions that make the feature definitions more readable
def _int64_feature(value):
    """Wrap an iterable of integers in a ``tf.train.Feature`` (``Int64List``)."""
    int_list = tf.train.Int64List(value=list(value))
    return tf.train.Feature(int64_list=int_list)

def _bytes_feature(value):
    """Wrap an iterable of byte strings in a ``tf.train.Feature`` (``BytesList``).

    A value of the same type as ``tf.constant(0)`` is converted with
    ``.numpy()`` first.
    """
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList won't unpack a string from an EagerTensor.
        value = value.numpy()
    byte_list = tf.train.BytesList(value=list(value))
    return tf.train.Feature(bytes_list=byte_list)

In [24]:
def serialize_example(review, to_ix):
    """
    Build a tf.train.Example holding one review and return it serialized.

    The single feature 'review' is stored as an int64 list when ``to_ix`` is
    truthy and as a bytes list otherwise.
    """
    if to_ix:
        review_feature = _int64_feature(review)
    else:
        review_feature = _bytes_feature(review)

    # Map the feature name to its tf.Example-compatible value.
    features = tf.train.Features(feature={'review': review_feature})
    example_proto = tf.train.Example(features=features)
    return example_proto.SerializeToString()

In [25]:
# Number of preprocessed .npy chunk files to convert (one input path per chunk).
chunk_count = 100

In [31]:
def _to_ix(review):
    """Map every token of ``review`` to its integer index.

    Known tokens are looked up in the module-level ``keys_to_ix``. An unseen
    token is assigned the next free index ``j[0]``, which is recorded in
    ``keys_to_ix`` before the counter is advanced.

    Returns:
        A list with one index per token, in input order.
    """
    indices = []
    for word in review:
        try:
            index = keys_to_ix[word]
        except KeyError:
            # First occurrence: register the token under the next free index.
            index = j[0]
            keys_to_ix[word] = index
            j[0] = index + 1
        indices.append(index)
    return indices

In [32]:
def _serialize(review, to_ix=False):
    """Serialize one review, converting its tokens to indices first if ``to_ix``."""
    payload = _to_ix(review) if to_ix else review
    return serialize_example(payload, to_ix)

In [8]:
# One path per preprocessed chunk (100 files).
review_paths = [
    os.path.join(
        DATASET_DIR,
        'preprocessed',
        'npy',
        'review-text-{:02d}.npy'.format(i),
    )
    for i in range(chunk_count)
]

In [30]:
# Round-trip sanity check: serialize one review and parse it back. Both sides
# must use the same to_ix setting, otherwise the parser expects a different
# feature type than the one that was written.
# NOTE(review): `review` is not defined by any cell shown in this notebook
# (it presumably came from an earlier interactive session), and
# `_parse_function` is defined in a later cell -- confirm before re-running.
_parse_function(_serialize(review[12], to_ix=to_ix), to_ix=to_ix)

{'review': <tf.Tensor: id=4, shape=(300,), dtype=int64, numpy=
 array([  2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
         15,  16,   7,  17,  18,  19,   3,  20,   7,  21,  22,  23,  24,
         25,  26,  27,  28,  24,  29,  30,  31,  32,  33,  34,  35,   3,
         36,  37,  38,   7,  39,  16,  40,   9,  41,  33,  23,   4,  42,
          9,  18,   3,   7,  43,  44,  45,  46,   3,  47,  38,   7,  48,
         49,  50,  23,  51,  42,   3,  52,  53,  23,  54,  13,  55,  23,
         56,  57,   7,  53,   9,  33,  23,  24,  58,  59,   3,  14,  60,
         52,  42,  61,  16,  24,  62,  30,  63,   3,  52,  64,  65,  23,
         35,  57,  13,  13,  66,  67,   9,  68,  69,   7,  70,   3,  52,
         71,  27,  53,   9,  72,  45,  73,  74,   3,  47,  75,  53,   9,
         76,  77,  78,  79,  36,  80,  23,  56,  81,  47,  23,  82,  83,
         33,  13,  84,  85,  33,  86,  36,  87,  88,  89,  34,   7,  35,
         90,  91,  92,  89,  93,  33,  94,   3,  47,  95,  96

In [9]:
# Convert every preprocessed .npy chunk into one TFRecord file.
tfrecord_kind = 'ix' if to_ix else 'text'
tfrecord_dir = os.path.join(DATASET_DIR, 'preprocessed', 'tfrecord', tfrecord_kind)
os.makedirs(tfrecord_dir, exist_ok=True)
for i, path in enumerate(review_paths):
    chunk = np.load(path, allow_pickle=False)
    # Serialize each review of the chunk into a tf.Example string.
    chunk_tf = [_serialize(x, to_ix=to_ix) for x in chunk]
    chunk_ds = tf.data.Dataset.from_tensor_slices(chunk_tf)
    writer_path = os.path.join(tfrecord_dir,
                               'review-{}-{:02d}.tf'.format(tfrecord_kind, i))
    writer = tf.data.experimental.TFRecordWriter(writer_path)
    writer.write(chunk_ds)
    print(i + 1, end=', ')  # progress indicator
if to_ix:
    # Persist the token -> index table built up while serializing.
    save_pickle(os.path.join(GLOVE_DIR, 'glove-byte-keys_to_ix.pkl'), keys_to_ix)

In [28]:
def _parse_function(proto, to_ix, length=300):
    """Parse one serialized tf.Example back into its 'review' feature.

    Args:
        proto: serialized tf.Example.
        to_ix: truthy if the review was stored as int64 indices; otherwise it
            is read as byte strings.
        length: number of tokens stored per review. Defaults to 300, the
            value that was previously hard-coded here.

    Returns:
        Dict with the single key 'review' holding a tensor of ``length``
        elements.
    """
    feature_dtype = tf.int64 if to_ix else tf.string
    # The feature spec must match how the review was written.
    keys_to_features = {'review': tf.io.FixedLenFeature([length], feature_dtype)}

    # Parse a single example.
    parsed_features = tf.io.parse_single_example(proto, keys_to_features)

    return parsed_features