In [1]:
from packages import *
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.data.experimental import AUTOTUNE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, r2_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import re
from collections import Counter
%load_ext autoreload
%autoreload 2

# Model
We will be building our models in this notebook.

Some configuration required for tensorflow.

In [2]:
epsilon = 1e-7
fp = 'float32'
tf.config.gpu.set_per_process_memory_growth(True)
tf.config.gpu.set_per_process_memory_fraction(.2)
tf.keras.backend.set_floatx(fp)
tf.keras.backend.set_epsilon(epsilon)
tf.keras.backend.set_image_data_format('channels_last')

## Dataset

In [3]:
batch_size = 16
glove_dim = 50
input_shape = (300, glove_dim, 1)
max_length = 300
shuffle_buffer_size = batch_size*4
prefetch_buffer_size = 1
random_seed = np.random.randint(0, 100)
test_ratio = 0.33

In [4]:
def shuffle_sample(dictionary, n_sample=None, random_seed=42):
    lens = [len(l) for l in dictionary.values()]
    assert min(lens) == max(lens)
    n_data = lens[0]
    processed = {}
    for key, array in dictionary.items():
        if n_sample is not None:
            processed[key] = shuffle(array, random_state=random_seed)[:n_sample]
        else:
            processed[key] = shuffle(array, random_state=random_seed)[:n_sample]
    return processed

In [5]:
data = shuffle_sample(load_data({'review': ['text', 'stars']})['review'],
                             random_seed=random_seed
                            )
x_train, x_test, y_train, y_test = train_test_split(data['text'],
                                                    data['stars'],
                                                    test_size = test_ratio,
                                                   )

In [30]:
n = len(x_train)
num_batches = n // batch_size 

In [8]:
glove_lookup = load_pickle(os.path.join(GLOVE_DIR, 'glove-{}D.pkl'.format(glove_dim)))
unk_key = '<UNK>'

In [9]:
unk_vector = np.mean(np.array(list(glove_lookup.values())), axis=0)
glove_lookup[unk_key] = unk_vector

In [10]:
train_dataset_text = tf.data.Dataset.from_tensor_slices(x_train)
test_dataset_text = tf.data.Dataset.from_tensor_slices(x_test)

train_dataset_stars = tf.data.Dataset.from_tensor_slices(y_train - 1) # to make stars 0-indexed
test_dataset_stars = tf.data.Dataset.from_tensor_slices(y_test - 1)

In [11]:
def clean_token(t):
    t = t.lower()
    m = re.match('^[^\w\'](\w+).*', t)
    if m is not None:
        return m.group(1)
    else:
        return t

def tokenize(s):
    tokens = []
    for t in word_tokenize(s):
        tokens.append(clean_token(t))
    return tokens

# Returns an np.array of glove embeddings for each
# word in the given string of shape (word_count, glove_dims)
def get_glove_embeddings(tokens):
    embeddings = []
    
    for i, word in enumerate(tokens):
#         if i > 
        if word in glove_lookup:
            embeddings.append(glove_lookup[word])
        else:
            embeddings.append(np.zeros(glove_dim))
            
#     return(np.array(embeddings, dtype=np.float16))
    return(np.array(embeddings, dtype=np.float32))

def test_embed(arr):
    return get_glove_embeddings(tokenize(arr))

def embed(tensor):
    return get_glove_embeddings(tokenize(str(tensor.numpy())))

In [13]:
# %%time
# for i in range(128 * 100):
#     _=test_embed(x_train[i])

In [14]:
@tf.function
def fix_dimensions(tensor):
    return tf.image.resize_image_with_crop_or_pad(tf.expand_dims(tensor, axis=-1), 300, glove_dim)

In [15]:
train_dataset_embed = tf.data.Dataset.map(train_dataset_text,  lambda review: tf.py_function( embed, [review], tf.float32 ), num_parallel_calls=AUTOTUNE) 
test_dataset_embed = tf.data.Dataset.map(test_dataset_text,  lambda review: tf.py_function( embed, [review], tf.float32 ),num_parallel_calls=AUTOTUNE) 

In [16]:
train_dataset_embed = tf.data.Dataset.map(train_dataset_embed, fix_dimensions, num_parallel_calls=AUTOTUNE) 
test_dataset_embed = tf.data.Dataset.map(test_dataset_embed, fix_dimensions, num_parallel_calls=AUTOTUNE) 

In [17]:
train_dataset = tf.data.Dataset.zip((train_dataset_embed, train_dataset_stars))
test_dataset = tf.data.Dataset.zip((test_dataset_embed, test_dataset_stars))

In [18]:
train_dataset = tf.data.Dataset.shuffle(train_dataset, buffer_size=shuffle_buffer_size)
test_dataset = tf.data.Dataset.shuffle(test_dataset, buffer_size=shuffle_buffer_size)

In [19]:
train_dataset = tf.data.Dataset.batch(train_dataset, batch_size=batch_size)
test_dataset = tf.data.Dataset.batch(test_dataset, batch_size=batch_size)

In [20]:
train_dataset = tf.data.Dataset.prefetch(train_dataset, buffer_size=prefetch_buffer_size)
test_dataset = tf.data.Dataset.prefetch(test_dataset, buffer_size=prefetch_buffer_size)

In [34]:
X = []
Y = []
for x_batch, y_batch in train_dataset.take(1000 // batch_size):
    X.append(x_batch)
    Y.append(y_batch)

KeyboardInterrupt: 

In [35]:
len(X)

2357

In [25]:
y_batch

<tf.Tensor: id=855, shape=(16,), dtype=int16, numpy=array([4, 4, 0, 4, 2, 4, 4, 4, 1, 4, 3, 2, 0, 2, 4, 4], dtype=int16)>

6250

In [32]:
num_batches * batch_size

4479552

In [27]:
x_batch.numpy()

array([[[[ 0.      ],
         [ 0.      ],
         [ 0.      ],
         ...,
         [ 0.      ],
         [ 0.      ],
         [ 0.      ]],

        [[ 0.      ],
         [ 0.      ],
         [ 0.      ],
         ...,
         [ 0.      ],
         [ 0.      ],
         [ 0.      ]],

        [[ 0.      ],
         [ 0.      ],
         [ 0.      ],
         ...,
         [ 0.      ],
         [ 0.      ],
         [ 0.      ]],

        ...,

        [[ 0.      ],
         [ 0.      ],
         [ 0.      ],
         ...,
         [ 0.      ],
         [ 0.      ],
         [ 0.      ]],

        [[ 0.      ],
         [ 0.      ],
         [ 0.      ],
         ...,
         [ 0.      ],
         [ 0.      ],
         [ 0.      ]],

        [[ 0.      ],
         [ 0.      ],
         [ 0.      ],
         ...,
         [ 0.      ],
         [ 0.      ],
         [ 0.      ]]],


       [[[ 0.08085 ],
         [-0.43118 ],
         [ 0.15614 ],
         ...,
         [-0.028

## Models

### Baseline

In [18]:
class BaselineModel:
    def __init__(self):
        self.model = LinearSVC()
        self.vectorizer = CountVectorizer(max_features = 400000) #number of entries in GLoVe
        
    def train(self, reviews, stars):
        x = self.vectorizer.fit_transform(reviews)
        self.model.fit(x, stars)

    def predict(self, reviews):
        x = self.vectorizer.transform(reviews)
        predictions = self.model.predict(x)
        return predictions
        

In [None]:
for x, y in

In [50]:
n_train = 1000000

In [47]:
monster = BaselineModel()

In [None]:
monster.train(x_train[:n_train], y_train[:n_train])

In [None]:
predictions = monster.predict(x_test[:n_train])

### BabyBlueberry

In [19]:
class BabyBlueberry:

    def build(input_shape = input_shape, star_units=5):
        inputs = Input(shape=input_shape)
        x = Conv2D(16, (3, 3), activation='relu', name='block1_conv1', padding='same')(inputs)
        x = Conv2D(16, (3, 3), activation='relu', name='block1_conv2', padding='same')(x)
        x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
        x = Conv2D(32, (3, 3), activation='relu', name='block2_conv1', padding='same')(x)
        x = Conv2D(32, (3, 3), activation='relu', name='block2_conv2', padding='same')(x)
        x = Conv2D(32, (3, 3), activation='relu', name='block2_conv3', padding='same')(x)
        x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
        x = Conv2D(64, (3, 3), activation='relu', name='block3_conv1', padding='same')(x)
        x = Conv2D(64, (3, 3), activation='relu', name='block3_conv2', padding='same')(x)
        x = Conv2D(64, (3, 3), activation='relu', name='block3_conv3', padding='same')(x)
        x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x) 
        x = Conv2D(128, (3, 3), activation='relu', name='block4_conv1', padding='same')(x)
        x = Conv2D(128, (3, 3), activation='relu', name='block4_conv2', padding='same')(x)
        x = Conv2D(128, (3, 3), activation='relu', name='block4_conv3', padding='same')(x)
        x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x) 
        x = Flatten(name='flatten')(x)
        x = Dropout(rate=.2, name='dropout')(x)
        out = Dense(star_units, activation='softmax', name='output')(x)
        return tf.keras.models.Model(inputs=inputs, outputs=out)
        
    @tf.function
    def loss_fn(truth, logits):
        loss = tf.keras.losses.sparse_categorical_crossentropy(truth, logits)
        return loss

In [53]:
%%time
for i in train_dataset.take(100):
    pass

CPU times: user 56.7 s, sys: 19.7 s, total: 1min 16s
Wall time: 42.7 s


In [20]:
baby = BabyBlueberry.build()

In [21]:
adam = tf.keras.optimizers.Adam()

In [22]:
baby.compile(optimizer=adam, metrics=['accuracy'] ,loss=BabyBlueberry.loss_fn)

In [23]:
baby.fit(train_dataset, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f34fbf2ffd0>

In [24]:
baby.evaluate(test_dataset.take(1000))



[0.8017106358408927, 0.16546875]

In [25]:
baby.evaluate(train_dataset.take(10))



[0.7470987379550934, 0.175]

In [126]:
# dists, ix = neighbors(glove_lookup['female'] + glove_lookup['king'], glove_lookup)

In [125]:
# dists[dists.columns[ix[::-1]]]