In [None]:
import os
import re
import string
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D, Input, MaxPooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
print(tf.__version__)

# Load the data

First, we load the data. There are two classes, and every class has its own file. Our task is to discern clickbait titles from non-clickbait titles.

In [2]:
datadir = '~/shared'
# List the data directory in pure Python instead of shelling out to `ls`.
# This works on systems without `ls`, and a missing directory yields an empty
# list instead of the shell's error text being mistaken for file names.
data_root = os.path.expanduser(datadir)
if os.path.isdir(data_root):
    # Hidden entries are skipped, as a plain `ls` would do.
    files = sorted(name for name in os.listdir(data_root) if not name.startswith('.'))
else:
    files = []
files

['glove.6B.100d.txt',
 'glove.6B.200d.txt',
 'glove.6B.300d.txt',
 'glove.6B.50d.txt',
 'glove.6B.zip']

In [None]:
getfiles = ['clickbait_data.txt', 'non_clickbait_data.txt']
# Save the downloads in `datadir` (the directory the loading cell reads from)
# rather than in the working directory, and check each file individually so
# that a half-finished earlier download is completed.
target_dir = os.path.expanduser(datadir)
for file in getfiles:
    if file in files:
        continue
    url = "https://raw.githubusercontent.com/SnehilVerma/Clickbait-Detection/master/{}".format(file)
    # urlopen raises on HTTP errors, so an error page is never stored as data.
    with urllib.request.urlopen(url) as req:
        url_content = req.read()
    os.makedirs(target_dir, exist_ok=True)
    # The file is only created once the full payload has been received.
    with open(os.path.join(target_dir, file), 'wb') as csv_file:
        csv_file.write(url_content)

In [None]:
def _read_titles(path, label):
    """Read one headline per line from `path` into a DataFrame.

    The result has a 'text' column holding the non-empty lines verbatim and a
    'label' column filled with `label`. Plain line reading is used because
    recent pandas versions refuse a newline as the `read_csv` delimiter.
    """
    with open(path, encoding='utf-8') as handle:
        titles = [line.rstrip('\n') for line in handle]
    # Blank separator lines carry no headline.
    titles = [title for title in titles if title]
    frame = pd.DataFrame({'text': titles})
    frame['label'] = label
    return frame


# Clickbait headlines are the positive class.
file1 = os.path.join(os.path.expanduser(datadir), 'clickbait_data.txt')
click = _read_titles(file1, 1)

# Regular headlines are the negative class.
file2 = os.path.join(os.path.expanduser(datadir), 'non_clickbait_data.txt')
noclick = _read_titles(file2, 0)

data = pd.concat([click, noclick], ignore_index=True)


In [None]:
data

We obtain the size of all observations, and define a batchsize.

In [None]:
BATCH = 32            # observations per batch
SIZE = data.shape[0]  # total number of observations
SIZE

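Create a `tf.data.Dataset`. You can feed it the `text` and `label` columns as a single tuple, e.g. `(data['text'], data['label'])`. After that, shuffle the dataset with `buffer_size=SIZE` and batch the dataset. Because we split this dataset with `take` and `skip` below, pass `reshuffle_each_iteration=False` to `shuffle`; otherwise the data is reshuffled on every epoch and validation examples leak into the training set.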

In [None]:
# Exercise stub: first build the dataset from the (text, label) tuple,
# then shuffle it with buffer_size=SIZE and batch it (presumably with BATCH).
ds = 
ds = 

Check one batch visually with `take(1)`.

Create a train and test set with an 80% split. Remember that your dataset is batched, so you should use `SIZE/BATCH` as the total amount of items. `take` and `skip` expect a whole number of batches, so convert the result with `int(...)`.

Use `.take()` and `.skip()` to take the first n observations, and then skip the first n observations to create your sets.

In [None]:
# Exercise stub: train_n is the number of batches that go to the training set.
train_n = 
# The first train_n batches (`take`) form the training set ...
train_ds = 
# ... and the batches after them (`skip`) form the validation set.
val_ds = 

Use `.prefetch()` with `tf.data.experimental.AUTOTUNE` to prefetch the data. This speeds up performance.

In [None]:
# Exercise stub: AUTOTUNE should hold tf.data.experimental.AUTOTUNE, which is
# then passed to `.prefetch()` on both datasets.
AUTOTUNE = 
train_ds = 
val_ds = 

# Clean and preprocess the data
We can preprocess the text. First, it makes sense to change everything to lowercase with `tf.strings.lower`, and then to replace the punctuation with `tf.strings.regex_replace`.

In [None]:
# Escape the punctuation characters before placing them in a character class.
# Unescaped, the `\]` pair is read as an escaped bracket, so the backslash
# itself would never be matched.
punctuation = '[%s]' % re.escape(string.punctuation)

def custom_standardization(input_data):
    """Exercise stub: standardize raw text before vectorization.

    The two steps to fill in are lowercasing with `tf.strings.lower` and
    replacing the characters matched by `punctuation` with
    `tf.strings.regex_replace`.
    """
    x = 
    x = 
    return x

# NOTE(review): `x` is not defined in this cell; presumably it is the text of a
# batch inspected with `take(1)` earlier — confirm before running.
custom_standardization(x)

Create a `TextVectorization` layer. Pick a `vocab_size` and `sequence_length`, and add your `custom_standardization`.

In [None]:
%%time
# Exercise stub: pick a vocabulary size and the number of words in a sequence.
vocab_size = 
sequence_length =

# Use the text vectorization layer to normalize, split, and map strings to
# integers. The layer should use the custom standardization defined above.
# output_sequence_length is set because not all samples have the same length.
vectorize_layer = TextVectorization(
    standardize=
    max_tokens=
    output_mode='int',
    output_sequence_length= 
)

Now use `.adapt` to create the vocabulary.

In [None]:
# Exercise stub: the vocabulary is created by calling `.adapt` on vectorize_layer.
# NOTE(review): text_ds is presumably a text-only view of the training data — confirm.
text_ds = 



Build a model with the following architecture:

- an input layer with `shape=[1]` and `dtype=tf.string`
- your vectorize_layer
- an `Embedding` layer. Set the embedding dimension to 100.
- `GlobalAveragePooling1D`
- one `Dense` layer, with 64 units and `relu`
- a final `Dense` layer with one unit and a `sigmoid`

In [None]:
from tensorflow.keras.layers import Conv1D, Dropout, GlobalAveragePooling1D
# Exercise stub: add the layers in the order given in the text
# (input, vectorize_layer, Embedding, GlobalAveragePooling1D, Dense, Dense).
model = Sequential([

    
])

Compile it with Adam and a learning rate of $10^{-4}$, with `binary_crossentropy` as loss. Try to figure out how to add precision and recall to the metrics.
Train for 3 epochs.

In [None]:
model.summary()

In [None]:
from tensorflow.keras.metrics import Precision, Recall

# Exercise stub: fill in the optimizer (Adam with the requested learning rate),
# the loss (binary_crossentropy) and the metrics (precision and recall).
model.compile(optimizer=,
              loss=,
              metrics=[])


# Train for 3 epochs and evaluate on the validation set after every epoch.
model.fit(train_ds, epochs=3, validation_data=val_ds, verbose=1)

# Rotten Tomatoes

Now, let's try something a bit more complex.

We download the data, if it is not present

In [3]:
datadir = '../data'
# List the data directory in pure Python instead of shelling out to `ls`.
# This works on systems without `ls`, and a missing directory yields an empty
# list instead of the shell's error text being mistaken for file names.
data_root = os.path.expanduser(datadir)
if os.path.isdir(data_root):
    # Hidden entries are skipped, as a plain `ls` would do.
    files = sorted(name for name in os.listdir(data_root) if not name.startswith('.'))
else:
    files = []
files

['cancer_data.csv',
 'cancer_data_uncleaned.csv',
 'clickbait_data.txt',
 'dataset1.csv',
 'dataset2.csv',
 'non_clickbait_data.txt',
 'rotten_tomatoes_movies.csv']

In [None]:
file = 'rotten_tomatoes_movies.csv'
if file not in files:
    url = "https://raw.githubusercontent.com/raoulg/tmoi-ml-20/master/data/rotten_tomatoes_movies.csv"
    # urlopen raises on HTTP errors, so an error page is never stored as data.
    with urllib.request.urlopen(url) as req:
        url_content = req.read()
    target_dir = os.path.expanduser(datadir)
    # The data directory may not exist yet on a fresh checkout.
    os.makedirs(target_dir, exist_ok=True)
    path = os.path.join(target_dir, file)
    # The file is only created once the full payload has been received.
    with open(path, 'wb') as csv_file:
        csv_file.write(url_content)

In [None]:
# Load the full table, then keep only the description and the genre string,
# dropping every row in which either of the two is missing.
path = os.path.join(os.path.expanduser(datadir), file)
data = pd.read_csv(path)
df = data[['movie_info', 'genres']].dropna()
df.head(3)

We have a description of the movie as unstructured text and a set of labels.

Let's check how many different genres we have. Interestingly enough, this is a multilabel dataset, meaning that every movie can belong to multiple labels at once.

In [None]:
def flatten(t):
    """Concatenate the sub-lists of `t` into a single flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat


# Every genre string is a ", "-separated list; the set shows the distinct genres.
genre_lists = [txt.split(", ") for txt in df.genres.values]
set(flatten(genre_lists))

That might be a bit too much. Let's start out with just a subset of the labels. We can always increase the amount of labels to learn.

In [None]:
import re

# Genres to learn. The pattern matches anywhere inside the genre string, so a
# short alternative such as 'Action' is also found inside a longer genre name.
genre_pattern = re.compile('Science Fiction|Romance|Comedy|Action|Art')
# `assign` returns a new frame instead of writing a column into `df` in place.
# `df` was derived from a slice of `data`, and in-place column assignment on
# such a frame can trigger pandas' SettingWithCopyWarning.
df = df.assign(select=df.genres.apply(genre_pattern.findall))

It can be useful to create a one-hot encoding. This way, we can build a model whose final layer has as many units as we have classes.

Another option could be to use a "sparse" loss function, but let's just try this out.

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Features: the description text of every movie.
X = df['movie_info']

# Targets: one 0/1 column per genre found in the 'select' lists.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['select'])
y.shape

In [None]:
y

Now, we want to get rid of every observation that has zero labels.

In [None]:
# Boolean mask that is True for every row with at least one selected genre.
keep = y.sum(axis=1) != 0
# Apply the same mask to texts and labels so they stay aligned.
X, y = X[keep], y[keep]
len(X), len(y)

So, we lost about 6000 movies, but we still have enough to make a model. If you want to experiment, you can add more categories and see if you can still get good results. But first, let us visualize the distribution of the labels.

In [None]:
import matplotlib.pyplot as plt

# Column-wise mean of the label matrix: the fraction of movies carrying each label.
distribution = y.mean(axis=0)
# your plot here

Ok, that's not a uniform distribution. But we have all categories covered. So while we might want to add precision and recall to be sure, this will probably work. We might get into problems if we had one category really under-represented (eg 0.01%)

In [None]:
BATCH = 32         # observations per batch
SIZE = X.shape[0]  # number of movies left after filtering

In [None]:
# Number of label columns in the two-dimensional label matrix.
CLASSES = y.shape[-1]
CLASSES

Same as before:
- generate datasets from tensor slices
- shuffle and batch
- pick a train-test ratio
- create sets with `take` and `skip`
- prefetch with AUTOTUNE

In [None]:
# Exercise stub: same pipeline as in the clickbait section.
# Build the dataset from tensor slices, then shuffle and batch it.
ds = 
ds = 

# Pick the train/test ratio and split with `take` and `skip`.
train_n = 
train_ds = 
val_ds = 

# Prefetch both sets with AUTOTUNE.
AUTOTUNE = 
train_ds = 
val_ds = 

In [None]:
# Peek at one training batch. The loop variables are deliberately not named
# `x` and `y`: binding `y` here would overwrite the label matrix built earlier,
# and re-running the dataset cell afterwards would then use a single batch of
# labels instead of the full matrix.
for text_batch, label_batch in train_ds.take(1):
    print(text_batch)
    print(label_batch)

So, we have a long description of a movie, and multiple genres.

First we set up a `TextVectorization` layer. Pick a sensible size for the `max_tokens` and `output_sequence_length`. If you are unsure of a proper size, test the impact of different sizes.


In [None]:
# Exercise stub: pick a vocabulary size (max_tokens) and a sequence length
# (output_sequence_length) that suit the movie descriptions.
vocab_size = 
sequence_length = 

# Use the text vectorization layer to normalize, split, and map strings to
# integers. The layer can reuse the custom standardization defined above.
# output_sequence_length is set because not all samples have the same length.
vectorize_layer = TextVectorization(




)

Use `adapt` to get the vocabulary.

In [None]:
# Exercise stub: the vocabulary is obtained by calling `adapt` on vectorize_layer.
# NOTE(review): text_ds is presumably a text-only view of the training data — confirm.
text_ds = 


Make a model that has:
- InputLayer
- vectorizelayer
- Embedding of dim 50
- GlobalAveragePooling1D
- Dense with 64 units and a relu
- Dense with as many units as there are classes. Don't use an activation in the last layer.

In [None]:
# Exercise stub: add the layers in the order given in the text; the final Dense
# layer has one unit per class and no activation.
model = Sequential([

    
    
])

Because we didn't use an activation in the last layer, we get "logits" that range over $(-\infty, +\infty)$ instead of values in $[0,1]$, as we would have gotten with a sigmoid activation. Because of this, we have to tell the loss function that `from_logits` needs to be `True`.

Try to increase and decrease the predictions by modifying the numbers below. First, decide if you want to get the loss up or down. Then, modify the prediction. Check if you understand what's happening.

In [None]:
# Two samples with three labels each. y_pred holds raw scores (logits), not
# probabilities; these are the numbers to modify in the exercise.
y_true = [[1, 0, 1], [0, 0, 1]]
y_pred = [[5.0, -10.0, 5], [-5.0, -10, 20]]
# from_logits=True tells the loss that y_pred has not been passed through a sigmoid.
loss = tf.keras.losses.binary_crossentropy(y_true, y_pred, from_logits=True)
# Show the resulting loss as a NumPy array.
loss.numpy()

Compile the function with Adam and binary_crossentropy with logits.

In [None]:
# The model outputs logits, so a prediction is positive when the logit is
# above 0 (sigmoid(0) == 0.5). The 'accuracy' shortcut would threshold the raw
# logits at 0.5 and under-report accuracy, hence the explicit threshold. The
# metric keeps the name 'accuracy' so the training logs look the same.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

Train for 20 epochs.

In [None]:
# Train for 20 epochs, evaluating on the validation set after each one.
model.fit(train_ds, epochs=20, validation_data=val_ds)

Let's check the model. We grab the first two texts from our validation dataset.

In [None]:
# Iterate over a single validation batch and print its first two texts.
# `text` and `label` stay bound after the loop and are used by the cells below.
for text, label in val_ds.take(1):
    print(text[:2])

Use the model to predict.

In [None]:
model.predict(text[:2])

Check the original label

In [None]:
label[:2]

In [5]:
mlb.inverse_transform(label[:2].numpy())
# note that "Science Fiction" is actually "Science Fiction & Fantasy"

To check for ourselves, we can use `inverse_transform` from the `mlb`. It is interesting how the model actually adds something to the original binary labels. While both examples might correctly be predicted to be a comedy, the model tells us that it is much more clear from the text that the second one is a comedy (e.g. with values of 4 versus 12). Also, for a single movie, it can tell you which labels seem to be more likely or dominant. Try some more examples for yourself.

Now, create an architecture with an RNN. Use the following:
- an `Input` layer
- your `vectorize_layer`
- an `Embedding` layer
- a type of RNN. Try `GRU` first, with 16 units.
- A final `Dense` layer, without an activation

In [None]:
from tensorflow.keras.layers import LSTM, GRU

# Exercise stub: add the layers in the order given in the text
# (Input, vectorize_layer, Embedding, a recurrent layer, Dense without activation).
model = Sequential([

    
    
])

model.summary()

In [None]:
from tensorflow.keras.optimizers import Adam

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
# The model outputs logits, so a prediction is positive when the logit is
# above 0 (sigmoid(0) == 0.5). The 'accuracy' shortcut would threshold the raw
# logits at 0.5 and under-report accuracy, hence the explicit threshold.
model.compile(optimizer=Adam(learning_rate=1e-3),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [None]:
# Train the recurrent model for 20 epochs with per-epoch validation and progress output.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    verbose=1,
)

Which one is better? 
What do you hypothesize is happening?
Discuss with other students and the teacher.