In [None]:
#%pip uninstall protobuf -y
#%pip install protobuf==3.20.1

# After running the above commands, please restart the kernel BEFORE running the import statements below.

# conda install pandas
#%pip install tensorflow-macos
#%pip install tensorflow-metal
#%pip install tensorflow-datasets
#%pip install tensorflow
#%pip install scikit-learn
import pandas                      as pd
import tensorflow                  as tf  # type: ignore
from   tensorflow              import keras # type: ignore
from   sklearn.model_selection import train_test_split

# Seed Python's `random`, NumPy and TensorFlow in a single call.
# Seeding TensorFlow alone leaves the other two generators unseeded, so any
# code that draws from them could still vary between runs.
keras.utils.set_random_seed(42)

## IMDB movie reviews

## Retrieving and preparing the Data

We will work with the IMDb movie reviews data.

In [None]:
# question 1
# Load the IMDB reviews CSV into "data". No index column is set, so the
# frame keeps the default index.
csv_path = 'files/IMDB Dataset.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows to check that the file loaded as expected.
data.head(n=5)

In [None]:
# question 2
# Encode the sentiment labels as integers: "positive" -> 1, anything else -> 0.
# Matching on both the raw string and the already-encoded value 1 keeps this
# cell idempotent: re-running it leaves existing 0/1 labels unchanged instead
# of collapsing every row to 0.
data['sentiment'] = data['sentiment'].isin(['positive', 1]).astype('int64')

In [None]:
# question 3
# The dependent variable (target) is the encoded sentiment column.
target_column = 'sentiment'
y = data[target_column]
# Show the first ten labels as a quick sanity check.
print(y.head(10))

In [None]:
# question 4
# Hold out 20% of the reviews for testing. The features are the raw review
# texts (data['review']); random_state=42 fixes the shuffle used for the split.
X_train, X_test, y_train, y_test = train_test_split(
    data['review'],
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report how many reviews ended up in each split.
n_train = X_train.shape[0]
n_test = X_test.shape[0]
print(f"""
Train samples: {n_train}
Test samples: {n_test}
"""
)

In [None]:
# Display the training labels returned by the train/test split.
y_train

Inspect the frequency of each sentiment in the training dataset (it is balanced!)

In [None]:
# question 5
# Relative frequency of each class in the training labels: per-class counts
# divided by the total number of training samples, stored in "frequency".
class_counts = y_train.value_counts()
frequency = class_counts / y_train.shape[0]
print(frequency)

In [None]:
# cell 6
# Let's turn the target into a dummy (one-hot) vector:
# column 0 = negative (label 0), column 1 = positive (label 1).
# The categories are declared explicitly so both splits always get the same
# two columns in the same order; plain get_dummies derives its columns from
# whichever values happen to be present in each split.
# The ndim guard makes the cell safe to re-run: once a split has been encoded
# it is a 2-D array and is left untouched rather than being encoded again.
class_labels = [0, 1]
if y_train.ndim == 1:
    y_train = pd.get_dummies(pd.Categorical(y_train, categories=class_labels)).to_numpy()
if y_test.ndim == 1:
    y_test = pd.get_dummies(pd.Categorical(y_test, categories=class_labels)).to_numpy()

In [None]:
# Check the shape of the one-hot encoded training targets.
y_train.shape

## Unigram Multi-hot Encoding Baseline

Next, let us see the performance of a neural net that is trained from scratch using multi-hot encoding.

In [None]:
from tensorflow.keras.layers import TextVectorization # type: ignore

# Cap the vocabulary at 2412 tokens and configure the text vectorization
# layer to produce multi-hot encoded output.
max_tokens = 2412
text_vectorization = TextVectorization(
    max_tokens=max_tokens,
    output_mode='multi_hot',
)

In [None]:
# The vocabulary that will be indexed is given by the text corpus on our train dataset.
# Only the training reviews are passed to adapt(); the test reviews are not used here.
text_vectorization.adapt(X_train)

In [None]:
# Question 7
# Run the review text of both splits through the adapted vectorization layer.
# Note: X_train and X_test are replaced by their vectorized versions here.
X_train, X_test = text_vectorization(X_train), text_vectorization(X_test)

In [None]:
# Question 8
# Baseline network: one hidden Dense layer with 32 ReLU units, dropout with
# rate 0.5, and a 2-unit softmax output layer (one unit per sentiment class).
# The input width equals max_tokens, matching the multi-hot review vectors.
inputs = keras.Input(shape=(max_tokens,))
x = keras.layers.Dense(units=32, activation="relu")(inputs)
x = keras.layers.Dropout(rate=0.5)(x)
outputs = keras.layers.Dense(units=2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

In [None]:
# Compile your model
# Adam optimizer with categorical cross-entropy loss (the targets are one-hot
# encoded), reporting accuracy as the metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Fit model
# The one-hot encoded targets are cast to float32 for both training and
# validation; the test split doubles as the validation data here.
train_targets = y_train.astype('float32')
test_targets = y_test.astype('float32')
model.fit(
    X_train,
    train_targets,
    validation_data=(X_test, test_targets),
    epochs=5,
)


In [None]:
# Question 9
# Evaluate on the test split, again with the one-hot targets cast to float32.
# You should be able to get your model to 85% at this point.
# Index 1 of the result is taken as the accuracy (the model is compiled with
# 'accuracy' as its only metric, after the loss).
test_metrics = model.evaluate(x=X_test, y=y_test.astype('float32'))
test_metrics[1] > 0.85

## Extend Baseline Model

Let's create more complex models to increase the accuracy on our test sample. Try combining different models by changing:
- Number of hidden units
- Adding another hidden layer.
- Changing the number of epochs.
- Using bigrams instead of unigrams.

To guide your search for the best parameters, note how the accuracy changes on both train and test data.

In [None]:
# Wider variant of the baseline: same layout (Dense -> Dropout -> softmax)
# but with a larger hidden layer. The hidden width is taken from
# `hidden_units` so that changing the constant actually changes the model.
# NOTE: this cell only builds the model; it still has to be compiled and
# fitted before it can be evaluated.
embedding_dim = 64   # not used by this model; kept in case other cells reference it
hidden_units  = 128  # number of units in the single hidden layer
num_classes   = y_train.shape[1]  # Use y_train, which is one-hot encoded
model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(max_tokens,)),  # Use max_tokens as input shape
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [None]:
# CIFAR-100 / ResNet50 training run, unrelated to the IMDB sentiment data.
# `tf` comes from the import cell at the top of the notebook.
# Every data/model name carries a `cifar_` prefix so that running this cell
# does not overwrite the IMDB `y_train`, `y_test` and `model` defined earlier.
cifar = tf.keras.datasets.cifar100
(cifar_x_train, cifar_y_train), (cifar_x_test, cifar_y_test) = cifar.load_data()

# ResNet50 built without pretrained weights (weights=None), sized for
# 32x32 RGB inputs and 100 output classes.
cifar_model = tf.keras.applications.ResNet50(
    include_top=True,
    weights=None,
    input_shape=(32, 32, 3),
    classes=100,
)

# Sparse categorical cross-entropy is used with the labels exactly as loaded
# (no one-hot encoding is applied here); from_logits=False tells the loss to
# expect probabilities rather than raw logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
cifar_model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])
cifar_model.fit(cifar_x_train, cifar_y_train, epochs=5, batch_size=64)