# Welcome !

> Let's see what we can do together. Let's take a look at some sentiment analysis.

***To detect good sentences from bad we will use a labeled dataset of 3000 sentences.***
<br>
<img src="https://images7.alphacoders.com/132/1325363.png" alt="Mario Star" width="500"/>

### What is it?
Let's say we have, for example, three sentences: **"I love this cat"**, **"I love this dog"** and **"I hate the movie"**.
<br>
We can see that the first two sentences are positive and the last one is negative. Therefore, we can say:
**I love this cat** is **1** and **I hate the movie** is **0**.
<br>
We deal with 2 emotions: **positive** and **negative**. We can also say that we work in a one-dimensional space: **0** to **1**.


> When you see a dragon ball, it means it's your time to shine and to code!


In [None]:
import os
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

# 1. Take a look at the data

In [None]:
def read_data() -> list:
    """Load the labeled sentences from every ``.txt`` file in ``./dataset``.

    Each non-blank line is stripped and split on tab characters, so a line of
    the form ``<sentence><TAB><label>`` becomes ``[sentence, label]``. Blank
    lines are skipped: they would otherwise yield ``['']`` rows that carry no
    label.

    :return: one list of tab-separated fields per non-blank line
    :raises FileNotFoundError: if there is no ``dataset`` entry in the
        current working directory
    """
    path: str = os.path.join(os.getcwd(), 'dataset')
    if not os.path.exists(path):
        raise FileNotFoundError('Dataset not found')
    labeled_sentences: list = []
    # Sort so the row order does not depend on the OS directory listing order.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.txt'):
            continue
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if stripped:
                    labeled_sentences.append(stripped.split('\t'))
    return labeled_sentences

In [None]:
def get_dataframe(labeled_sentences: list) -> pd.DataFrame:
    """Wrap the parsed rows in a two-column DataFrame.

    :param labeled_sentences: rows of ``[sentence, label]`` pairs
    :return: DataFrame with the columns ``sentence`` and ``label``
    """
    column_names = ['sentence', 'label']
    return pd.DataFrame(data=labeled_sentences, columns=column_names)

In [None]:
# Parse the raw files, tabulate the rows, and preview the first entries.
df = get_dataframe(labeled_sentences=read_data())
df.head()

# Preprocessing

In [None]:
def preprocess_data():
    """Split the module-level ``df`` into two parallel Python lists.

    :return: tuple ``(sentences, labels)`` holding the values of the
        ``sentence`` and ``label`` columns, in row order
    """
    return df['sentence'].values.tolist(), df['label'].values.tolist()

In [None]:
# Unpack both columns and show the first sentence together with its label.
sentences, labels = preprocess_data()
print(sentences[:1], labels[:1], sep='\n')

# Tokenization and Padding

**Let's code!** <img src="https://i.pinimg.com/originals/88/3c/ac/883cacfc0a39afa24693ac441e5bdbec.png" width="200"/> **Let's feed the dragon!**

Use the `tf.keras.preprocessing.text.Tokenizer` to tokenize the sentences.
Then, we want to fit the tokenizer on the sentences.

In [None]:
def tokenize(sentences: list) -> tf.keras.preprocessing.text.Tokenizer:
    """Exercise stub: build a tokenizer for ``sentences``.

    TODO: create a ``tf.keras.preprocessing.text.Tokenizer``, fit it on
    ``sentences`` and return it. Until this is implemented the function
    returns ``None``.

    :param sentences: the sentences to fit the tokenizer on
    :return: the fitted tokenizer (once implemented)
    """
    pass

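We want to pad the sentences to the same length. We will use the midpoint between the 99th and the 100th percentile of the sentence lengths (counted in words), rounded down, so that the chosen length is close to the longest sentence.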

In [None]:
def padding() -> int:
    """Pick the common sequence length used for padding.

    Reads the module-level ``sentences``, counts the whitespace-separated
    words of each one, and returns the floor of the midpoint between the 99th
    and the 100th percentile of those counts.

    :return: padding length as an integer
    """
    word_counts = [len(text.split()) for text in sentences]
    p99, p100 = (np.percentile(word_counts, q) for q in (99, 100))
    midpoint = (p100 + p99) // 2
    return int(midpoint)

def create_dataset_from_tokenizer(tokenizer: tf.keras.preprocessing.text.Tokenizer, pad: Optional[int] = None) -> tf.data.Dataset:
    """Turn the module-level ``sentences`` and ``labels`` into a tensor dataset.

    :param tokenizer: tokenizer used to convert the sentences to integer
        sequences
    :param pad: length every sequence is padded/truncated to; ``None`` lets
        ``pad_sequences`` use the length of the longest sequence
    :return: dataset of ``(padded_sequence, label)`` pairs with the labels
        parsed to ``float32``
    """
    sequences = tokenizer.texts_to_sequences(sentences)
    # pad_sequences already returns a NumPy array, so no extra conversion is needed.
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=pad)
    # The labels are strings at this point; parse them into numbers.
    labels_float = tf.strings.to_number(labels, out_type=tf.float32)
    return tf.data.Dataset.from_tensor_slices((padded_sequences, labels_float))

# Now, let's use the above functions to create our consumable dataset: our AI cannot eat "strings" or "lists" but it can eat "tensors"!

To do so:
(1) We tokenize the sentences
(2) We pad the sentences
(3) We create a dataset from the tokenizer
(4) We shuffle the dataset (`dataset.shuffle`)
(5) We split the dataset into 3 parts: train, validation and test

In [None]:
# Data Splitting!

# TODO: (1) tokenize the sentences

# TODO: (2) pad the sentences

# TODO: (3) create a dataset from the tokenizer
# dataset = 
# TODO: (4) shuffle the dataset

# TODO: (5) split the dataset into 3 parts: train, validation and test
# -> train: 80%, validation: 10%, test: 10% for example

# train_dataset =
# val_dataset =
# test_dataset =

In [None]:
# Count the elements of each split by iterating over it once.
print(f'Train size: {sum(1 for _ in train_dataset)}')
print(f'Validation size: {sum(1 for _ in val_dataset)}')
print(f'Test size: {sum(1 for _ in test_dataset)}')

# Hyperparameters

We will use a batch size of 64 and 10 epochs.
Batch the datasets: `dataset.batch(batch_size)`

In [None]:
# TODO: Add the hyperparameters: batch_size and epochs

# batch the datasets
# train_dataset = 
# val_dataset = 
# test_dataset = 

In [None]:
# Count the elements again; assuming the datasets were batched in the cell
# above, each element is now one batch, so these are numbers of batches.
print(f'Train size: {sum(1 for _ in train_dataset)}')
print(f'Validation size: {sum(1 for _ in val_dataset)}')
print(f'Test size: {sum(1 for _ in test_dataset)}')

# Configuring the model

In [None]:
# Directory layout: logs/ receives the TensorBoard events, logs/data/ the checkpoints.
logs_dir = os.path.join(os.getcwd(), 'logs')
data_dir = os.path.join(logs_dir, 'data')
# makedirs creates logs/ and logs/data/ in one call and does nothing when
# they already exist, so re-running the cell is safe.
os.makedirs(data_dir, exist_ok=True)

# save the data for later use, callbacks
best_model_path = os.path.join(data_dir, 'best_model')
# Keep only the weights of the epoch with the best validation accuracy.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    best_model_path,
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=True,
)
# Stop training once validation accuracy has not improved for 5 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=logs_dir)
callbacks = [checkpoint, early_stopping, tensorboard]

# Model

We will use a `Bidirectional LSTM` with `64` units and a `Dense` layer with `64` units.
Here is the model summary:
- Embedding: `len(tokenizer.word_index) + 1`, `pad` (padding)
- Bidirectional LSTM: `pad`
- Dense: `64`, `relu`
- Dense: `1`, `sigmoid` (output)

We will use the `binary_crossentropy` loss function and the `adam` optimizer to compile the model.
You can observe the metrics: `accuracy`.

In [None]:
def create_model() -> tf.keras.models.Sequential:
    """Exercise stub: build and compile the sentiment model.

    TODO: implement. The notebook text asks for an Embedding layer, a
    Bidirectional LSTM, a Dense layer with 64 units and ``relu`` activation,
    and a Dense output layer with 1 unit and ``sigmoid`` activation, compiled
    with the ``binary_crossentropy`` loss, the ``adam`` optimizer and the
    ``accuracy`` metric. Until this is implemented the function returns
    ``None``.

    :return: the compiled model (once implemented)
    """
    pass

In [None]:
# Build the model and print its layer-by-layer summary.
model = create_model()
model.summary()

# Training

We will train the model on the train dataset and validate it on the validation dataset.
Don't forget to use the callbacks!
`history = model.fit(...)`

In [None]:
# history = 

# Evaluation

In [None]:
def evaluate_from_best():
    """
    Evaluate the best checkpoint on the test dataset.

    Loads the weights stored under ``logs/data/best_model`` into the
    module-level ``model``, evaluates it on ``test_dataset`` and prints the
    resulting loss and accuracy.

    :return: loss, accuracy
    """
    best_model_path = os.path.join(os.getcwd(), 'logs', 'data', 'best_model')
    model.load_weights(best_model_path)
    # No batch_size here: test_dataset is expected to be an already-batched
    # tf.data.Dataset, and Keras asks not to pass batch_size for dataset inputs.
    loss, accuracy = model.evaluate(test_dataset)
    print(f'Loss: {loss}, Accuracy: {accuracy}')
    # Return the metrics as the docstring promises, so callers can reuse them.
    return loss, accuracy

evaluate_from_best()

In [None]:
# save plot in plots/ directory
plots_dir = os.path.join(os.getcwd(), 'plots')
if not os.path.exists(plots_dir):
    os.mkdir(plots_dir)
accuracy_plot_path = os.path.join(plots_dir, 'accuracy_plot.png')
loss_plot_path = os.path.join(plots_dir, 'loss_plot.png')

plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0, 1])
plt.legend(loc='lower right')
plt.savefig(accuracy_plot_path)
plt.show()

plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.title('Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.ylim([0, 1])
plt.legend(loc='upper right')
plt.savefig(loss_plot_path)
plt.show()

In [None]:
def predict(sentence: str, maxlen: Optional[int] = None) -> list:
    """
    Classify a sentence with the module-level ``tokenizer`` and ``model``.

    :param sentence: sentence to predict
    :param maxlen: length the token sequence is padded/truncated to; pass the
        padding length used for training so the input matches the training
        data. ``None`` keeps the sequence at its own length.
    :return: list with one entry per model output row: 1 when the output is
        above 0.5, otherwise 0
    :raises TypeError: if ``sentence`` is not a string
    """
    if not isinstance(sentence, str):
        raise TypeError('Sentence must be a string')
    sequences = tokenizer.texts_to_sequences([sentence])
    # Rely on pad_sequences' default padding side, exactly as
    # create_dataset_from_tokenizer does for the training data.
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
    probabilities = model.predict(padded_sequences)
    return [1 if p > 0.5 else 0 for p in probabilities]

In [None]:
# Run the classifier on one positive and then one negative example.
for sentence in ('I love you', 'I won\'t go there!'):
    prediction = predict(sentence)
    print(f"Sentence: {sentence} \nPrediction: {':) UwU'  if prediction[0] == 1 else ':('}")