<a href="https://colab.research.google.com/github/hnnayy/DeepLearning/blob/main/week7%20/WMT14/Tensorflow_WMT14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from datasets import load_dataset
import numpy as np
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, Bidirectional, Embedding, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import seaborn as sns
import io
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
import pandas as pd
import keras_tuner as kt

# Download necessary nltk resources
nltk.download('punkt')

MAX_LEN = 50
BATCH_SIZE = 64

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Step 2: Define the data preprocessing function (assuming data is loaded in de_val, en_val, de_tokenizer, en_tokenizer)
def load_and_preprocess_data():
    print("Memuat dataset...")
    dataset = load_dataset("wmt14", "de-en")

    # Mengambil 5% data dari train dan validation set
    train_dataset = dataset['train'].select(range(int(len(dataset['train']) * 0.05)))
    val_dataset = dataset['validation'].select(range(int(len(dataset['validation']) * 0.05)))

    # Preprocessing: Tokenisasi dan Padding
    def tokenize_and_pad(texts, tokenizer=None, max_len=MAX_LEN):
        if tokenizer is None:
            tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
            tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)
        padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
        return padded_sequences, tokenizer

    # Prepare the data
    de_sentences = [example['translation']['de'] for example in train_dataset]
    en_sentences = [example['translation']['en'] for example in train_dataset]

    # Tokenization
    de_train, de_tokenizer = tokenize_and_pad(de_sentences, max_len=MAX_LEN)
    en_train, en_tokenizer = tokenize_and_pad(en_sentences, max_len=MAX_LEN)

    # Start token ID for decoder input
    start_token_id = en_tokenizer.word_index.get('<start>', 1)
    en_train_shifted = generate_decoder_input(en_train, start_token_id)

    # Validation data
    de_val_sentences = [example['translation']['de'] for example in val_dataset]
    en_val_sentences = [example['translation']['en'] for example in val_dataset]

    de_val, _ = tokenize_and_pad(de_val_sentences, tokenizer=de_tokenizer, max_len=MAX_LEN)
    en_val, _ = tokenize_and_pad(en_val_sentences, tokenizer=en_tokenizer, max_len=MAX_LEN)
    en_val_shifted = generate_decoder_input(en_val, start_token_id)

    return de_train, en_train, de_tokenizer, en_tokenizer, en_train_shifted, de_val, en_val, en_val_shifted, train_dataset, val_dataset


In [8]:
# Function to generate decoder input by shifting target data (English sentences)
def generate_decoder_input(target_data, start_token_id):
    decoder_input = np.zeros_like(target_data)  # Create a zero matrix with the same shape as target_data
    decoder_input[:, 1:] = target_data[:, :-1]  # Shift the target data to the right
    decoder_input[:, 0] = start_token_id  # Set the first token as the start token
    return decoder_input

In [9]:
# Step 3: Build the model
def build_model(embedding_dim=256, hidden_units=512, dropout_rate=0.2, cell_type='lstm', bidirectional=False, optimizer='adam', learning_rate=0.001):
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(input_dim=len(de_tokenizer.word_index)+1, output_dim=embedding_dim)(encoder_inputs)

    if bidirectional:
        if cell_type == 'lstm':
            encoder_rnn = Bidirectional(LSTM(hidden_units, return_state=True))
            encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_rnn(encoder_embedding)
            state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
            state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
            encoder_states = [state_h, state_c]
        else:  # GRU
            encoder_rnn = Bidirectional(GRU(hidden_units, return_state=True))
            encoder_outputs, forward_h, backward_h = encoder_rnn(encoder_embedding)
            state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
            encoder_states = [state_h]
    else:
        if cell_type == 'lstm':
            encoder_rnn = LSTM(hidden_units, return_state=True)
            encoder_outputs, state_h, state_c = encoder_rnn(encoder_embedding)
            encoder_states = [state_h, state_c]
        else:  # GRU
            encoder_rnn = GRU(hidden_units, return_state=True)
            encoder_outputs, state_h = encoder_rnn(encoder_embedding)
            encoder_states = [state_h]

    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(input_dim=len(en_tokenizer.word_index) + 1, output_dim=embedding_dim)(decoder_inputs)
    decoder_embedding = Dropout(dropout_rate)(decoder_embedding)

    hidden_units_decoder = hidden_units * 2 if bidirectional else hidden_units
    if cell_type == 'lstm':
        decoder_rnn = LSTM(hidden_units_decoder, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = decoder_rnn(decoder_embedding, initial_state=encoder_states)
    else:  # GRU
        decoder_rnn = GRU(hidden_units_decoder, return_sequences=True, return_state=True)
        decoder_outputs, _ = decoder_rnn(decoder_embedding, initial_state=encoder_states)

    decoder_outputs = Dropout(dropout_rate)(decoder_outputs)
    decoder_dense = Dense(len(en_tokenizer.word_index) + 1, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    if optimizer == 'adam':
        opt = Adam(learning_rate=learning_rate)
    else:
        opt = optimizer

    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [10]:
# 3. **Training the Model**
def train_model(model, de_train, en_train_shifted, en_train, de_val, en_val_shifted, en_val):
    callbacks = [
        ModelCheckpoint("model_checkpoint.weights.h5", monitor='val_loss', save_best_only=True, save_weights_only=True),
        EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)
    ]

    history = model.fit(
        [de_train, en_train_shifted],
        np.expand_dims(en_train, -1),
        validation_data=([de_val, en_val_shifted], np.expand_dims(en_val, -1)),
        batch_size=BATCH_SIZE,
        epochs=1,
        callbacks=callbacks
    )

    return history


In [11]:
def evaluate_classification_metrics(model, de_val, en_val_shifted, en_val):
    # Make predictions
    predictions = model.predict([de_val, en_val_shifted], batch_size=BATCH_SIZE)

    # Flatten y_true and y_pred for evaluation
    y_true = en_val.flatten()  # True labels
    y_pred = np.argmax(predictions.reshape(-1, predictions.shape[-1]), axis=1)  # Predicted labels

    # Check if the length of predictions and true labels match
    print(f"Predictions shape: {y_pred.shape}, True labels shape: {y_true.shape}")

    # Remove padding (0) from both true labels and predictions (Assumes padding is 0)
    mask = y_true > 0  # Masking the padding
    y_true = y_true[mask]
    y_pred = y_pred[mask]

    # Print the shapes of y_true and y_pred after masking
    print(f"Shape of y_true after masking: {y_true.shape}")
    print(f"Shape of y_pred after masking: {y_pred.shape}")

    # Check for length mismatch after masking
    if len(y_true) != len(y_pred):
        print(f"Warning: Mismatch in length of true labels and predicted labels. y_true: {len(y_true)}, y_pred: {len(y_pred)}")

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    # ROC AUC - Ensure that predictions are reshaped properly
    try:
        auc = roc_auc_score(y_true, predictions.reshape(-1, predictions.shape[-1]), multi_class='ovr', average='weighted')
    except ValueError:
        auc = 0.0  # Handle case where AUC cannot be computed

    # Calculate ROC curve values for visualization
    fpr, tpr, _ = roc_curve(y_true, predictions.reshape(-1, predictions.shape[-1]), pos_label=1)



In [12]:
def plot_metrics(accuracy, precision, recall, f1, auc, fpr, tpr):
    # Plot Evaluation Metrics
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']
    values = [accuracy, precision, recall, f1, auc]

    plt.figure(figsize=(10, 6))
    sns.barplot(x=metrics, y=values)
    plt.title("Evaluation Metrics")
    plt.ylim(0, 1)
    plt.show()

    # ROC Curve
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', label="ROC curve (area = %0.2f)" % auc)
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc='lower right')
    plt.show()


In [13]:
# 5. **Hyperparameter Tuning**
def run_hyperparameter_tuning(de_train, en_train_shifted, en_train, de_val, en_val_shifted, en_val):
    hypermodel = NMTHyperModel()

    tuner = kt.RandomSearch(
        hypermodel,
        objective='val_loss',
        max_trials=5,  # Try a smaller number of trials for testing
        directory='nmt_tuning',
        project_name='de_en_translation'
    )

    tuner.search(
        [de_train, en_train_shifted],
        np.expand_dims(en_train, -1),
        validation_data=([de_val, en_val_shifted], np.expand_dims(en_val, -1)),
        epochs=5
    )

    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

    best_model = build_model(
        embedding_dim=best_hps.get('embedding_dim'),
        hidden_units=best_hps.get('hidden_units'),
        dropout_rate=best_hps.get('dropout_rate'),
        cell_type=best_hps.get('cell_type'),
        bidirectional=best_hps.get('bidirectional'),
        learning_rate=best_hps.get('learning_rate')
    )

    return best_model, best_hps

In [14]:
# Correct the unpacking of values to match the 9 returned values
de_train, en_train, de_tokenizer, en_tokenizer, en_train_shifted, de_val, en_val, en_val_shifted, train_dataset, val_dataset = load_and_preprocess_data()
print("Data preprocessing completed.")


Memuat dataset...
Data preprocessing completed.


In [15]:
model = build_model()


In [16]:
history = train_model(model, de_train, en_train_shifted, en_train, de_val, en_val_shifted, en_val)


[1m3523/3523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1248s[0m 353ms/step - accuracy: 0.5682 - loss: 3.1027 - val_accuracy: 0.7192 - val_loss: 2.0719 - learning_rate: 0.0010


In [24]:
def evaluate_classification_metrics(model, de_val, en_val_shifted, en_val):
    # Make predictions
    predictions = model.predict([de_val, en_val_shifted], batch_size=BATCH_SIZE)

    # Flatten y_true and y_pred for evaluation
    y_true = en_val.flatten()  # True labels
    y_pred = np.argmax(predictions.reshape(-1, predictions.shape[-1]), axis=1)  # Predicted labels

    # Remove padding (0) from both true labels and predictions (Assumes padding is 0)
    mask = y_true > 0  # Masking the padding (only keep valid tokens)
    y_true = y_true[mask]
    y_pred = y_pred[mask]

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    try:
        auc = roc_auc_score(y_true, predictions.reshape(-1, predictions.shape[-1]), multi_class='ovr', average='weighted')
    except ValueError:
        auc = 0.0  # Handle case where AUC cannot be computed

    # Print only the metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")

    return accuracy, precision, recall, f1, auc


In [27]:
# Define the function to calculate and print metrics
def evaluate_classification_metrics(model, de_val, en_val_shifted, en_val):
    # Make predictions
    predictions = model.predict([de_val, en_val_shifted], batch_size=BATCH_SIZE)

    # Flatten y_true and y_pred for evaluation
    y_true = en_val.flatten()  # True labels
    y_pred = np.argmax(predictions.reshape(-1, predictions.shape[-1]), axis=1)  # Predicted labels

    # Remove padding (0) from both true labels and predictions (Assumes padding is 0)
    mask = y_true > 0  # Masking the padding (only keep valid tokens)
    y_true = y_true[mask]
    y_pred = y_pred[mask]

    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)

    # Calculate precision
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)

    # Calculate recall
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)

    # Calculate F1 score
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    # ROC AUC - Ensure that predictions are reshaped properly
    try:
        auc = roc_auc_score(y_true, predictions.reshape(-1, predictions.shape[-1]), multi_class='ovr', average='weighted')
    except ValueError:
        auc = 0.0  # Handle case where AUC cannot be computed

    # Print the metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")

    # Return all metrics
    return accuracy, precision, recall, f1, auc


In [30]:
# Now call the function and unpack all 5 returned values
accuracy, precision, recall, f1, auc = evaluate_classification_metrics(model, de_val, en_val_shifted, en_val)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 572ms/step
Accuracy: 0.2042
Precision: 0.1274
Recall: 0.2042
F1 Score: 0.1364
AUC: 0.0000


# Cara Kerja Kode

Berikut adalah penjelasan tentang bagaimana kode ini bekerja, termasuk dataset yang digunakan dan proses-proses penting di dalamnya:

## 1. Dataset

* **Dataset yang digunakan**: Dataset **WMT14 (German-English)** dari HuggingFace, yang digunakan untuk tugas **machine translation** (penerjemahan bahasa) dari bahasa Jerman (German) ke bahasa Inggris (English). Dataset ini berisi pasangan kalimat dalam dua bahasa tersebut.
* Dataset ini dibagi menjadi dua bagian utama: **train** dan **validation**. Kita menggunakan **5% data** dari kedua bagian ini untuk pelatihan dan validasi.

## 2. Data Preprocessing

* **Tokenisasi dan Padding**: Pada tahap ini, kalimat dalam bahasa Jerman (de) dan bahasa Inggris (en) diproses dengan **tokenisasi** menggunakan `Tokenizer` dari Keras, yang mengubah kalimat menjadi urutan angka berdasarkan kata yang ada.
* **Padding**: Tokenisasi mengubah kalimat menjadi urutan angka, tetapi panjang kalimat dapat bervariasi. Untuk menyamakan panjangnya, **padding** diterapkan agar setiap urutan memiliki panjang yang konsisten.
* **Generator Decoder Input**: Pada model seq2seq (sequence-to-sequence), perlu ada input khusus untuk decoder, yang disiapkan dengan **menggeser** urutan target (dalam hal ini kalimat dalam bahasa Inggris) sehingga model dapat memprediksi kata berikutnya dalam kalimat target.

## 3. Model Building (Seq2Seq Model)

* **Encoder-Decoder Architecture**: Model ini menggunakan arsitektur **Encoder-Decoder** yang terdiri dari dua bagian:
   * **Encoder**: Mengambil urutan input (kalimat dalam bahasa Jerman) dan mengubahnya menjadi representasi vektor yang mengandung informasi penting.
   * **Decoder**: Menggunakan representasi vektor dari encoder untuk menghasilkan urutan output (kalimat dalam bahasa Inggris).
* **Bidirectional LSTM**: LSTM (Long Short-Term Memory) digunakan untuk memproses urutan data, dan model ini menggunakan **Bidirectional LSTM** yang memungkinkan pemodelan informasi dari kedua arah (kiri ke kanan dan kanan ke kiri).

## 4. Training

* **Training** dilakukan dengan menggunakan **data pelatihan (train data)** yang telah diproses sebelumnya dan **data validasi (validation data)** untuk memantau performa selama pelatihan.
* **Early Stopping** dan **Model Checkpoint** digunakan untuk menghindari overfitting dan menyimpan model terbaik selama pelatihan.

## 5. Evaluation

Setelah model dilatih, **metrik evaluasi** dihitung menggunakan data validasi:
* **Accuracy**: Mengukur berapa banyak prediksi yang benar dibandingkan dengan total prediksi.
* **Precision**: Mengukur ketepatan prediksi positif (berkaitan dengan berapa banyak prediksi positif yang benar).

$$\text{Precision} = \frac{TP}{TP + FP}$$

* **Recall**: Mengukur sejauh mana model dapat menangkap semua contoh positif.

$$\text{Recall} = \frac{TP}{TP + FN}$$

* **F1-Score**: Rata-rata harmonis dari Precision dan Recall, digunakan untuk menilai performa model secara keseluruhan.

$$\text{F1-Score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}$$

* **AUC (Area Under the Curve)**: Ukuran kinerja model klasifikasi. AUC yang lebih tinggi menunjukkan bahwa model lebih baik dalam membedakan antara kelas positif dan negatif.

## 6. Output Evaluasi

Hasil dari metrik-metrik tersebut adalah sebagai berikut:
* **Accuracy**: 0.2042
* **Precision**: 0.1274
* **Recall**: 0.2042
* **F1 Score**: 0.1364
* **AUC**: 0.0000

## Kesimpulan

* **Model yang digunakan saat ini menunjukkan hasil evaluasi yang sangat rendah**:
   * **Accuracy**: 20.42% – Model hanya benar dalam 1 dari 5 prediksi.
   * **Precision**: 12.74% – Model sering memprediksi positif secara keliru.
   * **Recall**: 20.42% – Model gagal menangkap sebagian besar prediksi positif yang benar.
   * **F1 Score**: 13.64% – Rata-rata harmonis antara precision dan recall yang rendah.
   * **AUC**: 0.0000 – Model tidak dapat membedakan antara kelas positif dan negatif.