In [None]:
# Mount Google Drive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')
# Work from the Drive folder; the relative paths used by later cells
# (train.csv, dev.csv, model, submission.zip) resolve against this directory.
%cd /content/drive/MyDrive/Colab

Mounted at /content/drive
/content/drive/MyDrive/Colab


In [None]:
!pip install datasets
!pip install transformers
!pip install tensorflow
!pip install tensorflow-addons

In [None]:
import argparse
import datasets
import pandas
import transformers
import tensorflow as tf
import numpy as np
import tensorflow_addons as tfa
from transformers import TFDistilBertModel


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(examples):
    """Run the DistilBERT tokenizer over the "text" field of `examples`.

    Every text is truncated or padded to exactly 64 tokens and the result is
    returned as TensorFlow tensors. The returned encoding includes
    "input_ids": the integer vocabulary ids of the tokens in each text.
    """
    # Fixed-length settings: cut long texts, pad short ones up to 64 tokens.
    encoding_options = {
        "truncation": True,
        "max_length": 64,
        "padding": "max_length",
        "return_tensors": "tf",
    }
    texts = examples["text"]
    return tokenizer(texts, **encoding_options)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def create_model(num_labels=7, max_length=64, learning_rate=5e-6, weight_decay=0.01):
    """Build and compile a DistilBERT-based multi-label classifier.

    The model takes a single integer input named 'input_ids' of shape
    (max_length,) and outputs `num_labels` independent sigmoid probabilities
    computed from the hidden state of the first ([CLS]) token.

    Args:
        num_labels: Number of output labels (size of the sigmoid layer).
        max_length: Length of the padded token sequences fed to the model.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.

    Returns:
        A compiled `tf.keras.Model` using binary cross-entropy loss and a
        micro-averaged F1 metric thresholded at 0.5.
    """
    # Load the pretrained DistilBERT encoder
    distilbert = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

    # Define the input layer
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')

    # Inputs are padded to a fixed length, so derive the attention mask from
    # the ids (1 for real tokens, 0 for padding). Without it the encoder
    # attends to padding tokens. Building the mask inside the model keeps the
    # external input signature ('input_ids' only) unchanged for callers.
    pad_token_id = distilbert.config.pad_token_id
    if pad_token_id is None:
        pad_token_id = 0
    attention_mask = tf.cast(tf.not_equal(input_ids, pad_token_id), tf.int32)

    outputs = distilbert.distilbert(input_ids, attention_mask=attention_mask)
    last_hidden_state = outputs.last_hidden_state

    # Use the CLS token for classification tasks (first token)
    cls_token = last_hidden_state[:, 0, :]

    # One sigmoid unit per label: the labels are not mutually exclusive
    emotion_classifier = tf.keras.layers.Dense(num_labels, activation='sigmoid')(cls_token)

    # Define the model
    model = tf.keras.Model(inputs=input_ids, outputs=emotion_classifier)

    # Compile the model
    optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate, weight_decay=weight_decay)
    f1_metric = tfa.metrics.F1Score(num_classes=num_labels, average='micro', threshold=0.5)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[f1_metric])

    return model

In [None]:
def train(model_path="model", train_path="train.csv", dev_path="dev.csv"):
    """Fine-tune the classifier on CSV data and keep the best model on disk.

    Both CSV files must have the text in a column named "text" as their first
    column; every remaining column is treated as one binary label.

    Args:
        model_path: Location the trained model is written to.
        train_path: CSV file with the training examples.
        dev_path: CSV file with the validation examples.

    Raises:
        ValueError: If the number of label columns does not match the number
            of outputs of the model returned by `create_model()`.
    """
    # load the CSVs into Huggingface datasets to allow use of the tokenizer
    hf_dataset = datasets.load_dataset("csv", data_files={
        "train": train_path, "validation": dev_path})

    # the labels are the names of all columns except the first
    labels = hf_dataset["train"].column_names[1:]

    def gather_labels(example):
        """Converts the label columns into a list of 0s and 1s"""
        # the float here is because F1Score requires floats
        return {"labels": [float(example[l]) for l in labels]}

    # convert text and labels to format expected by model
    hf_dataset = hf_dataset.map(gather_labels)
    hf_dataset = hf_dataset.map(tokenize, batched=True)

    model = create_model()

    # Fail early with a clear message instead of a shape error inside fit().
    num_outputs = model.output_shape[-1]
    if num_outputs != len(labels):
        raise ValueError(
            f"{train_path} has {len(labels)} label columns but the model "
            f"has {num_outputs} outputs")

    def to_tf_dataset(split):
        """Builds an unbatched (features, targets) dataset from one split."""
        # Explicit dtypes: int32 ids match the model's Input layer, float32
        # targets match the sigmoid outputs.
        features = {
            'input_ids': np.asarray(split['input_ids'], dtype=np.int32),
        }
        targets = np.asarray(split['labels'], dtype=np.float32)
        return tf.data.Dataset.from_tensor_slices((features, targets))

    # Shuffle with a buffer covering the whole training set; a small buffer
    # only mixes neighbouring rows and largely preserves the file order.
    num_train = hf_dataset['train'].num_rows
    train_dataset = to_tf_dataset(
        hf_dataset['train']).shuffle(max(1, num_train)).batch(16)
    validation_dataset = to_tf_dataset(hf_dataset['validation']).batch(16)

    # Keep only the epoch with the best validation F1 on disk.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=model_path,
        monitor="val_f1_score",
        mode="max",
        save_best_only=True)
    model.fit(train_dataset, validation_data=validation_dataset, epochs=5,
              callbacks=[checkpoint])

    # Do not overwrite the best checkpoint with the last-epoch weights. Save
    # the final model only when the callback never wrote one (its `best`
    # score is still the initial -inf, e.g. the monitored metric was missing).
    best_score = getattr(checkpoint, "best", -np.inf)
    if not np.isfinite(best_score):
        model.save(model_path)


In [None]:
def predict(model_path="model", input_path="dev.csv",
            output_path="submission.zip", threshold=0.5):
    """Label a CSV file with the saved model and write a zipped submission.

    Args:
        model_path: Location of the model saved by `train()`.
        input_path: CSV file with a "text" column to classify.
        output_path: ZIP file to write; it contains a single member named
            'submission.csv'.
        threshold: Probability above which a label is predicted as 1.

    Raises:
        ValueError: If the model's output width differs from the number of
            label names.
    """
    # Inference only: skip restoring the optimizer and training metrics.
    model = tf.keras.models.load_model(model_path, compile=False)

    # Load the data for prediction
    df = pandas.read_csv(input_path)

    # Create input features in the same way as in train()
    hf_dataset = datasets.Dataset.from_pandas(df)
    hf_dataset = hf_dataset.map(tokenize, batched=True)

    labels = ['admiration', 'amusement', 'gratitude', 'love', 'pride', 'relief', 'remorse']

    predict_features = {
        'input_ids': np.array(hf_dataset['input_ids']),
    }

    # One probability per label for every row of the input file
    predictions = model.predict(predict_features)

    # Guard against writing columns under the wrong label names.
    if predictions.shape[-1] != len(labels):
        raise ValueError(
            f"model produces {predictions.shape[-1]} outputs but "
            f"{len(labels)} label names are defined")

    # Binarize the probabilities into 0/1 decisions
    predicted_labels = (predictions > threshold).astype(int)

    # Add (or overwrite) one column per label with the predicted values
    for i, label in enumerate(labels):
        df[label] = predicted_labels[:, i]

    # Save the dataframe to a compressed CSV inside a ZIP file
    df.to_csv(output_path, index=False, compression=dict(
        method='zip', archive_name='submission.csv'))

In [None]:
# Fine-tune with the default paths: reads train.csv and dev.csv and writes
# the trained model to "model".
train()

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/25196 [00:00<?, ? examples/s]

Map:   0%|          | 0/3149 [00:00<?, ? examples/s]

Map:   0%|          | 0/25196 [00:00<?, ? examples/s]

Map:   0%|          | 0/3149 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/5



Epoch 2/5



Epoch 3/5



Epoch 4/5



Epoch 5/5




In [None]:
# Label dev.csv with the saved model and write the result to submission.zip.
predict()

In [None]:
# Uncomment to delete the saved "model" directory (e.g. before retraining from scratch).
#!rm -rf model

In [None]:
# Ask Colab to release (unassign) the current runtime now that all cells have
# run, so the session does not stay allocated while idle.
from google.colab import runtime
runtime.unassign()