<a href="https://colab.research.google.com/github/laxmiharikumar/transformers/blob/main/AIorNOT_HF_Competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Install required packages
!pip install --upgrade huggingface_hub --quiet
!pip install --upgrade datasets --quiet
!pip install --upgrade transformers --quiet

In [None]:
from huggingface_hub import login
from huggingface_hub import hf_hub_download

In [None]:
login()

## Load the data

- Load the training and testing data sets

In [None]:
from datasets import load_dataset

ds = load_dataset('competitions/aiornot')
ds

In [None]:
train_ds = ds["train"]
test_dataset = ds["test"]

In [None]:
## Train data has 0 and 1 labels
import numpy as np
unique, counts = np.unique(train_ds["label"], return_counts=True)
unique, counts

In [None]:
## Test data all labels are set to -1
unique, counts = np.unique(test_dataset["label"], return_counts=True)
unique, counts

## Inspect the Data
- Randomly inspect the images

In [None]:
import random
import io
import matplotlib.pyplot as plt

In [None]:
# def visualize_random_images(tmp_ds1, tmp_ds2, m):
def visualize_random_images(tmp_ds):
  """Show one randomly chosen image from `tmp_ds`, titled with its index and label.

  Args:
    tmp_ds: An indexable dataset supporting `len()`. Each row must be a
      mapping with an "image" entry (handed to `plt.imshow`) and a "label"
      entry.

  Raises:
    ValueError: If `tmp_ds` is empty.

  The title reads "<index> <name>", where <name> is "ai" for label 1,
  "no_ai" for label 0 and "unknown" for anything else.
  """
  if len(tmp_ds) == 0:
    raise ValueError("cannot pick a random image from an empty dataset")

  random_number = random.randint(0, len(tmp_ds) - 1)

  # Fetch the row a single time: every `tmp_ds[i]` access rebuilds the row,
  # so indexing three times (as before) tripled the work.
  example = tmp_ds[random_number]
  label = example["label"]

  if label == 1:
    label_val = "ai"
  elif label == 0:
    label_val = "no_ai"
  else:
    label_val = "unknown"

  plt.imshow(example["image"])
  plt.axis("off")
  plt.title(str(random_number) + " " + label_val)
  plt.show()

  # # plt.figure(figsize=(10,7))
  # plt.imshow(tmp_ds2[random_number][m])
  # plt.axis("off")
  # label_val = "ai" if tmp_ds2[random_number]["label"] == 1 else "no_ai"
  # plt.title(str(random_number) + " " + label_val)
  # plt.show()

In [None]:
# Randomly inspect images from training dataset
visualize_random_images(train_ds)

In [None]:
# Randomly inspect images from test dataset
visualize_random_images(test_dataset)

In [None]:
## Split the labelled training data into a train part (90%) and a held-out
## validation part (10%) for model building. `train_test_split` names the
## held-out part "test"; it is used below as the validation set.
## A fixed seed keeps the shuffled split reproducible.
split_ds = train_ds.train_test_split(seed=42, shuffle=True, test_size=0.1)
split_ds

In [None]:
train_dataset = split_ds["train"]
val_dataset = split_ds["test"]

In [None]:
unique, counts = np.unique(split_ds["train"]["label"], return_counts=True)
unique, counts

In [None]:
unique, counts = np.unique(split_ds["test"]["label"], return_counts=True)
unique, counts

## Preprocess the data

- Normalize the data
- Apply data augmentation

In [None]:
from transformers import AutoImageProcessor

checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
image_processor.size["height"], image_processor.size["width"] ## We should resize the images to this size

In [None]:
import tensorflow as tf

# Target (height, width) taken from the checkpoint's image processor.
size = (image_processor.size["height"], image_processor.size["width"])

# Training pipeline: resize to the model's input size, rescale pixel values
# by 1/255, then apply random rotation, zoom and flip augmentations.
train_data_augmentation = tf.keras.Sequential(
    [
        tf.keras.layers.Resizing(size[0], size[1]),
        # Multiplies by 1/255; presumably inputs are 0-255 pixel values — TODO confirm
        tf.keras.layers.Rescaling(scale=1.0/255, offset=0),
        tf.keras.layers.RandomRotation(factor=0.2),
        tf.keras.layers.RandomZoom(height_factor=0.2, width_factor=0.2),
        tf.keras.layers.RandomFlip()
    ],
    name="train_data_augmentation",
)


# For the validation data and test data apply only Resizing and Rescaling
# (no random layers, so the same input always gives the same output).
val_data_augmentation = tf.keras.Sequential(
    [
        tf.keras.layers.Resizing(size[0], size[1]),
        tf.keras.layers.Rescaling(scale=1.0/255, offset=0)
    ],
    name="val_data_augmentation",
)


In [None]:
import numpy as np
import tensorflow as tf
from PIL import Image


def convert_to_tf_tensor(image: Image.Image):
    """Convert a PIL image into a TensorFlow tensor with a leading batch axis.

    Args:
        image: The PIL image to convert (`Image` is the PIL module, so the
            image class is `Image.Image`).

    Returns:
        A tensor holding the image data with one extra dimension of size 1
        inserted at position 0.
    """
    np_image = np.array(image)
    tf_image = tf.convert_to_tensor(np_image)
    # `expand_dims()` adds a batch dimension because the TF augmentation
    # layers operate on batched inputs.
    return tf.expand_dims(tf_image, 0)


def preprocess_train(example_batch):
    """Run `train_data_augmentation` over every image of a batch.

    Each entry of `example_batch["image"]` is converted to RGB, turned into a
    batched tensor, augmented, and stored under `example_batch["pixel_values"]`.
    The batch is modified in place and also returned.
    """
    augmented_images = []
    for pil_image in example_batch["image"]:
        batched = convert_to_tf_tensor(pil_image.convert("RGB"))
        augmented_images.append(train_data_augmentation(batched))

    pixel_values = []
    for augmented in augmented_images:
        # `tf.squeeze` drops size-1 axes (the batch axis added above);
        # `tf.transpose` without `perm` reverses the remaining axis order.
        # NOTE(review): assuming (H, W, C) after squeeze this gives (C, W, H),
        # i.e. the spatial axes are swapped as well — confirm this is intended.
        pixel_values.append(tf.transpose(tf.squeeze(augmented)))

    example_batch["pixel_values"] = pixel_values
    return example_batch


def preprocess_val(example_batch):
    """Run `val_data_augmentation` over every image of a batch.

    Each entry of `example_batch["image"]` is converted to RGB, turned into a
    batched tensor, passed through the validation pipeline, and stored under
    `example_batch["pixel_values"]`. The batch is modified in place and also
    returned.
    """
    processed_images = []
    for pil_image in example_batch["image"]:
        batched = convert_to_tf_tensor(pil_image.convert("RGB"))
        processed_images.append(val_data_augmentation(batched))

    pixel_values = []
    for processed in processed_images:
        # `tf.squeeze` drops size-1 axes (the batch axis added above);
        # `tf.transpose` without `perm` reverses the remaining axis order.
        # NOTE(review): assuming (H, W, C) after squeeze this gives (C, W, H),
        # i.e. the spatial axes are swapped as well — confirm this is intended.
        pixel_values.append(tf.transpose(tf.squeeze(processed)))

    example_batch["pixel_values"] = pixel_values
    return example_batch

In [None]:
## Add a new column "pixel_values" to the train, validation and test datasets.
## Every row gets the placeholder string "pixel_values"; the real values are
## produced on access by the transforms registered in the next cell.
## NOTE(review): presumably the placeholder exists so that the column is part
## of the dataset schema before `to_tf_dataset` is called — confirm it is needed.
new_column = ["pixel_values"] * len(train_dataset)
train_dataset = train_dataset.add_column("pixel_values", new_column)
new_column = ["pixel_values"] * len(val_dataset)
val_dataset = val_dataset.add_column("pixel_values", new_column)
new_column = ["pixel_values"] * len(test_dataset)
test_dataset = test_dataset.add_column("pixel_values", new_column)

In [None]:
## Run data augmentations on training set, validation set and test set
train_dataset.set_transform(preprocess_train)
val_dataset.set_transform(preprocess_val)
test_dataset.set_transform(preprocess_val)

In [None]:
train_dataset[0]["pixel_values"].shape, val_dataset[0]["pixel_values"].shape, test_dataset[0]["pixel_values"].shape

In [None]:
train_dataset

In [None]:
# As a final preprocessing step, create a batch of examples using DefaultDataCollator. 
# Unlike other data collators in 🤗 Transformers, the DefaultDataCollator does not apply additional preprocessing, such as padding.
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

## Training the model

To fine-tune a model in TensorFlow, follow these steps:

- Define the training hyperparameters, and set up an optimizer and a learning rate schedule.
- Instantiate a pre-trained model.
- Convert a 🤗 Dataset to a tf.data.Dataset.
- Compile your model.
- Add callbacks and use the fit() method to run the training.
- Upload your model to 🤗 Hub to share with the community.

In [None]:
## Step 1 - Define the training hyperparameters, and set up an optimizer and a learning rate schedule.
import math

from transformers import create_optimizer

batch_size = 32
num_epochs = 3
# The schedule is driven by optimizer steps, and one step consumes one batch,
# so the total is (batches per epoch) * epochs. Using the number of examples
# here would stretch the decay over roughly `batch_size` times too many steps,
# leaving the learning rate almost constant for the whole run.
# `ceil` counts the final partial batch (the tf datasets below do not shuffle,
# so no remainder is dropped — NOTE(review): adjust if `drop_remainder` is set).
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)
num_train_steps = steps_per_epoch * num_epochs
learning_rate = 3e-5
weight_decay_rate = 0.01

optimizer, lr_schedule = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=num_train_steps,
    weight_decay_rate=weight_decay_rate,
    num_warmup_steps=0,
)

In [None]:
## Step 2 - Instantiate a pre-trained model
from transformers import TFAutoModelForImageClassification

label2id = {"no_ai": 0, "ai": 1}
id2label = {0: "no_ai", 1: "ai"}

pre_trained_model = TFAutoModelForImageClassification.from_pretrained(
     checkpoint, 
     label2id=label2id,
     id2label=id2label,
     ignore_mismatched_sizes = True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)
pre_trained_model.trainable = True

In [None]:
# train_dataset = train_dataset.remove_columns("image")
# val_dataset = val_dataset.remove_columns("image")
# test_dataset = test_dataset.remove_columns("image")
# train_dataset, val_dataset, 

In [None]:
## Add a Dense layer to the model
# inputs = tf.keras.layers.Input(shape=(3, size[0], size[1]))
# x = base_model(inputs)[0]
# # x =  tf.keras.layers.Flatten()(x)
# outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
# model = tf.keras.Model(inputs, outputs, name="my_laxs_first_model")

model_1 = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(3, size[0], size[1])),
    pre_trained_model,
    tf.keras.layers.Dense(1, activation="sigmoid")
], name="model_1")

In [None]:
model_1.summary()

In [None]:
model_1.save("mmodel1")

In [None]:
## Step 3 - Convert a 🤗 Dataset to a tf.data.Dataset.
!pip install evaluate --quiet

In [None]:
train_dataset

In [None]:
test_dataset

In [None]:
# NOTE(review): `Dataset` imported here is not referenced anywhere in the visible notebook.
from evaluate.module import Dataset
# Convert your datasets to the tf.data.Dataset format using the to_tf_dataset and your data_collator:
# converting our train dataset to tf.data.Dataset
# Already shuffled while splitting
# NOTE(review): with shuffle=False the batches are presumably served in the
# same order every epoch — confirm that is acceptable for training.
tf_train_dataset = train_dataset.to_tf_dataset(
    columns=["pixel_values"], label_cols=["label"], shuffle=False, batch_size=batch_size, collate_fn=data_collator
)

# converting our val dataset to tf.data.Dataset
tf_eval_dataset = val_dataset.to_tf_dataset(
    columns=["pixel_values"], label_cols=["label"], shuffle=False, batch_size=batch_size, collate_fn=data_collator
)

# converting our test dataset to tf.data.Dataset
# (the test labels are all -1 placeholders, see the label counts near the top;
# this dataset is only passed to `predict`)
tf_test_dataset = test_dataset.to_tf_dataset(
    columns=["pixel_values"], label_cols=["label"], shuffle=False, batch_size=batch_size, collate_fn=data_collator
)

In [None]:
tf_train_dataset

In [None]:
## Step 4 - Compile the model
loss = tf.keras.losses.BinaryCrossentropy()
model_1.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

In [None]:
# Step 5 - Fit the model
history_1 = model_1.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=num_epochs)

In [None]:
# Plot the validation and training data separately
def plot_loss_curves(history):
  """
  Plot training and validation curves for loss and accuracy.

  `history.history` must provide the keys 'loss', 'val_loss', 'accuracy' and
  'val_accuracy'. The loss curves are drawn on the current figure and the
  accuracy curves on a newly created one. Nothing is returned.
  """
  recorded = history.history

  train_loss, valid_loss = recorded['loss'], recorded['val_loss']
  train_acc, valid_acc = recorded['accuracy'], recorded['val_accuracy']

  # One x position per recorded epoch, starting at 0.
  epoch_axis = range(len(recorded['loss']))

  # Loss curves (current figure)
  plt.plot(epoch_axis, train_loss, label='training_loss')
  plt.plot(epoch_axis, valid_loss, label='val_loss')
  plt.title('Loss')
  plt.xlabel('Epochs')
  plt.legend()

  # Accuracy curves (separate figure)
  plt.figure()
  plt.plot(epoch_axis, train_acc, label='training_accuracy')
  plt.plot(epoch_axis, valid_acc, label='val_accuracy')
  plt.title('Accuracy')
  plt.xlabel('Epochs')
  plt.legend()
     


In [None]:
# Plot the accuracy
plot_loss_curves(history_1)

In [None]:
model_1.evaluate(tf_eval_dataset)

In [None]:
model_pred_probs = model_1.predict(tf_eval_dataset)
model_pred_probs[:10]

In [None]:
model_preds = tf.round(tf.squeeze(model_pred_probs))
model_preds[:10]

In [None]:
sub_pred_probs = model_1.predict(tf_test_dataset)
sub_pred_probs[:10]

In [None]:
sub_preds = tf.round(tf.squeeze(sub_pred_probs))
sub_preds[:10]

In [None]:
len(sub_preds)

In [None]:
from sklearn.metrics import log_loss

# Log loss on the validation split.
# `y_true` / `y_pred` were never defined, so they are built here:
# - labels are read back from `tf_eval_dataset`, which was created with
#   shuffle=False, so they line up with the order of `model_pred_probs`;
# - predictions are the sigmoid probabilities (not the rounded classes),
#   flattened to one value per example.
y_true = np.concatenate([labels.numpy() for _, labels in tf_eval_dataset])
y_pred = np.squeeze(model_pred_probs)
# Passing `labels` keeps the call valid even if only one class is present.
log_loss(y_true, y_pred, labels=[0, 1])

In [None]:
tf_test_dataset

In [None]:
sample_sub_file = hf_hub_download('competitions/aiornot', '.extras/sample_submission.csv', repo_type='dataset')

In [None]:
from google.colab import files
files.download(sample_sub_file)

In [None]:
import pandas as pd
df = pd.read_csv(sample_sub_file)
df.head()

In [None]:
test_dataset

In [None]:
# Attach the rounded test-set predictions as a new "predicted" column.
# (The unused `new_column` placeholder list that used to be built here was dropped.)
p = test_dataset.add_column("predicted", sub_preds.numpy())
p

In [None]:
type(p)
# p=p.remove_columns("image")
# p=p.remove_columns("pixel_values")
# p=p.remove_columns("label")
# p
# # # , ""

In [None]:
import csv

# Write one "<id>,<predicted>" row per test example.
# Only the two needed columns are formatted, so iterating does not touch the
# image column or run the preprocessing transform for every row.
submission_rows = p.with_format(None, columns=["id", "predicted"])

# NOTE(review): no header row is written — confirm against the layout of
# sample_submission.csv before uploading.
with open('protagonist.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # Iterate over the dataset itself instead of a hard-coded row count (43442).
    writer.writerows([row["id"], row["predicted"]] for row in submission_rows)

In [None]:
from google.colab import files
files.download('protagonist.csv')

In [None]:
!nvidia-smi