## File upload

This step uploads `train.csv`. Instead of downloading the data from Kaggle (see below), it was obtained from Zenodo: https://zenodo.org/records/6462718.

The Kaggle API credentials file, `kaggle.json`, is uploaded in the same way.

```python
import os

# Move the uploaded kaggle.json from the working directory into /root/.kaggle,
# creating that directory first when it is missing.
kaggle_dir = '/root/.kaggle'
os.makedirs(kaggle_dir, exist_ok=True)
os.rename('kaggle.json', kaggle_dir + '/kaggle.json')
```

`chmod 600 /root/.kaggle/kaggle.json`

As noted above, the Kaggle download shown below is replaced by the Zenodo download.

`kaggle competitions download -c quora-insincere-questions-classification`

In [None]:
from google.colab import files
# files.upload() returns a dict mapping each uploaded file name to its raw bytes.
# Leaving the call as the cell's last expression would echo that dict -- i.e. the
# full contents of train.csv or the API key inside kaggle.json -- into the saved
# notebook output. Capture the result and display only the file names instead.
uploaded = files.upload()
print("Uploaded:", sorted(uploaded))

In [1]:
import numpy as np
import pandas as pd
# Read only the first 500 rows of the training file.
data = pd.read_csv(
    "train.csv",
    nrows=500,
)

In [2]:
!pip install kaggle



In [3]:
! pip install transformers accelerate bitsandbytes peft datasets



In [4]:
from datasets import Dataset
# Positional split of the loaded frame: rows [0, 400) for training,
# every row from position 400 onwards for testing.
train_ds, test_ds = (
    Dataset.from_pandas(data.iloc[:400]),
    Dataset.from_pandas(data.iloc[400:]),
)

In [7]:
# Interactive Hugging Face Hub login: prompts for an access token (input is hidden).
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer first, then the seq2seq model, both from the same checkpoint.
model_name = "google/flan-t5-base"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
)

As of PEFT version 0.10.0, the function `prepare_model_for_int8_training` has been deprecated and is no longer available. To prepare models for INT8 training, use `prepare_model_for_kbit_training` instead. Note also that the `TrainingArguments` parameter `evaluation_strategy` has been renamed to `eval_strategy`.

In [None]:
# AutoModelForSeq2SeqLM is imported here as well, so this cell runs on its own
# instead of relying on an import executed in a different cell.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer

# Load the pre-trained model and tokenizer
model_id = "google/flan-t5-xxl"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): no quantization config is passed here, so the checkpoint is
# loaded as-is even though prepare_model_for_kbit_training is applied next --
# confirm whether 8-bit loading was intended for this model size.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

# Define LoRA configuration: adapters are attached to the modules named 'q' and 'v'
loraconfig = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=['q', 'v'],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, loraconfig)


def tokenize_function(examples):
    """Tokenize the ``question_text`` column of a batch.

    Sequences are truncated and padded with ``padding="max_length"``; no explicit
    ``max_length`` is passed. This helper is not called anywhere in this cell.
    """
    return tokenizer(examples["question_text"], padding="max_length", truncation=True)


# Maximum number of input tokens kept per question
max_length = 128


def preprocess_function(examples):
    """Turn a batch of raw rows into model inputs with ``labels``.

    Args:
        examples: batch dict providing the ``question_text`` and ``target`` columns.

    Returns:
        The tokenizer output for the questions (padded/truncated to ``max_length``
        tokens) with an added ``labels`` entry holding the tokenized targets.
    """
    inputs = examples["question_text"]
    # Targets are converted to strings so they can be tokenized as output text.
    targets = [str(x) for x in examples["target"]]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="np")
    labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="np")
    label_ids = labels["input_ids"]
    # Replace padding token ids with -100, the index the loss ignores, so that
    # padded label positions do not contribute to training.
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = label_ids
    return model_inputs


# Preprocess the datasets; the raw columns are dropped so that only the
# tokenizer outputs and "labels" remain.
train_dataset = train_ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=train_ds.column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on train dataset",
)

test_dataset = test_ds.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=test_ds.column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on test dataset",
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=3,
    # The preprocessed datasets carry the targets under "labels"; the raw
    # "target" column no longer exists after remove_columns. Pointing the Trainer
    # at a missing key makes it treat evaluation batches as unlabeled, so no
    # evaluation loss would be reported.
    label_names=["labels"],
    report_to="none"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Disable cache to save memory during training
model.config.use_cache = False

# Start training
trainer.train()


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]



Running tokenizer on train dataset:   0%|          | 0/400 [00:00<?, ? examples/s]

Running tokenizer on test dataset:   0%|          | 0/100 [00:00<?, ? examples/s]