# **Step 1: Install Required Libraries:**

transformers, datasets, and torch are installed using !pip install to enable model fine-tuning.

In [1]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupt

# **Step 2: Data Preprocessing:**

The pandas library is used to read the CSV file and preprocess the data. The columns question and context are combined to form the input prompt for the T5 model, and the answer column serves as the target.

In [2]:
import pandas as pd
from datasets import Dataset

# Read dataset
df = pd.read_csv('/content/drive/MyDrive/NLP PROJECTS/Sentiment Analysis Dataset/dataset.csv')

# Preprocess data by concatenating question and context as the input prompt
df['input'] = df['question'] + " " + df['context']

# Select only necessary columns: input and output
df = df[['input', 'answer']]

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)


# **Step 3: Load the Model and Tokenizer:**

The pre-trained T5 model and tokenizer (t5-small) are loaded from Hugging Face.

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load pre-trained T5 model and tokenizer
model_name = 't5-small'  # Or 't5-base', 't5-large' based on your computational resources
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Add padding token if not available
tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# **Step 4: Tokenize the Dataset:**

The dataset is tokenized using the T5 tokenizer, where both inputs (question + context) and answers are tokenized. The input is padded/truncated to a max length of 512, while the output (answer) is truncated to a max length of 128.

In [4]:
def tokenize_function(examples):
    model_inputs = tokenizer(examples['input'], max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(examples['answer'], max_length=128, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply tokenization to dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/78577 [00:00<?, ? examples/s]

# **Step 5: Split the Data into Training and Evaluation:**

The dataset is split into training and evaluation datasets using an 90-10% split for training and testing.

In [5]:
# Split dataset into train and validation sets
train_dataset = tokenized_datasets.train_test_split(test_size=0.1)['train']
eval_dataset = tokenized_datasets.train_test_split(test_size=0.1)['test']


# **Step 6: Fine-Tuning the Model:**

The Trainer class from the Hugging Face transformers library is used to fine-tune the T5 model using the tokenized data. The training arguments are set (e.g., batch size, number of epochs, learning rate), and the model is trained.

In [6]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluate every epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
)

trainer = Trainer(
    model=model,                         # the model to train
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset            # evaluation dataset
)

# Start training
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33miguptabittu[0m ([33miguptabittu-linkedin[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0602,0.042623
2,0.0531,0.034916


Epoch,Training Loss,Validation Loss
1,0.0602,0.042623
2,0.0531,0.034916
3,0.0484,0.033216


TrainOutput(global_step=26520, training_loss=0.06957207922212678, metrics={'train_runtime': 9591.8327, 'train_samples_per_second': 22.119, 'train_steps_per_second': 2.765, 'total_flos': 2.8713710574895104e+16, 'train_loss': 0.06957207922212678, 'epoch': 3.0})

TrainOutput(global_step=26520, training_loss=0.06957207922212678, metrics={'train_runtime': 9591.8327, 'train_samples_per_second': 22.119, 'train_steps_per_second': 2.765, 'total_flos': 2.8713710574895104e+16, 'train_loss': 0.06957207922212678, 'epoch': 3.0})

# **Step 7: Save the Fine-Tuned Model:**

Once training is complete, the fine-tuned model and tokenizer are saved to the specified directory for later use.

In [7]:
model.save_pretrained('/content/drive/MyDrive/NLP PROJECTS/Sentiment Analysis Dataset/fine_tuned_t5')
tokenizer.save_pretrained('/content/drive/MyDrive/NLP PROJECTS/Sentiment Analysis Dataset/fine_tuned_t5')

('/content/drive/MyDrive/NLP PROJECTS/Sentiment Analysis Dataset/fine_tuned_t5/tokenizer_config.json',
 '/content/drive/MyDrive/NLP PROJECTS/Sentiment Analysis Dataset/fine_tuned_t5/special_tokens_map.json',
 '/content/drive/MyDrive/NLP PROJECTS/Sentiment Analysis Dataset/fine_tuned_t5/spiece.model',
 '/content/drive/MyDrive/NLP PROJECTS/Sentiment Analysis Dataset/fine_tuned_t5/added_tokens.json')

('/content/drive/MyDrive/NLP PROJECTS/Sentiment Analysis Dataset/fine_tuned_t5/tokenizer_config.json',
 '/content/drive/MyDrive/NLP PROJECTS/Sentiment Analysis Dataset/fine_tuned_t5/special_tokens_map.json',
 '/content/drive/MyDrive/NLP PROJECTS/Sentiment Analysis Dataset/fine_tuned_t5/spiece.model',
 '/content/drive/MyDrive/NLP PROJECTS/Sentiment Analysis Dataset/fine_tuned_t5/added_tokens.json')

# **Step 8: Model Evaluation / Inference:**

A generate_answer function is defined to allow inference by passing in a test question and context. The model generates SQL queries based on the input and context.

In [11]:
def generate_answer(question, context):
    input_text = question + " " + context
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)

    # Move inputs to the same device as the model
    device = next(model.parameters()).device  # Get model's device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model.generate(input_ids=inputs['input_ids'], max_length=128, num_beams=4, early_stopping=True)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# **Testing the Model:**


In [15]:
test_question = "How many heads of the departments are older than 50?"
test_context = "CREATE TABLE head (age INTEGER)"
print(generate_answer(test_question, test_context))

<pad> SELECT COUNT(*) FROM head WHERE age  50


In [16]:
test_question = "What is the maximum budget of the departments?"
test_context = "CREATE TABLE department (budget_in_billions INTEGER)"
print(generate_answer(test_question, test_context))


<pad> SELECT MAX(budget_in_billions) FROM department


In [17]:
test_question = "List the names and ages of all heads of departments."
test_context = "CREATE TABLE head (name VARCHAR, age INTEGER)"
print(generate_answer(test_question, test_context))


<pad> SELECT name, age FROM head GROUP BY age


In [20]:
test_question = "What is the average salary of employees?"
test_context = "CREATE TABLE employee (salary INTEGER)"
print(generate_answer(test_question, test_context))


<pad> SELECT AVG(salary) FROM employee


In [21]:
test_question = "Select the departments where the number of employees is greater than 100."
test_context = "CREATE TABLE department (num_employees INTEGER)"
print(generate_answer(test_question, test_context))


<pad> SELECT COUNT(*) FROM department WHERE number_employees > 100


In [22]:
test_question = "How many employees have a salary above 50000?"
test_context = "CREATE TABLE employee (salary INTEGER)"
print(generate_answer(test_question, test_context))


<pad> SELECT COUNT(*) FROM employee WHERE salary > 50000


In [23]:
test_question = "What is the maximum and minimum age of heads in each department?"
test_context = "CREATE TABLE head (department VARCHAR, age INTEGER)"
print(generate_answer(test_question, test_context))


<pad> SELECT department FROM head GROUP BY department
