<a href="https://colab.research.google.com/github/karthikresi/PRODIGY_DA_01/blob/main/Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load sample data
data = {
    'query': [
        "My order hasn't arrived yet. Can you provide an update?",
        "I received a damaged product. How can I get a replacement?",
        "What are your return policies for online purchases?"
    ],
    'response': [
        "I apologize for the delay. Let me check the status of your order for you.",
        "I'm sorry to hear that. Please provide your order details, and we'll arrange a replacement.",
        "Our return policy allows returns within 30 days of purchase. Could you please provide your order number?"
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Split into train, validation, test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Save to CSV files
train_df.to_csv('train_dataset.csv', index=False)
val_df.to_csv('val_dataset.csv', index=False)
test_df.to_csv('test_dataset.csv', index=False)

In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load fine-tuned model and tokenizer from directory path
model_path = "./fine_tuned_gpt2"  # Ensure this path is correct and exists
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Example prompt for text generation
prompt = "What is your return policy?"

# Tokenize the prompt
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Generate text
generated = model.generate(input_ids, max_length=100, num_return_sequences=1)

# Decode and print the generated text
decoded_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print("Generated Text:", decoded_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Text: What is your return policy?

I'm not sure if I'll be able to return to the game. I'm not sure if I'll be able to play the game. I'm not sure if I'll be able to play the game.

What is your return policy?

I'm not sure if I'll be able to return to the game. I'm not sure if I'll be able to play the game. I'm not sure if I'll be able to


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
from datasets import Dataset

# Load sample data
data = {
    'query': [
        "My order hasn't arrived yet. Can you provide an update?",
        "I received a damaged product. How can I get a replacement?",
        "What are your return policies for online purchases?"
    ],
    'response': [
        "I apologize for the delay. Let me check the status of your order for you.",
        "I'm sorry to hear that. Please provide your order details, and we'll arrange a replacement.",
        "Our return policy allows returns within 30 days of purchase. Could you please provide your order number?"
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Split into train, validation, test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Save to CSV files
train_df.to_csv('train_dataset.csv', index=False)
val_df.to_csv('val_dataset.csv', index=False)
test_df.to_csv('test_dataset.csv', index=False)

# Load datasets
train_df = pd.read_csv('train_dataset.csv')
val_df = pd.read_csv('val_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset_hf = Dataset.from_pandas(train_df)
val_dataset_hf = Dataset.from_pandas(val_df)
test_dataset_hf = Dataset.from_pandas(test_df)

# Initialize the tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Optional: Add a special token for padding if not already in tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Tokenize the data
def tokenize_function(examples):
    # Tokenize each query and response individually
    query_encodings = tokenizer(examples['query'], max_length=512, padding="max_length", truncation=True)
    response_encodings = tokenizer(examples['response'], max_length=512, padding="max_length", truncation=True)

    # Combine query and response encodings to create input and label pairs
    input_ids = [q + r for q, r in zip(query_encodings['input_ids'], response_encodings['input_ids'])]
    attention_mask = [q + r for q, r in zip(query_encodings['attention_mask'], response_encodings['attention_mask'])]
    labels = input_ids.copy()  # Use input_ids as labels for language modeling

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

train_encodings = train_dataset_hf.map(tokenize_function, batched=True)
val_encodings = val_dataset_hf.map(tokenize_function, batched=True)
test_encodings = test_dataset_hf.map(tokenize_function, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=val_encodings,
)

# Start fine-tuning
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,12.690575
2,No log,12.378916
3,No log,12.272951


('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')