## Task
Fine-tune the GPT-2 model from Hugging Face using LoRA based on the data in "/content/train.jsonl" and "/content/miniGPT.txt".

## Install necessary libraries
### Subtask:
Install libraries like transformers and peft for model handling and LoRA.

In [None]:
# Install the third-party libraries imported by later cells
# (transformers, peft, datasets, gradio) into the notebook's environment.
%pip install transformers peft datasets gradio

## Load and preprocess the data
### Subtask:
Load the data from "/content/train.jsonl" and "/content/miniGPT.txt" and prepare it for training, which may involve tokenization and formatting.

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Load the prompt/response pairs from train.jsonl (one JSON object per line).
df_jsonl = pd.read_json('/content/train.jsonl', lines=True)
# Join each prompt with its response into a single training string.
df_jsonl['text'] = df_jsonl['prompt'] + ' ' + df_jsonl['response']

# Load miniGPT.txt, keeping one example per non-blank line.
# The encoding is explicit so decoding does not depend on the platform default.
with open('/content/miniGPT.txt', 'r', encoding='utf-8') as f:
    text_data = [line.strip() for line in f if line.strip()]
df_txt = pd.DataFrame(text_data, columns=['text'])

# Stack both sources into one single-column frame.
combined_df = pd.concat([df_jsonl[['text']], df_txt], ignore_index=True)

# Drop rows with no usable text. Missing values are removed first; the
# remaining strings are stripped before the length check because joining an
# empty prompt and an empty response yields ' ', which a plain length test
# would keep.
combined_df = combined_df.dropna(subset=['text'])
combined_df = combined_df[combined_df['text'].str.strip().str.len() > 0]
combined_df = combined_df.reset_index(drop=True)

# Convert to a Hugging Face Dataset. preserve_index=False keeps the pandas
# index from being stored as an extra '__index_level_0__' column.
dataset = Dataset.from_pandas(combined_df, preserve_index=False)

# Load the GPT-2 tokenizer. GPT-2 defines no padding token, so the
# end-of-sequence token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``text`` entries of a batch with the module-level tokenizer.

    Every entry is truncated or padded to exactly 128 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        The tokenizer's output for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize every example in batches, then report the dataset size and show
# the first tokenized record as a sanity check.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
num_examples = len(tokenized_datasets)
print(f'Total examples: {num_examples}')
first_example = tokenized_datasets[0]
print(first_example)

In [None]:
# Create labels for causal-LM training.
def _mask_padding_labels(examples):
    """Copy ``input_ids`` into ``labels``, setting padded positions to -100.

    Padding reuses the EOS token id, so padded positions cannot be found by
    token id; ``attention_mask`` (0 on padding) is used instead. Positions
    labelled -100 are ignored by the loss, so the model is not trained to
    emit padding.
    """
    return {
        'labels': [
            [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
            for ids, mask in zip(examples['input_ids'], examples['attention_mask'])
        ]
    }


tokenized_datasets = tokenized_datasets.map(_mask_padding_labels, batched=True)

# Split the dataset into training (90%) and validation (10%) sets.
# A fixed seed makes the split reproducible across runs.
train_test_split = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Print information about the datasets
print("Training dataset size:", len(train_dataset))
print("Validation dataset size:", len(eval_dataset))

# Display the first example of the training dataset
print(train_dataset[0])

## Load the pre-trained GPT-2 model

### Subtask:
Load the GPT-2 model from Hugging Face.

In [None]:
from transformers import AutoModelForCausalLM

# Load the GPT-2 tokenizer and causal-LM weights from the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
# This rebinds `tokenizer`, so the padding token must be assigned again:
# a freshly loaded GPT-2 tokenizer has none, and any later padded
# tokenization would fail without it.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

## Configure LoRA
### Subtask:
Set up the LoRA configuration for fine-tuning the GPT-2 model.

In [None]:
from peft import LoraConfig

# LoRA hyperparameters for causal language modelling.
lora_config = LoraConfig(
    # Task type
    task_type="CAUSAL_LM",
    # Rank
    r=8,
    # Scaling factor
    lora_alpha=32,
    # Dropout rate
    lora_dropout=0.1,
    # Bias type
    bias="none",
)

## Prepare the model for LoRA
### Subtask:
Integrate the LoRA adapters into the GPT-2 model.

In [None]:
from peft import get_peft_model

# Wrap the base model with the LoRA adapters described by lora_config.
peft_model = get_peft_model(model, lora_config)
# print_trainable_parameters() prints its own summary and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
peft_model.print_trainable_parameters()

## Define training arguments
### Subtask:
Set up the parameters for the training process, such as epochs, batch size, learning rate, etc.

In [None]:
from transformers import TrainingArguments

# Hyperparameters and bookkeeping settings for the fine-tuning run.
training_args = TrainingArguments(
    # Output and logging locations
    output_dir="./lora_gpt2_finetuned",
    logging_dir="./logs",
    logging_steps=10,  # log every 10 steps
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

## Create the trainer
### Subtask:
Instantiate the Hugging Face Trainer with the model, training arguments, and data.

In [None]:
from transformers import Trainer

# Tie together the LoRA-wrapped model, the training arguments, and both
# dataset splits.
trainer = Trainer(
    args=training_args,
    model=peft_model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

## Train the model
### Subtask:
Start the fine-tuning process using the prepared data and configuration.

In [None]:
# Run fine-tuning with the trainer configured above. The call is left as the
# cell's final expression so the notebook displays its return value.
trainer.train()

## Save the fine-tuned model
### Subtask:
Save the fine-tuned model to the training output directory (`./lora_gpt2_finetuned`).

In [None]:
# Save the trained model. No path is passed here, so the trainer picks the
# destination itself (presumably training_args.output_dir — confirm).
trainer.save_model()

## Create the Gradio app
### Subtask:
Create a Python script that uses Gradio to build a simple UI for the fine-tuned model.

In [None]:
%%writefile gradio_app.py
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch # Import torch

# Load the fine-tuned model and tokenizer
def load_model(model_path, lora_path):
    """Load the base model with its LoRA adapter, plus the matching tokenizer.

    Args:
        model_path: Name or path of the base causal-LM checkpoint; also used
            to load the tokenizer.
        lora_path: Path of the saved LoRA adapter to apply to the base model.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is moved to CUDA when a GPU
        is available; the tokenizer pads with its end-of-sequence token.
    """
    tok = AutoTokenizer.from_pretrained(model_path)
    tok.pad_token = tok.eos_token

    backbone = AutoModelForCausalLM.from_pretrained(model_path)
    adapted = PeftModel.from_pretrained(backbone, lora_path)

    # Without a GPU the model stays on its current device.
    if not torch.cuda.is_available():
        return adapted, tok
    return adapted.to('cuda'), tok

# Load the GPT-2 base checkpoint together with the adapter saved by training.
model, tokenizer = load_model(
    model_path="openai-community/gpt2",
    lora_path="./lora_gpt2_finetuned",
)

# Define the prediction function
def generate_text(prompt):
    """Generate sampled text from ``prompt`` using the module-level model.

    Args:
        prompt: Text entered by the user.

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed, or an empty string when ``prompt`` is empty.
    """
    # An empty prompt tokenizes to zero tokens, which gives generate() nothing
    # to continue from; return early instead of surfacing an error in the UI.
    if not prompt:
        return ""

    inputs = tokenizer(prompt, return_tensors="pt")
    # Move inputs to GPU if available, matching where load_model puts the model.
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}

    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        do_sample=True,
        # GPT-2 has no dedicated pad token; pass the EOS id explicitly rather
        # than relying on generate() to pick a fallback.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Build the Gradio UI: a multi-line prompt box wired to generate_text, with
# the generated text shown in a second text box.
prompt_box = gr.Textbox(lines=5, label="Enter your prompt")
output_box = gr.Textbox(label="Generated text")
iface = gr.Interface(
    fn=generate_text,
    inputs=prompt_box,
    outputs=output_box,
    title="microGPT",
    description="Enter a prompt and microGPT will generate text.",
)

# Start serving the interface; share=True asks Gradio for a shareable link.
iface.launch(share=True)

In [None]:
# Run the gradio_app.py script written by the %%writefile cell above.
!python gradio_app.py