In [1]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load the raw conversation export. The encoding is spelled out so the file
# is decoded the same way on every platform instead of relying on the
# locale default.
with open('jacksparrow_modified.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Sequence of turns; each turn is indexed below by its 'from' (speaker tag)
# and 'value' (text) keys.
conversations = data['conversations']

# Pair every 'human' turn with the 'gpt' turn that immediately follows it.
# Turns that are not part of such an adjacent pair are skipped.
human_inputs = []
gpt_outputs = []

for turn, next_turn in zip(conversations, conversations[1:]):
    if turn['from'] == 'human' and next_turn['from'] == 'gpt':
        human_inputs.append(turn['value'])
        gpt_outputs.append(next_turn['value'])

# Fail early with a clear message rather than handing an empty dataset to
# the training code further down.
if not human_inputs:
    raise ValueError(
        "No adjacent human -> gpt turns found in 'jacksparrow_modified.json'"
    )

# One training string per exchange, in the same "Human: ...\nGPT: ..." layout
# that is used for prompting at inference time.
train_data = [f"Human: {h}\nGPT: {g}" for h, g in zip(human_inputs, gpt_outputs)]
dataset = Dataset.from_dict({'text': train_data})

# Load the tokenizer and model
model_name = "SF-Foundation/TextBase-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# The dataset is padded to a fixed length during tokenization, which requires
# the tokenizer to define a pad token. Some causal-LM tokenizers ship without
# one; fall back to EOS in that case so padding does not raise. This is a
# no-op when the tokenizer already has a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of training strings with the module-level tokenizer.

    Only the ``'text'`` entry of ``examples`` is read. The tokenizer is asked
    to truncate and to pad to ``max_length`` with a limit of 128, so every
    returned sequence has the same length.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return encoded

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Define training arguments.
# The previous run of this cell ended in "CUDA out of memory", so the settings
# below trade some speed for a smaller training footprint.
# NOTE(review): these reduce activation and optimizer-state memory only. If the
# model weights alone do not fit on the GPU, a smaller model or a
# parameter-efficient method is needed instead - confirm against the hardware.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=300,
    save_total_limit=1,  # Keep only the newest checkpoint; each one is a full copy of the model
    fp16=True,  # Enable mixed precision training if supported by GPU
    learning_rate=5e-5,  # Adjust the learning rate here
    gradient_checkpointing=True,  # Recompute activations in the backward pass instead of storing them
    optim="adafactor",  # Much smaller optimizer state than the default AdamW
)

# Data collator. mlm=False selects causal-LM batches (labels are built from
# the input ids) rather than masked-LM batches.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Hold out a slice of the data for evaluation. Evaluating on the very same
# examples the model is trained on cannot reveal overfitting, so the per-epoch
# eval loss would carry no information. The seed keeps the split reproducible.
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Write the fine-tuned weights and the tokenizer files into one directory so
# the pair can be loaded back together.
save_dir = "fine-tuned-textbase-7b"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Rebind `tokenizer` and `model` to fresh instances read back from disk; the
# inference code below uses these reloaded objects.
tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForCausalLM.from_pretrained(save_dir)

# Function to generate a response
def generate_response(prompt, model, tokenizer, max_length=50, num_return_sequences=1):
    """Sample a continuation of ``prompt`` from ``model`` and return it as text.

    Args:
        prompt: Text the model is conditioned on.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; used to encode the prompt
            and decode the generated ids.
        max_length: Forwarded to ``generate`` as ``max_length``. In
            ``transformers`` this bounds the total sequence length, so the
            prompt's own tokens count toward it.
        num_return_sequences: Number of sequences to sample. Only the first
            one is decoded and returned.

    Returns:
        The first generated sequence decoded with special tokens removed.
    """
    # Calling the tokenizer (rather than ``encode``) also yields the attention
    # mask. It has to be passed explicitly because pad and EOS share an id
    # below, which prevents ``generate`` from inferring the mask on its own.
    # Moving the batch to the model's device keeps generation working when the
    # model lives on a GPU.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,  # Use sampling for more varied responses
        top_k=50,  # Limit the sampling pool to top_k tokens
        top_p=0.95,  # Nucleus sampling
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example prompt in the same "Human: ...\nGPT:" layout as the training text,
# with the GPT turn left open for the model to complete.
prompt = "Human: How are you today?\nGPT:"

# Sample one reply using the function's default generation settings and show it.
response = generate_response(prompt=prompt, model=model, tokenizer=tokenizer)
print(response)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 8/8 [00:32<00:00,  4.07s/it]
Map: 100%|██████████| 316/316 [00:00<00:00, 727.71 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 