In [56]:
# Install required libraries (if running in Colab)
!pip install transformers datasets



In [57]:
# Optional: Mount Google Drive if your data is stored there
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [58]:
train_file = '/content/drive/MyDrive/gpt2_train.txt'

In [59]:
!cp /content/drive/MyDrive/gpt2_train.txt /tmp/gpt2_train.txt


In [60]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
import os

model_name = 'gpt2'
train_file = "/tmp/gpt2_train.txt"
output_dir = "local_model_finetuned"

print(f"Using model: {model_name}")
print(f"Training data file: {train_file}")
print(f"Output directory: {os.path.abspath(output_dir)}")

# Load tokenizer and add padding token
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT2 has no pad token by default

# Load pre-trained GPT-2 model and resize token embeddings if tokenizer changed
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Load dataset manually from text file
with open(train_file, "r", encoding="utf-8") as f:
    lines = f.read().splitlines()

dataset = Dataset.from_dict({"text": lines})

# Tokenize dataset with truncation and padding to max_length
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=64)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Prepare data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=100,
    report_to="none",  # Disable reporting integrations like WandB
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
try:
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"✅ Model and tokenizer saved successfully to: {os.path.abspath(output_dir)}")
except Exception as e:
    print(f"❌ Failed to save model or tokenizer: {e}")


Using model: gpt2
Training data file: /tmp/gpt2_train.txt
Output directory: /content/local_model_finetuned


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss
100,2.2117
200,1.3134
300,1.1307


✅ Model and tokenizer saved successfully to: /content/local_model_finetuned


In [61]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load your fine-tuned model
model = GPT2LMHeadModel.from_pretrained("local_model_finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("local_model_finetuned")
model.eval()

# Prepare input
prompt = "Welcome to our smart"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate text
outputs = model.generate(
    **inputs,
    max_length=50,
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    pad_token_id=tokenizer.eos_token_id,
)

# Print the generated text
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Welcome to our smart city with a global audience → withglobalaudience.net, witha.net, withmallaudience.org, cityaudience.net, cityaudienceplatform.net, awithaudience.com,


In [62]:
prompts = [
    "Discover the future of",
    "AI in healthcare will",
    "Top domain names for",
    "Smart homes and devices are"
]

for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_length=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    print(f"\nPrompt: {prompt}")
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))




Prompt: Discover the future of
Discover the future of luxury skincare line in a small town → smallinluxury.org, skincareintown.net, areadowntownluxury.net, inluxuryluxury.com, luxurysk

Prompt: AI in healthcare will
AI in healthcare will be a small town in a small town → healthmalltown.org, townsmall.net, townsmallin.net, withdowntowna.org, townsmall.org, healthmalltown.org,

Prompt: Top domain names for
Top domain names for: AI SaaS startup targeting remote workers → remoteappstartup.net, workersremotetargeting.org, workersremoteapp.net, saaasinetargeting.com, instartup.net, rem

Prompt: Smart homes and devices are
Smart homes and devices are a luxury luxury brand targeting remote workers in remote countries. If you are in remote, luxury workers are luxury workers with a global audience. luxuryworkersremote.net.com, remoteworkersremote.com, remot


In [64]:
import os
import shutil

# Compress the model directory
shutil.make_archive("local_model_finetuned", 'zip', "/content/local_model_finetuned")

# Get file size in bytes
file_size_bytes = os.path.getsize("local_model_finetuned.zip")

# Convert to MB
file_size_mb = file_size_bytes / (1024 * 1024)
print(f"📦 ZIP file size: {file_size_mb:.2f} MB")



📦 ZIP file size: 1756.13 MB


In [None]:
from google.colab import files
files.download("local_model_finetuned.zip")


In [65]:
import os

model_dir = "/content/local_model_finetuned"
unneeded_files = [
    "training_args.bin",
    "trainer_state.json",
    "optimizer.pt",
    "scheduler.pt",
]

for filename in unneeded_files:
    path = os.path.join(model_dir, filename)
    if os.path.exists(path):
        os.remove(path)
        print(f"✅ Removed: {filename}")
    else:
        print(f"ℹ️ Not found: {filename}")


ℹ️ Not found: training_args.bin
ℹ️ Not found: trainer_state.json
ℹ️ Not found: optimizer.pt
ℹ️ Not found: scheduler.pt


In [67]:
from google.colab import files
files.download("/content/local_model_finetuned.zip")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>