In [None]:
# Install required libraries (if running in Colab)
!pip install transformers datasets



In [None]:
# Optional: Mount Google Drive if your data is stored there
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
train_file = '/content/drive/MyDrive/gpt2_train.txt'

In [None]:
!cp /content/drive/MyDrive/gpt2_train.txt /tmp/gpt2_train.txt


In [None]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
import os

# Config
model_name = "gpt2"
train_file = "/tmp/gpt2_train.txt"  # your training text file path
output_dir = "local_model_finetuned"

# Load tokenizer and add pad token (GPT-2 has no pad token by default)
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # set pad token to eos token

# Load pretrained GPT-2 model
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))  # resize token embeddings if tokenizer changed

# Load dataset from text lines
with open(train_file, "r", encoding="utf-8") as f:
    lines = f.read().splitlines()

dataset = Dataset.from_dict({"text": lines})

# Tokenize dataset (pad/truncate to max_length)
def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=64)

tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

# Prepare data collator for causal language modeling (no MLM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=100,
    report_to="none",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save model and tokenizer properly
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"✅ Model and tokenizer saved to {os.path.abspath(output_dir)}")

# -----------------
# Local generation test (sanity check)
from transformers import AutoTokenizer, GPT2LMHeadModel
import torch

print("\nStarting local generation test...")

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

prompt = "Welcome to our smart"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

outputs = model.generate(
    **inputs,
    max_length=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    pad_token_id=tokenizer.pad_token_id,
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:")
print(generated_text)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss
100,2.2149
200,1.3194
300,1.151


✅ Model and tokenizer saved to /content/local_model_finetuned

Starting local generation test...
Generated text:
Welcome to our smart city for Gen Z customers → arealbeachings.net, arealcity.com, arealz.net, forzcustomers.com).

Follow us on Twitter for customers → customersforz.


In [43]:
from transformers import AutoTokenizer, GPT2LMHeadModel
import torch
import os

# Path where your fine-tuned model and tokenizer are saved locally
local_model_path = "local_model_finetuned"  # change to your actual folder

assert os.path.exists(local_model_path), f"Model folder not found: {local_model_path}"

# Load tokenizer and model locally
tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = GPT2LMHeadModel.from_pretrained(local_model_path)

# Set pad token if not set (GPT-2 usually has no pad token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Prompt for testing
prompt = "Welcome to our smart"

# Tokenize input and move tensors to device
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate output
outputs = model.generate(
    **inputs,
    max_length=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Decode and print the generated text
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated text:")
print(result)


Generated text:
Welcome to our smart hotel for Gen Z customers → forzcustomers.org, customersfor.org, forgenz.net, arealotouch.org, sanforz.org, zcustomersz.org, zcustom


In [None]:
from huggingface_hub import login, create_repo, upload_folder
import os
import time

# 1. Authenticate - replace with your actual Hugging Face token
try:
    login(token="hf_token")
    print("✅ Authentication successful!")
except Exception as e:
    print(f"❌ Authentication failed: {e}")
    exit(1)

# 2. Define the repo ID and local folder path
repo_id = "Octopus87/domain-suggester-gpt2"  # Replace with your actual username
local_path = "local_model_finetuned"

# 3. Check if the local folder exists and has content
if not os.path.isdir(local_path):
    raise ValueError(f"Provided path: '{local_path}' is not a directory")

# Check if folder has model files
required_files = ["config.json"]
folder_contents = os.listdir(local_path)
print(f"📁 Local folder contents: {folder_contents}")

if not any(file in folder_contents for file in required_files):
    print("⚠️  Warning: No config.json found. Make sure you have valid model files.")

# 4. Create the repo on Hugging Face (will do nothing if it exists)
try:
    create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
    print(f"✅ Repository '{repo_id}' ready!")
except Exception as e:
    print(f"❌ Failed to create repository: {e}")
    exit(1)

# 5. Upload all files from the local folder to the repo root
try:
    print("🚀 Starting upload...")
    upload_folder(
      repo_id="Octopus87/domain-suggester-gpt2",
      folder_path=output_dir,
      path_in_repo=".",
      repo_type="model",
      commit_message="Upload fine-tuned model and tokenizer with vocab"
    )
    print("✅ Model uploaded successfully!")
    print(f"🔗 View your model at: https://huggingface.co/{repo_id}")

except Exception as e:
    print(f"❌ Upload failed: {e}")
    print("Common solutions:")
    print("- Check your internet connection")
    print("- Verify you have write permissions to the repository")
    print("- Ensure model files are valid")
    print("- Try uploading smaller batches if the model is very large")

✅ Authentication successful!
📁 Local folder contents: ['config.json', 'special_tokens_map.json', 'tokenizer.json', 'checkpoint-200', 'tokenizer_config.json', 'generation_config.json', 'checkpoint-300', 'vocab.json', 'model.safetensors', 'pytorch_model.bin', 'merges.txt']
✅ Repository 'Octopus87/domain-suggester-gpt2' ready!
🚀 Starting upload...


No files have been modified since last commit. Skipping to prevent empty commit.


✅ Model uploaded successfully!
🔗 View your model at: https://huggingface.co/Octopus87/domain-suggester-gpt2


In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel
import torch

model = GPT2LMHeadModel.from_pretrained("Octopus87/domain-suggester-gpt2", token="hf_token")
tokenizer = AutoTokenizer.from_pretrained("Octopus87/domain-suggester-gpt2", token="hf_token")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

prompt = "Welcome to our smart"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

outputs = model.generate(
    **inputs,
    max_length=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    pad_token_id=tokenizer.pad_token_id,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


tokenizer_config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

Welcome to our smart city for Gen Z customers with a global audience → zcustomerscustomersglobal.org, areaengroup.org, areaengroup.com, zcustomersglobal.com, zcustomerscustomersz


In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel
import torch

model = GPT2LMHeadModel.from_pretrained("Octopus87/domain-suggester-gpt2", token="hf_token")
tokenizer = AutoTokenizer.from_pretrained("Octopus87/domain-suggester-gpt2", token="hf_token")

# Set pad token if needed
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

prompt = "Welcome to our smart"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

outputs = model.generate(
    **inputs,
    max_length=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Welcome to our smart health service in downtown area → healthdowntownhealth.net, healthinarea.net, healthareahealth.net, healthowntownhealth.net, healthinarea.org, downtownhealthdowntown.com, healthare
