In [1]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
file_path = "Dataset.csv"  # Replace with your CSV file path
df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Business Name,Domain,User input Sinhala-in Sinhala letters,User input English-in English letters,Is there any missing information(NO/Yes),Missing Information,Existing Business or Newly Built Business,System output,Unnamed: 9
0,1,Green Grow Farms,Agriculture,අපේ කෘෂිකාර්මික ක්‍රියාමාර්ග සඳහාම වේ. ස්වභාවි...,Our agricultural activities focus on using nat...,No,,Newly Built Business,Executive Summary: Green Grow Farms is a newly...,
1,2,Holiday Mansion,Tourism and Hospitality,"\nHoliday Mansion, kandy පිහිටි වැනි අලංකාර\n ...",I want to make a business proposal for Holiday...,No,,Newly Built Business,"\nExecutive Summary: \nHoliday Mansion, a luxu...",
2,3,ABC Textiles,Manufacturing and Exporting,ආරක්ෂිත ලෙස ABC Textiles සාර්ථකව වෙළඳපොළ\n තුළ...,\nABC Textiles is dedicated to enhancing its m...,Yes,"Financial projections, \ncompany background,\n...",Existing Business,\nExecutive Summary\nABC Manufacturing is a le...,
3,4,PrecisionTools Ltd,Manufacturing and Exporting,\nමම PrecisionTools Ltd සමාගම සඳහා ව්‍යාපාර \n...,\nI want to make a business proposal for \nPre...,Yes,"Details on specific \ntools produced, \ncompet...",Existing Business,\nExecutive Summary\n\nPrecisionTools Ltd spec...,
4,5,Navi,Manufacturing and Exporting,\nමට කතා කරන්න තියෙන ව්‍යාපාර යෝජනාවක් ගැන\n. ...,\nI want to make a business proposal that prov...,No,,Newly Built Business,\nExecutive Summary\nNavi is an emerging cloth...,


In [4]:
# Preprocess dataset: Combine relevant columns into a single input string
df['input_text'] = (
    "Business Name: " + df['Business Name'] + 
    " | Domain: " + df['Domain'] + 
    " | User Input (English): " + df['User input English-in English letters'] + 
    " | Business Type: " + df['Existing Business or Newly Built Business']
)
df['output_text'] = df['System output']

In [5]:
# Split into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['input_text'], df['output_text'], test_size=0.2, random_state=42)

In [6]:
# Save training and testing data to files
with open("train.txt", "w") as train_file:
    for inp, out in zip(train_texts, train_labels):
        train_file.write(f"{inp}\n{out}\n")

with open("test.txt", "w") as test_file:
    for inp, out in zip(test_texts, test_labels):
        test_file.write(f"{inp}\n{out}\n")

In [7]:
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # You can choose a different model, like "gpt-neo"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [8]:
def convert_to_utf8(input_file, output_file):
    with open(input_file, "rb") as f:  # Open the file in binary mode
        content = f.read()
    with open(output_file, "w", encoding="utf-8", errors="replace") as f:
        f.write(content.decode(errors="replace"))

# Convert the problematic file
convert_to_utf8("train.txt", "train_utf8.txt")
convert_to_utf8("test.txt", "test_utf8.txt")

# Create datasets
def load_dataset(file_path, tokenizer, block_size=128):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )
    return dataset

# Correctly handle encoding issues when loading datasets
train_dataset = load_dataset("train_utf8.txt", tokenizer)
test_dataset = load_dataset("test_utf8.txt", tokenizer)




In [9]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
)


In [11]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [12]:
# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_model")

100%|██████████| 402/402 [56:16<00:00,  8.40s/it]


{'train_runtime': 3376.5515, 'train_samples_per_second': 0.951, 'train_steps_per_second': 0.119, 'train_loss': 2.282405720421331, 'epoch': 3.0}


In [19]:
# Save the tokenizer files
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\vocab.json',
 './fine_tuned_model\\merges.txt',
 './fine_tuned_model\\added_tokens.json')

In [20]:
# Testing the model
def generate_text(input_text, model, tokenizer, max_length=1000):
    inputs = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [21]:
# Example Test
test_input = "Business Name: ABC Corp | Domain: IT | User Input (English): Our agricultural activities focus on using natural fertilizers, indoor farming, and methods that protect biodiversity. We aim to increase our yield by using modern technology.We estimate our first-year revenue to be $50,000, with an expected annual growth rate of 20%. According to our timeline, the first harvest is expected within 6 months, and full-scale production is planned to begin within a year | Business Type: Newly Built"
generated_text = generate_text(test_input, model, tokenizer)
print("Generated Proposal:", generated_text)

Generated Proposal: Business Name: ABC Corp | Domain: IT | User Input (English): Our agricultural activities focus on using natural fertilizers, indoor farming, and methods that protect biodiversity. We aim to increase our yield by using modern technology.We estimate our first-year revenue to be $50,000, with an expected annual growth rate of 20%. According to our timeline, the first harvest is expected within 6 months, and full-scale production is planned to begin within a year | Business Type: Newly Built Business | Domain: Industry and User Input (English): We are seeking $50,000 in funding to expand our operations, expand our production capacity, and expand our production capacity.



We are seeking an initial investment of $50,000 to cover the initial production and marketing costs, and $100,000 to cover the marketing costs.



We will also establish a website, Facebook, and a social media presence to promote our products and grow our audience.






































In [15]:
import torch
import evaluate
from transformers import TextDataset

# Load evaluation metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

In [16]:
# Add a padding token to the tokenizer
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Update model's embedding size to include the new special tokens
model.resize_token_embeddings(len(tokenizer))

def calculate_perplexity(model, tokenizer, dataset):
    """
    Calculates perplexity for the test dataset.
    """
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    count = 0

    for sample in dataset:
        # Tokenize input and output texts
        inputs = tokenizer(sample['input_text'], return_tensors="pt", truncation=True, padding=True)
        labels = tokenizer(sample['output_text'], return_tensors="pt", truncation=True, padding=True).input_ids

        # Check for empty labels
        if labels.size(1) == 0:
            print(f"Skipping sample with empty output_text: {sample}")
            continue

        # Ensure labels match input device
        labels = labels.to(inputs["input_ids"].device)

        with torch.no_grad():
            # Forward pass with labels
            outputs = model(**inputs, labels=labels)
            total_loss += outputs.loss.item()
            count += 1

    # Avoid division by zero
    if count == 0:
        raise ValueError("No valid samples found. Check your dataset.")

    return torch.exp(torch.tensor(total_loss / count)).item()


In [17]:
# BLEU and ROUGE evaluation
def evaluate_metrics(model, tokenizer, test_samples):
    """
    Evaluates the model using BLEU and ROUGE metrics.
    """
    predictions = []
    references = []

    for sample in test_samples:
        input_text = sample["input_text"]
        reference_text = sample["output_text"]

        # Generate text using the fine-tuned model
        generated_text = generate_text(input_text, model, tokenizer)

        predictions.append(generated_text)
        references.append(reference_text)

    # Compute BLEU and ROUGE scores
    bleu_result = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
    rouge_result = rouge.compute(predictions=predictions, references=references)

    return bleu_result, rouge_result

In [18]:
print("Loading and validating test samples...")
test_samples = []
with open("test_utf8.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
    for i in range(0, len(lines), 2):
        input_text = lines[i].strip()
        output_text = lines[i + 1].strip() if i + 1 < len(lines) else ""

        # Skip invalid samples
        if not input_text or not output_text or input_text.lower() == "nan" or output_text.lower() == "nan":
            print(f"Skipping invalid sample: {{'input_text': '{input_text}', 'output_text': '{output_text}'}}")
            continue

        test_samples.append({"input_text": input_text, "output_text": output_text})

if not test_samples:
    raise ValueError("No valid samples found. Check the test dataset format.")

print(f"Loaded {len(test_samples)} valid test samples.")


# Calculate perplexity
print("Calculating Perplexity...")
perplexity = calculate_perplexity(model, tokenizer, test_samples)
print(f"Perplexity: {perplexity}")

# Evaluate BLEU and ROUGE
print("Evaluating BLEU and ROUGE scores...")
bleu_result, rouge_result = evaluate_metrics(model, tokenizer, test_samples)

# Print final performance metrics
print("\nPerformance Metrics:")
print(f"Perplexity: {perplexity}")
print(f"BLEU Score: {bleu_result['bleu']}")
print(f"ROUGE Scores: {rouge_result}")


Loading and validating test samples...
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text': 'nan', 'output_text': ''}
Skipping invalid sample: {'input_text

ValueError: No valid samples found. Check the test dataset format.