In [13]:
import pandas as pd

In [14]:
# Load the dataset (use your actual file path here)
df = pd.read_excel('transformed_dataset.xlsx')

In [15]:
# Prepare the dataset in a format suitable for fine-tuning (prompt and response)
train_data = []
for index, row in df.iterrows():
    prompt = row['User input English-in English letters']
    response = row['System output']
    train_data.append({"prompt": prompt, "response": response})

In [16]:
df.head()

Unnamed: 0,Business Name,Domain,User input English-in English letters,System output
0,Green Grow Farms,Agriculture,Generate business report contents for the comp...,Executive Summary: Green Grow Farms is a newly...
1,Holiday Mansion,Tourism and Hospitality,Generate business report contents for the comp...,"\nExecutive Summary: \nHoliday Mansion, a luxu..."
2,ABC Textiles,Manufacturing and Exporting,Generate business report contents for the comp...,\nExecutive Summary\nABC Manufacturing is a le...
3,PrecisionTools Ltd,Manufacturing and Exporting,Generate business report contents for the comp...,\nExecutive Summary\n\nPrecisionTools Ltd spec...
4,Navi,Manufacturing and Exporting,Generate business report contents for the comp...,\nExecutive Summary\nNavi is an emerging cloth...


In [17]:
# Convert to DataFrame
train_df = pd.DataFrame(train_data)

In [18]:
train_df

Unnamed: 0,prompt,response
0,Generate business report contents for the comp...,Executive Summary: Green Grow Farms is a newly...
1,Generate business report contents for the comp...,"\nExecutive Summary: \nHoliday Mansion, a luxu..."
2,Generate business report contents for the comp...,\nExecutive Summary\nABC Manufacturing is a le...
3,Generate business report contents for the comp...,\nExecutive Summary\n\nPrecisionTools Ltd spec...
4,Generate business report contents for the comp...,\nExecutive Summary\nNavi is an emerging cloth...
...,...,...
95,Generate business report contents for the comp...,\n\nCompany Overview:\n\nLanka Reads Bookshop ...
96,Generate business report contents for the comp...,\n\nCompany Overview\nTechSolve Innovations is...
97,Generate business report contents for the comp...,\n1. Company Overview\nLifeCare Wellness Cente...
98,Generate business report contents for the comp...,\nCompany Overview\nSoloMagic is an establishe...


In [20]:
# Save the fine-tuning data as a JSON file (Hugging Face prefers this format)
train_df.to_json('train_data.json', orient='records', lines=True)

print("Dataset prepared for fine-tuning and saved as 'train_data.json'.")

Dataset prepared for fine-tuning and saved as 'train_data.json'.


In [21]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Load the dataset from the JSON file
train_dataset = load_dataset('json', data_files='train_data.json', split='train')

Generating train split: 100 examples [00:00, 2901.59 examples/s]


In [23]:
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [27]:
# Set pad_token to eos_token (End of Sequence token)
tokenizer.pad_token = tokenizer.eos_token

In [28]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['prompt'], padding="max_length", truncation=True, max_length=512, pad_token_id=tokenizer.eos_token_id)

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)

print("Tokenization complete.")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'pad_token_id': 50256} not recognized.
Keyword arguments {'p

Tokenization complete.





In [31]:
# Prepare the data collator (handles batching and padding for variable-length inputs)
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [32]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./output",          # Output directory
    overwrite_output_dir=True,      # Overwrite the output dir
    num_train_epochs=3,             # Number of training epochs
    per_device_train_batch_size=4,  # Batch size per device
    save_steps=10_000,              # Save checkpoint every 10,000 steps
    save_total_limit=2,             # Keep the latest 2 models
    logging_dir="./logs",           # Directory for logs
)

In [33]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

In [34]:
# Start fine-tuning
trainer.train()


100%|██████████| 75/75 [23:57<00:00, 19.16s/it]

{'train_runtime': 1437.3525, 'train_samples_per_second': 0.209, 'train_steps_per_second': 0.052, 'train_loss': 2.1505721028645834, 'epoch': 3.0}





TrainOutput(global_step=75, training_loss=2.1505721028645834, metrics={'train_runtime': 1437.3525, 'train_samples_per_second': 0.209, 'train_steps_per_second': 0.052, 'total_flos': 78387609600000.0, 'train_loss': 2.1505721028645834, 'epoch': 3.0})

In [35]:
# Save the fine-tuned model
trainer.save_model('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

print("Fine-tuning complete and model saved.")

Fine-tuning complete and model saved.


In [36]:
# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_model')

In [37]:
# Example prompt for testing
prompt = "Generate business report contents for the company 'ABC Corp.' based on the following scope: Our agricultural activities focus on using natural fertilizers, indoor farming, and methods that protect biodiversity. We aim to increase our yield by using modern technology. We estimate our first-year revenue to be $50,000, with an expected annual growth rate of 20%. According to our timeline, the first harvest is expected within 6 months, and full-scale production is planned to begin within a year. Generate the following sections:- Executive Summary - Industry Overview and Trends - Problem Statement - Proposed Solution - Market Analysis - Sustainable Practices - Supply Chain and Distribution - Financial Projections"

In [38]:
# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt")

# Generate a prediction
output = model.generate(inputs['input_ids'], max_length=700, num_return_sequences=1)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [39]:
# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Business Report Contents:")
print(generated_text)

Generated Business Report Contents:
Generate business report contents for the company 'ABC Corp.' based on the following scope: Our agricultural activities focus on using natural fertilizers, indoor farming, and methods that protect biodiversity. We aim to increase our yield by using modern technology. We estimate our first-year revenue to be $50,000, with an expected annual growth rate of 20%. According to our timeline, the first harvest is expected within 6 months, and full-scale production is planned to begin within a year. Generate the following sections:- Executive Summary - Industry Overview and Trends - Problem Statement - Proposed Solution - Market Analysis - Sustainable Practices - Supply Chain and Distribution - Financial Projections - Implementation Timeline - Conclusion
Generate the following sections:- Executive Summary
- Industry Overview and Trends
- Problem Statement
- Proposed Solution
- Supply Chain and Distribution
- Financial Projections
- Implementation Timeline
- 

### Model Evaluation

In [40]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_model')

# Example input for evaluation
prompt = "Generate business report contents for the company 'ABC Corp.' based on the following scope: Our agricultural activities focus on using natural fertilizers, indoor farming, and methods that protect biodiversity. We aim to increase our yield by using modern technology. We estimate our first-year revenue to be $50,000, with an expected annual growth rate of 20%. According to our timeline, the first harvest is expected within 6 months, and full-scale production is planned to begin within a year. Generate the following sections:- Executive Summary - Industry Overview and Trends - Problem Statement - Proposed Solution - Market Analysis - Sustainable Practices - Supply Chain and Distribution - Financial Projections"

# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt")

# Ensure the model is in evaluation mode
model.eval()

# Forward pass to get loss
with torch.no_grad():
    outputs = model(**inputs, labels=inputs['input_ids'])
    loss = outputs.loss

# Compute Perplexity
perplexity = torch.exp(loss)
print(f"Perplexity: {perplexity.item()}")


Perplexity: 5.639610290527344


In [41]:
from nltk.translate.bleu_score import sentence_bleu

# Example reference and generated text
reference = "Executive Summary: Green Grow Farms is a newly established business focusing on sustainable agricultural practices using organic fertilizers and modern indoor farming techniques to maximize crop yield. Industry Overview and Trends: The agricultural industry in Sri Lanka is moving towards more sustainable and eco-friendly farming practices to meet growing local and international demand. Problem Statement: The rising cost of conventional farming inputs and environmental degradation calls for more sustainable farming techniques. Proposed Agricultural Solution: Green Grow Farms provides innovative farming techniques, including organic fertilizers, controlled indoor farming systems, and practices to preserve biodiversity. Market Analysis: The demand for organic products is on the rise, driven by health-conscious consumers both locally and internationally. Sustainable Farming Practices: The farm will use environmentally friendly methods, focusing on organic inputs, water conservation, and crop rotation to ensure long-term soil fertility. Supply Chain and Distribution: Partnerships with local retailers, organic food suppliers, and export markets are planned to distribute produce efficiently. Financial Projections: Estimated revenue in the first year is $50,000, with anticipated growth of 20% per year. Implementation Timeline: The first harvest is expected within 6 months, with full-scale production to follow within a year. Conclusion: Green Grow Farms aims to contribute to the sustainable agricultural landscape in Sri Lanka, offering eco-friendly solutions to modern farming challenges"
generated = "I want to make a business proposal for Harvest Harvest Farms. Harvest Harvest Farms is a new farming operation based in the heart of the city of Lhasa, located in the heart of the country. We are seeking funding to expand our operations and expand our production capabilities. We plan to use organic fertilizers, organic farming techniques, and sustainable practices to enhance our yield and profitability. We will also use organic fertilizers to enhance our soil quality and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil quality and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil quality and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil quality and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion. We will also use organic fertilizers to enhance our soil fertility and reduce soil erosion."

# Tokenize the sentences (split by words)
reference_tokens = reference.split()
generated_tokens = generated.split()

# Compute BLEU score
score = sentence_bleu([reference_tokens], generated_tokens)
print(f"BLEU Score: {score}")


BLEU Score: 5.615242822883281e-79


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
