In [2]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Config, Trainer, TrainingArguments
import torch
from transformers import GPT2Model

# Load your dataset
train_data = pd.read_csv("train.csv")
val_data = pd.read_csv("val.csv")

# Preprocess your data
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add a padding token if the tokenizer does not have one
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the data
train_encodings = tokenizer.batch_encode_plus(train_data["text"].tolist(),
                                              add_special_tokens=True,
                                              max_length=512,
                                              padding=True,
                                              return_attention_mask=True,
                                              truncation=True)
val_encodings = tokenizer.batch_encode_plus(val_data["text"].tolist(),
                                            add_special_tokens=True,
                                            max_length=512,
                                            padding=True,
                                            return_attention_mask=True,
                                            truncation=True)

# Map labels to numerical values
label_mapping = {"case_0": 0, "case_1": 1, "case_2": 2}
train_data["label"] = train_data["label"].map(label_mapping)
val_data["label"] = val_data["label"].map(label_mapping)

# Define the modified GPT-2 model
class GPT2ClassificationModel(GPT2Model):
    def __init__(self, config):
        super().__init__(config)
        self.classification_head = torch.nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = super().forward(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # Take the last hidden state
        logits = self.classification_head(last_hidden_state[:, 0, :])  # CLS token for classification
        
        loss = None
        if labels is not None:
            loss_fn = torch.nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

# Initialize GPT2Config with number of labels
config = GPT2Config.from_pretrained("gpt2", num_labels=len(label_mapping))
model = GPT2ClassificationModel(config)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100
)

# Create a custom dataset class to handle your data
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the datasets
train_dataset = MyDataset(train_encodings, train_data["label"].values)
val_dataset = MyDataset(val_encodings, val_data["label"].values)

# Fine-tune the model using the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Start training
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()
print(eval_result)

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
1,No log,1.317934
2,No log,1.196254
3,No log,1.134657


{'eval_loss': 1.1346566677093506, 'eval_runtime': 0.1755, 'eval_samples_per_second': 119.637, 'eval_steps_per_second': 5.697, 'epoch': 3.0}


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./fine_tuned_model")
tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_model")

# Add a padding token if not already added
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Set the model to evaluation mode
model.eval()

# Generate text based on a user input
def generate_text(input_text, max_length=100, num_return_sequences=1):
    # Encode the input text
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    
    # Generate output sequences
    with torch.no_grad():
        generated_outputs = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=True,  # Use sampling for more varied results
            top_k=50,  # Controls diversity (choose the top 50 tokens at each step)
            top_p=0.95,  # Controls diversity (nucleus sampling)
            temperature=0.7  # Controls randomness (lower is less random)
        )

    # Decode the generated outputs
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in generated_outputs]
    
    return generated_texts

# Example usage
input_prompt = "Generate test case for Article 1,2,3 in tabular format"
generated_texts = generate_text(input_prompt, max_length=100, num_return_sequences=3)

for i, text in enumerate(generated_texts):
    print(f"Generated Text {i + 1}:\n{text}\n")


Generated Text 1:
Generate test case for Article 1,2,3 in tabular format ridge–∞ Lists McMasterSteveProgram RouhaniONG reconnaissanceProgram McMaster 1949 ridge")) tid circulated reconnaissance reconciliation butt stageachable butt pensionOrange Eastern butt apologise<? McMaster 1949 apologise pensionProgram adhere Maple ballpark collectionEastern Reborn ridgeProgram reconnaissanceEastern butt McMaster hideousereoOrange")) Caval coffersuddinEasternabethMetaulators circulated hideous spies reconnaissance pumping Maple stageMetaereo collectionulatorsabeth Reborn McMasterzl Mapleachable tossONG circulated Dominican coffers collection apologise–∞–∞ Sec heels–∞

Generated Text 2:
Generate test case for Article 1,2,3 in tabular formatulators Dominican collection ballpark Negulators")) adhere AudioMeta ballpark McMaster apologise ridge apologiseuddin ridge Dominican"))–∞ coffers ShankRevolutionEasternFIELDEastern Caval Maple reconnaissance slowing Audio Neg McMaster collection spies butt McMa

In [4]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./fine_tuned_model")
tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_model")

# Ensure padding token is added
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Set the model to evaluation mode
model.eval()

# Function to generate a test case based on an article number
def generate_test_case(article_number, max_length=100):
    # Prepare the input prompt with a structured format
    input_prompt = f"Article {article_number}, Title: [Generated Title], Text: [Generated Text], Label: [case_X]"
    
    # Encode the input text
    input_ids = tokenizer.encode(input_prompt, return_tensors='pt')
    
    # Generate output sequence
    with torch.no_grad():
        generated_output = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=True,  # Enable sampling for varied results
            top_k=50,
            top_p=0.95,
            temperature=0.7
        )

    # Decode the generated output
    generated_text = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    
    return generated_text

# Example usage
article_number = 80
generated_case = generate_test_case(article_number, max_length=150)

print(f"Generated Test Case for Article {article_number}:\n{generated_case}")



Generated Test Case for Article 80:
Article 80, Title: [Generated Title], Text: [Generated Text], Label: [case_X] Margaret song pension Listsabeth Maple liftachable Dominican McMasterScreenshotulatorszl AudioClark Maple Reborn commanders adhere Lists 1949achable Neg conceptual McMaster Dominican adhereuddinottesvilleabeth Lists"))–∞ Audio reconnaissance Neg hideousuddin backbone Negulators 1949 hideous AudioRelated ballpark explosivesuddin DATA ballparkulatorsulators Rouhani SecScreenshot slowing–∞uddin McMaster Reborn 1949 reconciliation 1949 pumpingScreenshot"))achableespecially")) coffers tossProgram reconnaissance apologise RebornuddinClarkgun ListsScreenshotulatorsScreenshot Audio adhere adhere toss McMaster Audio apologiseProgramOrange Neg apologise Audio McMaster Maple reconciliation ballpark spiesuddinabeth coffersEasternRevolution pharmaceuticalachablepid Reborn ballparkentin AudioEasternespecially Dominican Reborn buttentin<? SecOrangeespeciallyOrange adhere reconnaissance Ma

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch

# Define model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add padding token to tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Sample training data
train_data = [
    {
        'article_text': "This is the text of article 80.",
        'test_case_output': "Test_001 Case_1 TX_1 TX_2 Case_2 TX_3"
    },
    {
        'article_text': "This is the text of article 81.",
        'test_case_output': "Test_002 Case_1 TX_4 TX_5"
    }
]

# Preprocess data (tokenize text and labels)
def preprocess_data(data, tokenizer):
    inputs = tokenizer([item['article_text'] for item in data], return_tensors='pt', padding=True, truncation=True, max_length=512)
    labels = tokenizer([item['test_case_output'] for item in data], return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs['labels'] = labels['input_ids']
    return inputs

train_inputs = preprocess_data(train_data, tokenizer)

# Create custom dataset class
class ArticleDataset(torch.utils.data.Dataset):
    def __init__(self, inputs):
        self.inputs = inputs
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}
    
    def __len__(self):
        return len(self.inputs['input_ids'])

# Instantiate dataset
train_dataset = ArticleDataset(train_inputs)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    report_to='none'  # to suppress logging output
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Train the model
trainer.train()

# Example: Generate new test cases for an unseen article
def generate_test_cases(article_text, tokenizer, model):
    inputs = tokenizer(article_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    output = model.generate(**inputs, max_length=100)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test generation for a new article
new_article_text = "This is the text of article 101."
generated_test_case = generate_test_cases(new_article_text, tokenizer, model)
print(generated_test_case)


  return {key: torch.tensor(val[idx]) for key, val in self.inputs.items()}


ValueError: Expected input batch_size (14) to match target batch_size (34).

In [6]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Custom Dataset Class
class ArticleDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        input_text = f"Title: {item['title']}. Text: {item['text']}"
        label_text = f"Generate test cases for {item['label']}."

        # Tokenize input and label
        input_encoding = self.tokenizer(input_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")
        label_encoding = self.tokenizer(label_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")

        # Remove batch dimension
        input_ids = input_encoding['input_ids'].squeeze()
        attention_mask = input_encoding['attention_mask'].squeeze()
        labels = label_encoding['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Example dataset (replace with your CSV reading code)
import pandas as pd
data = pd.DataFrame({
    "article_id": [80, 81, 82],
    "title": ["Article 80", "Article 81", "Article 82"],
    "text": ["This is the text of article 80.", "This is the text of article 81.", "This is the text of article 82."],
    "label": ["case_2", "case_0", "case_1"]
})

# Create dataset and dataloader
train_dataset = ArticleDataset(data, tokenizer)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./t5_results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=3
)

# Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,  # Use if you have validation data
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./t5_fine_tuned_model')
tokenizer.save_pretrained('./t5_fine_tuned_model')

# Generate new test cases based on an article
def generate_test_case(article_title, article_text):
    input_text = f"Title: {article_title}. Text: {article_text}"
    input_encoding = tokenizer.encode(input_text, return_tensors='pt')

    # Generate test case
    output_sequences = model.generate(input_encoding, max_length=50, num_return_sequences=1)
    
    # Decode generated text
    generated_test_case = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    
    return generated_test_case

# Example of generating a new test case
article_title = "Article 101"
article_text = "This is the text of article 101."
generated_test_case = generate_test_case(article_title, article_text)
print(f"Generated Test Case: {generated_test_case}")




Epoch,Training Loss,Validation Loss


ValueError: Trainer: evaluation requires an eval_dataset.

In [26]:
pip install sentencepiece


Collecting sentencepiece
  Obtaining dependency information for sentencepiece from https://files.pythonhosted.org/packages/0f/35/e63ba28062af0a3d688a9f128e407a1a2608544b2f480cb49bf7f4b1cbb9/sentencepiece-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata
  Downloading sentencepiece-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl (1.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.2/1.2 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.


#. Working code

In [7]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Custom Dataset Class
class ArticleDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        input_text = f"Title: {item['title']}. Text: {item['text']}"
        label_text = f"Generate test cases for {item['label']}."

        # Tokenize input and label
        input_encoding = self.tokenizer(input_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")
        label_encoding = self.tokenizer(label_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")

        # Remove batch dimension
        input_ids = input_encoding['input_ids'].squeeze()
        attention_mask = input_encoding['attention_mask'].squeeze()
        labels = label_encoding['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Example dataset
num_baselines = 50
min_articles_per_baseline = 20
max_articles_per_baseline = 30

# Initialize lists to hold the data
article_ids = []
titles = []
texts = []
labels = []
baseline_ids = []

# Generate data
article_counter = 1

for baseline_id in range(1, num_baselines + 1):
    num_articles = np.random.randint(min_articles_per_baseline, max_articles_per_baseline + 1)
    for _ in range(num_articles):
        article_id = article_counter
        title = f"Article {article_id}"
        text = f"This is the text of article {article_id}."
        label = f"case_{article_id % 4}"  # Alternating between case_0, case_1, case_2, case_3
        article_ids.append(article_id)
        titles.append(title)
        texts.append(text)
        labels.append(label)
        baseline_ids.append(baseline_id)
        article_counter += 1

# Create DataFrame
df = pd.DataFrame({
    "article_id": article_ids,
    "title": titles,
    "text": texts,
    "label": labels,
    "baselineId": baseline_ids
})
print(df.info())
# Split the data into train and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Create train and validation datasets
train_dataset = ArticleDataset(train_data, tokenizer)
val_dataset = ArticleDataset(val_data, tokenizer)

# Set up the training arguments with evaluation
training_args = TrainingArguments(
    output_dir="./t5_results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=3
)

# Trainer instance with eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset  # Add the evaluation dataset
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./t5_fine_tuned_model')
tokenizer.save_pretrained('./t5_fine_tuned_model')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1271 entries, 0 to 1270
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   article_id  1271 non-null   int64 
 1   title       1271 non-null   object
 2   text        1271 non-null   object
 3   label       1271 non-null   object
 4   baselineId  1271 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 49.8+ KB
None




Epoch,Training Loss,Validation Loss
1,No log,11.809469
2,No log,10.931377
3,No log,10.45772


('./t5_fine_tuned_model/tokenizer_config.json',
 './t5_fine_tuned_model/special_tokens_map.json',
 './t5_fine_tuned_model/spiece.model',
 './t5_fine_tuned_model/added_tokens.json')

In [8]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json

# Load the fine-tuned model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('./t5_fine_tuned_model')
model = T5ForConditionalGeneration.from_pretrained('./t5_fine_tuned_model')

def generate_test_cases_structure(num_tests, num_cases_per_test, article_id, article_title, article_text, label_sequence):
    # List to store the generated test cases in structured format
    tests = []
    label_sequence_length = len(label_sequence)
    
    for test_num in range(num_tests):
        # Create a unique test name
        test_name = f"Article_Test_{article_id }"
        case_list = []
        
        for case_num in range(num_cases_per_test):
            # Generate input text for the model
            input_text = f"Generate a test case for the following article. Title: {article_title}. Text: {article_text}."
            
            # Tokenize the input
            input_encoding = tokenizer.encode(input_text, return_tensors='pt')
            
            # Generate the output using the model
            with torch.no_grad():
                output_sequences = model.generate(
                    input_encoding,
                    max_length=150,
                    num_return_sequences=1,
                    num_beams=5,
                    early_stopping=True
                )
            
            # Decode the generated text
            generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
            
            # Cycle through the label sequence
            label = label_sequence[case_num % label_sequence_length]
            
            # Structure the generated test case and label
            generated_test_case = {
                "article_id": article_id,
                "title": article_title,
                "text": article_text,
                "generated_test_case": generated_text,
                "label": label
            }
            
            # Append the structured case to the case list
            case_list.append(generated_test_case)
        
        # Append the structured test to the tests list
        tests.append({
            "testName": test_name,
            "caseList": case_list
        })
    
    return tests

# Example usage
article_id = 101
article_title = "Article 101"
article_text = "This is the text of article 101."
num_tests = 5  # Number of tests to generate
num_cases_per_test = 10  # Number of cases per test

# Define a repeating sequence of case labels
label_sequence = ["case_0", "case_1", "case_2", "case_3"]

# Generate structured test cases
generated_tests = generate_test_cases_structure(num_tests, num_cases_per_test, article_id, article_title, article_text, label_sequence)

# Convert the list of structured tests to JSON format
generated_tests_json = json.dumps(generated_tests, indent=4)

# Print the JSON array output
print(f"Generated Test Cases as JSON Array:\n{generated_tests_json}")


Generated Test Cases as JSON Array:
[
    {
        "testName": "Article_Test_101",
        "caseList": [
            {
                "article_id": 101,
                "title": "Article 101",
                "text": "This is the text of article 101.",
                "generated_test_case": "Text: This is the text of article 101..",
                "label": "case_0"
            },
            {
                "article_id": 101,
                "title": "Article 101",
                "text": "This is the text of article 101.",
                "generated_test_case": "Text: This is the text of article 101..",
                "label": "case_1"
            },
            {
                "article_id": 101,
                "title": "Article 101",
                "text": "This is the text of article 101.",
                "generated_test_case": "Text: This is the text of article 101..",
                "label": "case_2"
            },
            {
                "article_id": 101,
     

In [9]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json

# Load the fine-tuned model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('./t5_fine_tuned_model')
model = T5ForConditionalGeneration.from_pretrained('./t5_fine_tuned_model')

def determine_num_tests_and_cases(article_text):
    # Automatic calculation of the number of tests and cases
    num_sentences = len(article_text.split('.'))
    num_tests = max(1, num_sentences // 3)  # Example: One test per every 3 sentences
    word_count = len(article_text.split())
    num_cases_per_test = max(1, word_count // 50)  # Example: One case per every 50 words
    return num_tests, num_cases_per_test

def generate_test_cases_structure(article_id, article_title, article_text, label_sequence):
    num_tests, num_cases_per_test = determine_num_tests_and_cases(article_text)
    tests = []
    label_sequence_length = len(label_sequence)
    
    for test_num in range(num_tests):
        test_name = f"Article_Test_{article_id}_{test_num + 1}"
        case_list = []
        
        for case_num in range(num_cases_per_test):
            input_text = f"Generate a test case for the following article. Title: {article_title}. Text: {article_text}."
            
            input_encoding = tokenizer.encode(input_text, return_tensors='pt')
            
            with torch.no_grad():
                # Generate multiple sequences in a loop to ensure multiple cases
                for _ in range(1):  # Adjust the range if you need more sequences per case
                    output_sequences = model.generate(
                        input_encoding,
                        max_length=150,
                        num_return_sequences=1,  # Generate one sequence per case
                        num_beams=5,
                        early_stopping=True
                    )
                    
                    generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
                    
                    label = label_sequence[case_num % label_sequence_length]
                    
                    generated_test_case = {
                        "article_id": article_id,
                        "title": article_title,
                        "text": article_text,
                        "generated_test_case": generated_text,
                        "label": label
                    }
                    
                    case_list.append(generated_test_case)
        
        tests.append({
            "testName": test_name,
            "caseList": case_list
        })
    
    return tests

# Example usage
article_id = 101
article_title = "Article 101"
article_text = "This is the text of article 101. It contains multiple sentences. Let's see how many tests and cases are generated automatically."
label_sequence = ["case_0", "case_1", "case_2", "case_3"]

# Generate structured test cases
generated_tests = generate_test_cases_structure(article_id, article_title, article_text, label_sequence)

# Convert the list of structured tests to JSON format
generated_tests_json = json.dumps(generated_tests, indent=4)

# Print the JSON array output
print(f"Generated Test Cases as JSON Array:\n{generated_tests_json}")


Generated Test Cases as JSON Array:
[
    {
        "testName": "Article_Test_101_1",
        "caseList": [
            {
                "article_id": 101,
                "title": "Article 101",
                "text": "This is the text of article 101. It contains multiple sentences. Let's see how many tests and cases are generated automatically.",
                "generated_test_case": "a test case. Title: Article 101. Text: This is the text of article 101. It contains multiple sentences. Let's see how many tests and cases are generated automatically.",
                "label": "case_0"
            }
        ]
    }
]


In [8]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Custom Dataset Class
class ArticleDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        input_text = f"Title: {item['title']}. Text: {item['text']}"
        label_text = f"Generate test cases for {item['label']}."

        # Tokenize input and label
        input_encoding = self.tokenizer(input_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")
        label_encoding = self.tokenizer(label_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")

        # Remove batch dimension
        input_ids = input_encoding['input_ids'].squeeze()
        attention_mask = input_encoding['attention_mask'].squeeze()
        labels = label_encoding['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Generate data
num_baselines = 5
min_articles_per_baseline = 5
max_articles_per_baseline = 10

# Initialize lists to hold the data
article_ids = []
titles = []
texts = []
labels = []
baseline_ids = []

article_counter = 1

for baseline_id in range(1, num_baselines + 1):
    num_articles = np.random.randint(min_articles_per_baseline, max_articles_per_baseline + 1)
    for _ in range(num_articles):
        article_id = article_counter
        title = f"Article {article_id}"
        text = f"This is the text of article {article_id}."
        label = f"case_{article_id % 4}"  # Alternating between case_0, case_1, case_2, case_3
        article_ids.append(article_id)
        titles.append(title)
        texts.append(text)
        labels.append(label)
        baseline_ids.append(baseline_id)
        article_counter += 1

# Create DataFrame
df = pd.DataFrame({
    "article_id": article_ids,
    "title": titles,
    "text": texts,
    "label": labels,
    "baselineId": baseline_ids
})
print(df.info())

# Split the data into train and validation sets
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# Create train and validation datasets
train_dataset = ArticleDataset(train_data, tokenizer)
val_dataset = ArticleDataset(val_data, tokenizer)

# Set up the training arguments with evaluation
training_args = TrainingArguments(
    output_dir="./t5_results",
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Increased batch size
    per_device_eval_batch_size=4,   # Increased batch size
    eval_strategy="epoch",  # Updated from evaluation_strategy
    save_strategy="epoch",
    logging_steps=20,  # Adjust logging frequency
    learning_rate=3e-5,  # Adjusted learning rate
    weight_decay=0.01,
    save_total_limit=3,
)

# Trainer instance with eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset  # Add the evaluation dataset
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./t5_fine_tuned_model')
tokenizer.save_pretrained('./t5_fine_tuned_model')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   article_id  42 non-null     int64 
 1   title       42 non-null     object
 2   text        42 non-null     object
 3   label       42 non-null     object
 4   baselineId  42 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 1.8+ KB
None


Epoch,Training Loss,Validation Loss
1,No log,5.172542
2,No log,2.153721
3,9.095400,1.28566


('./t5_fine_tuned_model/tokenizer_config.json',
 './t5_fine_tuned_model/special_tokens_map.json',
 './t5_fine_tuned_model/spiece.model',
 './t5_fine_tuned_model/added_tokens.json')

In [10]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json

# Load the fine-tuned model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('./t5_fine_tuned_model')
model = T5ForConditionalGeneration.from_pretrained('./t5_fine_tuned_model')

def generate_test_cases(article_id, article_title, article_text, label_sequence, num_tests, num_cases_per_test):
    tests = []
    label_sequence_length = len(label_sequence)
    
    for test_num in range(num_tests):
        test_name = f"Article_Test{article_id}_{test_num+1}"
        case_list = []
        
        for case_num in range(num_cases_per_test):
            # Generate input text for the model
            input_text = f"Generate a test case for the following article. Title: {article_title}. Text: {article_text}."
            
            # Tokenize the input
            input_encoding = tokenizer.encode(input_text, return_tensors='pt')
            
            # Generate the output using the model
            with torch.no_grad():
                output_sequences = model.generate(
                    input_encoding,
                    max_length=150,
                    num_return_sequences=1,
                    num_beams=5,
                    early_stopping=True
                )
            
            # Decode the generated text
            generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
            
            # Cycle through the label sequence
            label = label_sequence[case_num % label_sequence_length]
            
            # Structure the generated test case and label
            generated_test_case = {
                "article_id": article_id,
                "title": article_title,
                "text": article_text,
                "generated_test_case": generated_text,
                "label": label
            }
            
            # Append the structured case to the case list
            case_list.append(generated_test_case)
        
        # Append the structured test to the tests list
        tests.append({
            "testName": test_name,
            "caseList": case_list
        })
    
    return tests

# Example usage
article_id = 101
article_title = "Article 101"
article_text = "This is the text of article 101. It contains multiple sentences. Let's see how many tests and cases are generated automatically."
label_sequence = ["case_0", "case_1", "case_2", "case_3"]
num_tests = 3  # Number of tests to generate
num_cases_per_test = 5  # Number of cases per test

# Generate structured test cases
generated_tests = generate_test_cases(article_id, article_title, article_text, label_sequence, num_tests, num_cases_per_test)

# Convert the list of structured tests to JSON format
generated_tests_json = json.dumps(generated_tests, indent=4)

# Print the JSON array output
print(f"Generated Test Cases as JSON Array:\n{generated_tests_json}")


Generated Test Cases as JSON Array:
[
    {
        "testName": "Article_Test101_1",
        "caseList": [
            {
                "article_id": 101,
                "title": "Article 101",
                "text": "This is the text of article 101. It contains multiple sentences. Let's see how many tests and cases are generated automatically.",
                "generated_test_case": "a test case. Title: Article 101. Text: This is the text of article 101. It contains multiple sentences. Let's see how many tests and cases are generated automatically.",
                "label": "case_0"
            },
            {
                "article_id": 101,
                "title": "Article 101",
                "text": "This is the text of article 101. It contains multiple sentences. Let's see how many tests and cases are generated automatically.",
                "generated_test_case": "a test case. Title: Article 101. Text: This is the text of article 101. It contains multiple sentences. 

In [12]:
pip install joblib


Note: you may need to restart the kernel to use updated packages.


In [16]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Data generation
num_baselines = 5
min_articles_per_baseline = 20
max_articles_per_baseline = 30

# Initialize lists to hold the data
article_ids = []
titles = []
texts = []
labels = []
baseline_ids = []

article_counter = 1

for baseline_id in range(1, num_baselines + 1):
    num_articles = np.random.randint(min_articles_per_baseline, max_articles_per_baseline + 1)
    for _ in range(num_articles):
        article_id = f"{baseline_id}.{article_counter % 10}"  # Using string format
        title = f"Article {article_id}"
        text = f"This is the text of article {article_id}."
        label = f"case_{article_counter % 4}"  # Alternating between case_0, case_1, case_2, case_3
        article_ids.append(article_id)
        titles.append(title)
        texts.append(text)
        labels.append(label)
        baseline_ids.append(baseline_id)
        article_counter += 1

df = pd.DataFrame({
    "article_id": article_ids,
    "title": titles,
    "text": texts,
    "label": labels,
    "baselineId": baseline_ids
})
print(df.head(100))

# Label processing
df['num_cases'] = df['label'].apply(lambda x: int(x.split('_')[1]))  # Example: case_0 -> 0

# Encoding for article_id and title
# Convert article_id to one-hot encoding
one_hot_encoder = OneHotEncoder(sparse=False)
article_id_encoded = one_hot_encoder.fit_transform(df[['title']])

# Feature extraction
vectorizer = CountVectorizer()
text_features = vectorizer.fit_transform(df['text']).toarray()

# Combine features
X = np.hstack((article_id_encoded, text_features))
y = df['num_cases']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Set: {mse}")

# Save the model and vectorizer
# joblib.dump(model, 'linear_regression_model.pkl')
# joblib.dump(vectorizer, 'vectorizer.pkl')
# joblib.dump(one_hot_encoder, 'one_hot_encoder.pkl')

# Load the model and vectorizer for prediction
# model = joblib.load('linear_regression_model.pkl')
# vectorizer = joblib.load('vectorizer.pkl')
# one_hot_encoder = joblib.load('one_hot_encoder.pkl')

# New baseline and articles
new_articles = [
    {"article_id": "1.1", "title": "Article 1.1", "text": "This is the text of article 1.1."},
    {"article_id": "2.11", "title": "Article 2.11", "text": "This is the text of article 2.11."},
    {"article_id": "2.3", "title": "Article 2.3", "text": "This is the text of article 2.3."},
]

new_df = pd.DataFrame(new_articles)

# Predict cases
# Transform article_id and text features
new_article_id_encoded = one_hot_encoder.transform(new_df[['title']])
new_text_features = vectorizer.transform(new_df['text']).toarray()

# Combine features
new_features = np.hstack((new_article_id_encoded, new_text_features))
predicted_cases = model.predict(new_features)

# Add predictions to DataFrame
new_df['predicted_num_cases'] = np.clip(np.round(predicted_cases).astype(int), 0, None)

# Display results
print(new_df)


   article_id        title                              text   label  \
0         1.1  Article 1.1  This is the text of article 1.1.  case_1   
1         1.2  Article 1.2  This is the text of article 1.2.  case_2   
2         1.3  Article 1.3  This is the text of article 1.3.  case_3   
3         1.4  Article 1.4  This is the text of article 1.4.  case_0   
4         1.5  Article 1.5  This is the text of article 1.5.  case_1   
..        ...          ...                               ...     ...   
95        4.6  Article 4.6  This is the text of article 4.6.  case_0   
96        4.7  Article 4.7  This is the text of article 4.7.  case_1   
97        5.8  Article 5.8  This is the text of article 5.8.  case_2   
98        5.9  Article 5.9  This is the text of article 5.9.  case_3   
99        5.0  Article 5.0  This is the text of article 5.0.  case_0   

    baselineId  
0            1  
1            1  
2            1  
3            1  
4            1  
..         ...  
95           4  



ValueError: Found unknown categories ['Article 2.11'] in column 0 during transform

In [9]:
from transformers import pipeline
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",
)

messages = [
    {"role": "user", "content": "Who are you? Please, answer in pirate-speak."},
]
outputs = pipe(
    messages,
    max_new_tokens=256,
    do_sample=False,
)
assistant_response = outputs[0]["generated_text"][-1]["content"]
print(assistant_response)
# Arrrr, me hearty! Yer lookin' fer a bit o' information about meself, eh? Alright then, matey! I be a language-generatin' swashbuckler, a digital buccaneer with a penchant fer spinnin' words into gold doubloons o' knowledge! Me name be... (dramatic pause)...Assistant! Aye, that be me name, and I be here to help ye navigate the seven seas o' questions and find the hidden treasure o' answers! So hoist the sails and set course fer adventure, me hearty! What be yer first question?


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct.
401 Client Error. (Request ID: Root=1-66e0df2f-34db956c04c9bba971f416b8;36440f7e-54d5-4e8a-a50e-52a9b598a0af)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in.