In [10]:
import torch
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.device_count())  # Should return the number of GPUs
print(torch.cuda.get_device_name(0))  # Should show the GPU model

True
1
NVIDIA RTX 5000 Ada Generation


In [11]:
import transformers
import torch
import os
import json
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
import logging
import time
from datetime import timedelta, datetime
import pandas as pd
from dotenv import load_dotenv
import shutil 

import evaluate
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load environment variables
load_dotenv(dotenv_path="../../.env") # path is relative to this script, adjust as needed

True

In [12]:
# setting huggingface token
login(token=os.getenv("HUGGINGFACE_TOKEN"))

os.environ["HF_HOME"] = "D:/huggingface_cache" 
os.environ["TRANSFORMERS_CACHE"] = "D:/huggingface_cache"
os.environ["HUGGINGFACE_HUB_CACHE"] = "D:/huggingface_cache"

print("HF_HOME:", os.getenv("HF_HOME"))
print("TRANSFORMERS_CACHE:", os.getenv("TRANSFORMERS_CACHE"))
print("HUGGINGFACE_HUB_CACHE:", os.getenv("HUGGINGFACE_HUB_CACHE"))

transformers.utils.hub.TRANSFORMERS_CACHE = "D:/huggingface_cache"

HF_HOME: D:/huggingface_cache
TRANSFORMERS_CACHE: D:/huggingface_cache
HUGGINGFACE_HUB_CACHE: D:/huggingface_cache


In [13]:
model_name = "meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="cuda")


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
chunk_sizes = [128, 256, 512, 1024]
questions_num = 2
max_token_list = [128,256,512,1024,2048]


In [15]:
results_df = pd.DataFrame(columns=[
    "chunk_size", "questions_num", "qa_count_mismatch", "total_questions", "token_Size",
    "total_chunks", "success_count", "fail_count",
    "elapsed_time"
])

In [16]:
# Checkpointing to avoid re-running completed configurations
checkpoint_path = "checkpoint.csv"

# Load existing checkpoint if it exists
if os.path.exists(checkpoint_path):
    completed_runs = pd.read_csv(checkpoint_path)
    # Ensure column names are correct
    completed_runs.columns = ["chunk_size", "max_tokens"]
    completed_set = set(zip(
        completed_runs["chunk_size"].astype(int),
        completed_runs["max_tokens"].astype(int)
    ))
else:
    completed_set = set()
    pd.DataFrame(columns=["chunk_size", "max_tokens"]).to_csv(checkpoint_path, index=False)


In [17]:
def power_analysis(chunk_size, max_tokens, qa_results,substring_date):
    """
    Perform power analysis based on the provided parameters for the current run.
    """
    
    # https://huggingface.co/spaces/evaluate-metric/bertscore
    # https://huggingface.co/tasks/sentence-similarity
    # 1 Metric: ROUGE
    rouge = evaluate.load("rouge")

    originals = []
    generations = []

    for doc in qa_results.values():
        for item in doc:
            chunk = item["chunk"]
            qa = item["qa_pairs"]
            for pair in qa:
                originals.append(chunk)  # reference
                generations.append(pair["answer"])  # model-generated answer

    scores = rouge.compute(predictions=generations, references=originals)
    print(f"ROUGE Scores: {scores}")

    # 2 Metric: BERTScore
    bertscore = evaluate.load("bertscore")
    bert_scores = bertscore.compute(predictions=generations, references=originals, model_type="bert-base-uncased", lang="en")
    P = bert_scores["precision"]
    R = bert_scores["recall"]
    F1 = bert_scores["f1"] 

    print(f"BERTScore: {bert_scores}")

    # 3 Metric: STS (Semantic Textual Similarity)
    sts_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    original_embeddings = sts_model.encode(originals, convert_to_tensor=True) 
    generated_embeddings = sts_model.encode(generations, convert_to_tensor=True)
    sts_scores = util.pytorch_cos_sim(original_embeddings, generated_embeddings).diagonal().cpu().tolist()

    print(f"STS Scores: {sts_scores}")

    # save the scores to a CSV file
    scores_df = pd.DataFrame({
        "chunk_size": [chunk_size],
        "max_tokens": [max_tokens],
        "questions_num": [questions_num],
        "rouge1": [scores["rouge1"]],
        "rouge2": [scores["rouge2"]],
        "rougeL": [scores["rougeL"]],
        "rougeLsum": [scores["rougeLsum"]],
        "bert_score_P": [np.mean(P)],
        "bert_score_R": [np.mean(R)],
        "bert_score_F1": [np.mean(F1)],
        "sts_score": [np.mean(sts_scores)]
    })
    
    print("Scores saved to scores.csv")   
    return scores_df

# Convert logs to Panda


In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

substring_date = datetime.now().strftime("%Y%m%d") 
# delete existing folder if you want to rerun the script from scratch uncomment the following lines

if os.path.exists(f"Generated_Results/LLAMA3_1/{substring_date}"):
    shutil.rmtree(f"Generated_Results/LLAMA3_1/{substring_date}")

# deleting existing checkpoint file if you want to rerun the script from scratch uncomment the following lines
if os.path.exists(checkpoint_path):
    os.remove(checkpoint_path)
    
for chunk_size in chunk_sizes:
    json_file_path = f"../Yaman/Generate_Paragraphs/Results/extracted_chunks_{chunk_size}_overlap.json"

    if not os.path.exists(json_file_path):
        print(f"Missing input file: {json_file_path}, skipping.")
        continue

    with open(json_file_path, "r", encoding="utf-8") as file:
        chunk_data = json.load(file)


    for max_tokens in max_token_list:
        if (chunk_size, max_tokens) in completed_set:
            print(f"Skipping completed run: Chunk={chunk_size}, Max Tokens={max_tokens}")
            continue

        output_file_path = f"Generated_Results/LLAMA3_1/{substring_date}/generation_log_{chunk_size}_Token_{max_tokens}_Q{questions_num}.json"
        if os.path.exists(output_file_path):
            print(f"Output already exists: {output_file_path}, skipping.")
            continue

        print(f"Starting new run: Chunk={chunk_size}, Max Tokens={max_tokens}")

        qa_results = {}
        total_chunks = 0
        success_count = 0
        fail_count = 0
        total_questions = 0
        qa_count_mismatch = 0

        start_time = time.time()

        for doc_name, chunks in chunk_data.items():
            qa_results[doc_name] = []

            for chunk in chunks[:20]:  # Limit to first 5 chunks for testing
                total_chunks += 1

                prompt = f"""
Generate {questions_num} question-answer pairs based on the following text segment. 
Return the result in valid JSON format as a list of objects.

Text Segment:

{chunk}

Response Format:
[
    {{"question": "generated question", "answer": "generated Answer"}},
]

Question answers should be at least 250 words long.

Do NOT include any explanation or preamble before or after the JSON output.
Return ONLY valid JSON output.

Answer:
                """

                inputs = tokenizer(prompt, return_tensors="pt").to(device)

                try:
                    max_context = model.config.max_position_embeddings
                    input_len = inputs['input_ids'].shape[1]
                    if input_len + max_tokens > max_context:
                        print(f"Skipping chunk (too long): input_len={input_len}")
                        continue

                    with torch.no_grad():
                        output_tokens = model.generate(**inputs, max_new_tokens=max_tokens,pad_token_id=tokenizer.eos_token_id)

                    generated_tokens = output_tokens[0][len(inputs["input_ids"][0]):]
                    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

                    try:
                        qa_pairs = json.loads(generated_text)
                        if isinstance(qa_pairs, list):
                            qa_results[doc_name].append({
                                "chunk": chunk,
                                "qa_pairs": qa_pairs
                            })
                            success_count += 1
                            total_questions += len(qa_pairs)

                            if len(qa_pairs) != questions_num:
                                qa_count_mismatch += 1
                        else:
                            fail_count += 1
                    except json.JSONDecodeError:
                        fail_count += 1

                except Exception as e:
                    print(f"Error generating for chunk: {e}")
                    fail_count += 1

        # Save QA Output
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        with open(output_file_path, "w", encoding="utf-8") as out_file:
            json.dump(qa_results, out_file, indent=4, ensure_ascii=False)

        end_time = time.time()
        elapsed_time = timedelta(seconds=end_time - start_time)

        # Save Results and Checkpoint
        results_df.loc[len(results_df)] = [
            chunk_size, questions_num, qa_count_mismatch, total_questions,
            max_tokens, total_chunks, success_count, fail_count, str(elapsed_time)
        ]
        
        csv_output_path = f"Generated_Results/LLAMA3_1/{substring_date}/results_log.csv"
        results_df.to_csv(csv_output_path, index=False)

        pd.DataFrame([[chunk_size, max_tokens]], columns=["chunk_size", "max_tokens"]) \
            .to_csv(checkpoint_path, mode='a', header=False, index=False)

        print(f"✅ Saved: {output_file_path} | Time: {elapsed_time}")
        scores_df = power_analysis(chunk_size, max_tokens, qa_results,substring_date)
        if os.path.exists(f"Generated_Results/LLAMA3_1/{substring_date}/scores/scores.csv"):
            scores_df.to_csv(f"Generated_Results/LLAMA3_1/{substring_date}/scores/scores.csv", mode='a', header=False, index=False)
        else:
            os.makedirs(f"Generated_Results/LLAMA3_1/{substring_date}/scores", exist_ok=True)
            scores_df.to_csv(f"Generated_Results/LLAMA3_1/{substring_date}/scores/scores.csv", index=False)

print("All runs completed.")

Starting new run: Chunk=128, Max Tokens=128
✅ Saved: Generated_Results/LLAMA3_1/20250504/generation_log_128_Token_128_Q2.json | Time: 0:02:31.527918
ROUGE Scores: {'rouge1': np.float64(0.17033842882389946), 'rouge2': np.float64(0.09971623564239199), 'rougeL': np.float64(0.14107639373633002), 'rougeLsum': np.float64(0.15981158661575667)}
BERTScore: {'precision': [0.9393197894096375, 0.4887295365333557, 0.4887295365333557, 0.6685803532600403, 0.4435420632362366, 0.4469687342643738, 0.6816617250442505, 0.9103200435638428, 0.9211437106132507, 0.7594549059867859, 0.7856696248054504, 0.6982488632202148, 0.5444813370704651, 0.41293397545814514, 0.417388379573822, 0.5649611949920654, 0.5464065074920654, 0.7675166130065918, 0.679031252861023, 0.4438522458076477, 0.5413390398025513, 0.6190294027328491, 0.6190294027328491], 'recall': [0.700722873210907, 0.34385281801223755, 0.34385281801223755, 0.5128637552261353, 0.2652764916419983, 0.2541447877883911, 0.33672839403152466, 0.47884076833724976, 0