In [32]:
import torch
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.device_count())  # Should return the number of GPUs
print(torch.cuda.get_device_name(0))  # Should show the GPU model

True
1
NVIDIA RTX 5000 Ada Generation


In [33]:
import transformers
import torch
import os
import json
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
import logging
import time
from datetime import timedelta, datetime
import pandas as pd
from dotenv import load_dotenv
import shutil



# Load environment variables
load_dotenv(dotenv_path="../../.env") # path is relative to this script, adjust as needed

True

In [34]:
# setting huggingface token
login(token=os.getenv("HUGGINGFACE_TOKEN"))

os.environ["HF_HOME"] = "D:/huggingface_cache" 
os.environ["TRANSFORMERS_CACHE"] = "D:/huggingface_cache"
os.environ["HUGGINGFACE_HUB_CACHE"] = "D:/huggingface_cache"

print("HF_HOME:", os.getenv("HF_HOME"))
print("TRANSFORMERS_CACHE:", os.getenv("TRANSFORMERS_CACHE"))
print("HUGGINGFACE_HUB_CACHE:", os.getenv("HUGGINGFACE_HUB_CACHE"))

transformers.utils.hub.TRANSFORMERS_CACHE = "D:/huggingface_cache"

HF_HOME: D:/huggingface_cache
TRANSFORMERS_CACHE: D:/huggingface_cache
HUGGINGFACE_HUB_CACHE: D:/huggingface_cache


In [35]:
model_name = "meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="cuda")


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [36]:
chunk_sizes = [128]
questions_num = 2
max_token_list = [256]


In [37]:
results_df = pd.DataFrame(columns=[
    "chunk_size", "questions_num", "qa_count_mismatch", "total_questions", "token_Size",
    "total_chunks", "success_count", "fail_count",
    "elapsed_time"
])

In [42]:
# Checkpointing to avoid re-running completed configurations
checkpoint_path = "checkpoint.csv"

# Load existing checkpoint if it exists
if os.path.exists(checkpoint_path):
    completed_runs = pd.read_csv(checkpoint_path)
    # Ensure column names are correct
    completed_runs.columns = ["chunk_size", "max_tokens"]
    completed_set = set(zip(
        completed_runs["chunk_size"].astype(int),
        completed_runs["max_tokens"].astype(int)
    ))
else:
    completed_set = set()
    pd.DataFrame(columns=["chunk_size", "max_tokens"]).to_csv(checkpoint_path, index=False)


# Convert logs to Panda


In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

substring_date = datetime.now().strftime("%Y%m%d") 
# delete existing folder if you want to rerun the script from scratch uncomment the following lines

if os.path.exists(f"Generated_Results/LLAMA3_1/{substring_date}"):
    shutil.rmtree(f"Generated_Results/LLAMA3_1/{substring_date}")

# deleting existing checkpoint file if you want to rerun the script from scratch uncomment the following lines
if os.path.exists(checkpoint_path):
    os.remove(checkpoint_path)
    
for chunk_size in chunk_sizes:
    json_file_path = f"../Yaman/Generate_Paragraphs/Results/extracted_chunks_{chunk_size}_overlap.json"

    if not os.path.exists(json_file_path):
        print(f"Missing input file: {json_file_path}, skipping.")
        continue

    with open(json_file_path, "r", encoding="utf-8") as file:
        chunk_data = json.load(file)


    for max_tokens in max_token_list:
        if (chunk_size, max_tokens) in completed_set:
            print(f"Skipping completed run: Chunk={chunk_size}, Max Tokens={max_tokens}")
            continue

        output_file_path = f"Generated_Results/LLAMA3_1/{substring_date}/generation_log_{chunk_size}_Token_{max_tokens}_Q{questions_num}.json"
        if os.path.exists(output_file_path):
            print(f"Output already exists: {output_file_path}, skipping.")
            continue

        print(f"Starting new run: Chunk={chunk_size}, Max Tokens={max_tokens}")

        qa_results = {}
        total_chunks = 0
        success_count = 0
        fail_count = 0
        total_questions = 0
        qa_count_mismatch = 0

        start_time = time.time()

        for doc_name, chunks in chunk_data.items():
            qa_results[doc_name] = []

            for chunk in chunks[:10]:  # Limit to first 5 chunks for testing
                total_chunks += 1

                prompt = f"""
Generate {questions_num} question-answer pairs based on the following text segment. 
Return the result in valid JSON format as a list of objects.

Text Segment:

{chunk}

Response Format:
[
    {{"question": "generated question", "answer": "generated Answer"}},
]

Question answers should be at least 250 words long.

Do NOT include any explanation or preamble before or after the JSON output.
Return ONLY valid JSON output.

Answer:
                """

                inputs = tokenizer(prompt, return_tensors="pt").to(device)

                try:
                    max_context = model.config.max_position_embeddings
                    input_len = inputs['input_ids'].shape[1]
                    if input_len + max_tokens > max_context:
                        print(f"Skipping chunk (too long): input_len={input_len}")
                        continue

                    with torch.no_grad():
                        output_tokens = model.generate(**inputs, max_new_tokens=max_tokens)

                    generated_tokens = output_tokens[0][len(inputs["input_ids"][0]):]
                    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

                    try:
                        qa_pairs = json.loads(generated_text)
                        if isinstance(qa_pairs, list):
                            qa_results[doc_name].append({
                                "chunk": chunk,
                                "qa_pairs": qa_pairs
                            })
                            success_count += 1
                            total_questions += len(qa_pairs)

                            if len(qa_pairs) != questions_num:
                                qa_count_mismatch += 1
                        else:
                            fail_count += 1
                    except json.JSONDecodeError:
                        fail_count += 1

                except Exception as e:
                    print(f"Error generating for chunk: {e}")
                    fail_count += 1

        # Save QA Output
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        with open(output_file_path, "w", encoding="utf-8") as out_file:
            json.dump(qa_results, out_file, indent=4, ensure_ascii=False)

        end_time = time.time()
        elapsed_time = timedelta(seconds=end_time - start_time)

        # Save Results and Checkpoint
        results_df.loc[len(results_df)] = [
            chunk_size, questions_num, qa_count_mismatch, total_questions,
            max_tokens, total_chunks, success_count, fail_count, str(elapsed_time)
        ]
        
        csv_output_path = f"Generated_Results/LLAMA3_1/{substring_date}/results_log.csv"
        results_df.to_csv(csv_output_path, index=False)

        pd.DataFrame([[chunk_size, max_tokens]], columns=["chunk_size", "max_tokens"]) \
            .to_csv(checkpoint_path, mode='a', header=False, index=False)

        print(f"✅ Saved: {output_file_path} | Time: {elapsed_time}")

print("All runs completed.")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Starting new run: Chunk=128, Max Tokens=256


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

✅ Saved: Generated_Results/LLAMA3_1/20250503/generation_log_128_Token_256_Q2.json | Time: 0:01:58.386235
All runs completed.


# Power Analysis

### Similarity scores

In [None]:
#pip install rouge-score - https://github.com/google-research/google-research/tree/master/rouge
#pip install bert-score - https://github.com/Tiiiger/bert_score 
# https://huggingface.co/spaces/evaluate-metric/bertscore
# https://huggingface.co/tasks/sentence-similarity

import evaluate
from sentence_transformers import SentenceTransformer, util
import numpy as np

# 1 Metric: ROUGE
rouge = evaluate.load("rouge")

originals = []
generations = []

for doc in qa_results.values():
    for item in doc:
        chunk = item["chunk"]
        qa = item["qa_pairs"]
        for pair in qa:
            originals.append(chunk)  # reference
            generations.append(pair["answer"])  # model-generated answer

scores = rouge.compute(predictions=generations, references=originals)
print(f"ROUGE Scores: {scores}")

# 2 Metric: BERTScore
bertscore = evaluate.load("bertscore")
bert_scores = bertscore.compute(predictions=generations, references=originals, model_type="bert-base-uncased", lang="en")
P = bert_scores["precision"]
R = bert_scores["recall"]
F1 = bert_scores["f1"] 

print(f"BERTScore: {bert_scores}")

# 3 Metric: STS (Semantic Textual Similarity)
sts_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
sts_scores = []
for original, generated in zip(originals, generations):
    original_embedding = sts_model.encode(original, convert_to_tensor=True)
    generated_embedding = sts_model.encode(generated, convert_to_tensor=True)
    cosine_similarity = util.pytorch_cos_sim(original_embedding, generated_embedding).item()
    sts_scores.append(cosine_similarity)

print(f"STS Scores: {sts_scores}")

# save the scores to a CSV file
scores_df = pd.DataFrame({
    "chunk_size": [chunk_sizes[0]],
    "max_tokens": [max_token_list[0]],
    "questions_num": [questions_num],
    "rouge1": [scores["rouge1"]],
    "rouge2": [scores["rouge2"]],
    "rougeL": [scores["rougeL"]],
    "rougeLsum": [scores["rougeLsum"]],
    "bert_score_P": [np.mean(P)],
    "bert_score_R": [np.mean(R)],
    "bert_score_F1": [np.mean(F1)],
    "sts_score": [np.mean(sts_scores)]
})
scores_df.to_csv(f"Generated_Results/LLAMA3_1/{substring_date}/scores.csv", index=False)
print("Scores saved to scores.csv")




ROUGE Scores: {'rouge1': np.float64(0.2508211728930152), 'rouge2': np.float64(0.15723486213027013), 'rougeL': np.float64(0.21408449656826783), 'rougeLsum': np.float64(0.24702673196270564)}
BERTScore: {'precision': [0.5546076893806458, 0.6222397089004517, 0.8201026320457458, 0.8014022707939148, 0.9216362237930298, 0.8989461064338684, 0.41739416122436523, 0.413485586643219, 0.5343936681747437, 0.861682116985321, 0.6062642335891724, 0.7044219970703125, 0.9103200435638428, 0.9187355041503906, 0.45303377509117126, 0.5083460211753845], 'recall': [0.4855918884277344, 0.48190203309059143, 0.8762295246124268, 0.4358558654785156, 0.5194042921066284, 0.4343598783016205, 0.2834673821926117, 0.30328941345214844, 0.21217963099479675, 0.4816848337650299, 0.29829058051109314, 0.48668134212493896, 0.47884076833724976, 0.5898059606552124, 0.34728050231933594, 0.3624058961868286], 'f1': [0.5178102850914001, 0.5431523323059082, 0.8472374677658081, 0.5646288990974426, 0.6643835306167603, 0.5857112407684326