In [3]:
import torch
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.device_count())  # Should return the number of GPUs
print(torch.cuda.get_device_name(0))  # Should show the GPU model

True
1
NVIDIA RTX 5000 Ada Generation


In [4]:
import transformers
import torch
import os
import json
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
import logging
import time
from datetime import timedelta, datetime
import pandas as pd
from dotenv import load_dotenv
import shutil 

import evaluate
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load environment variables
load_dotenv(dotenv_path="../../.env") # path is relative to this script, adjust as needed


run_id = "LMForge_RUN01"  # <- Change this manually for each experiment

In [5]:
# setting huggingface token
login(token=os.getenv("HUGGINGFACE_TOKEN"))

os.environ["HF_HOME"] = "D:/huggingface_cache" 
os.environ["TRANSFORMERS_CACHE"] = "D:/huggingface_cache"
os.environ["HUGGINGFACE_HUB_CACHE"] = "D:/huggingface_cache"

print("HF_HOME:", os.getenv("HF_HOME"))
print("TRANSFORMERS_CACHE:", os.getenv("TRANSFORMERS_CACHE"))
print("HUGGINGFACE_HUB_CACHE:", os.getenv("HUGGINGFACE_HUB_CACHE"))

transformers.utils.hub.TRANSFORMERS_CACHE = "D:/huggingface_cache"

HF_HOME: D:/huggingface_cache
TRANSFORMERS_CACHE: D:/huggingface_cache
HUGGINGFACE_HUB_CACHE: D:/huggingface_cache


In [6]:
model_name = "meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="cuda")


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
chunk_sizes = [128]#,256, 512, 1024]
questions_num = 2
max_token_list = [128]#,256,512,1024,2048]


In [8]:
results_df = pd.DataFrame(columns=[
    "chunk_size", "questions_num", "qa_count_mismatch", "total_questions", "token_Size",
    "total_chunks", "success_count", "fail_count",
    "elapsed_time"
])

In [9]:
BATCH_SAVE_INTERVAL = 10
checkpoint_path = "checkpoint.csv"

# Loading an existing checkpoint if it exists
if os.path.exists(checkpoint_path):
    completed_runs = pd.read_csv(checkpoint_path)
    completed_set = set(tuple(row) for row in completed_runs.values)
else:
    completed_set = set()
    pd.DataFrame(columns=["chunk_size", "max_tokens", "doc_name", "chunk_index"]).to_csv(checkpoint_path, index=False)
pd.DataFrame(columns=["chunk_size", "max_tokens", "doc_name", "chunk_index"]).to_csv(checkpoint_path, index=False)

## POWER ANALYSIS
##### This function performs a power analysis for generated text against a reference text.

In [10]:
def power_analysis(chunk_size, max_tokens, qa_results,elapsed_time):
    """
    Perform power analysis based on the provided parameters for the current run.
    """
    
    # https://huggingface.co/spaces/evaluate-metric/bertscore
    # https://huggingface.co/tasks/sentence-similarity
    # 1 Metric: ROUGE
    rouge = evaluate.load("rouge")

    originals = []
    generations = []

    for doc in qa_results.values():
        for item in doc:
            chunk = item["chunk"]
            qa = item["qa_pairs"]
            for pair in qa:
                originals.append(chunk)  # reference
                generations.append(pair["answer"])  # model-generated answer

    scores = rouge.compute(predictions=generations, references=originals)
    print(f"ROUGE Scores: {scores}")

    # 2 Metric: BERTScore
    bertscore = evaluate.load("bertscore")
    bert_scores = bertscore.compute(predictions=generations, references=originals, model_type="bert-base-uncased", lang="en")
    P = bert_scores["precision"]
    R = bert_scores["recall"]
    F1 = bert_scores["f1"] 

    print(f"BERTScore: {bert_scores}")

    # 3 Metric: STS (Semantic Textual Similarity)
    sts_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    original_embeddings = sts_model.encode(originals, convert_to_tensor=True) 
    generated_embeddings = sts_model.encode(generations, convert_to_tensor=True)
    sts_scores = util.pytorch_cos_sim(original_embeddings, generated_embeddings).diagonal().cpu().tolist()

    print(f"STS Scores: {sts_scores}")

    # save the scores to a CSV file
    scores_df = pd.DataFrame({
        "chunk_size": [chunk_size],
        "max_tokens": [max_tokens],
        "questions_num": [questions_num],
        "rouge1": [scores["rouge1"]],
        "rouge2": [scores["rouge2"]],
        "rougeL": [scores["rougeL"]],
        "rougeLsum": [scores["rougeLsum"]],
        "bert_score_P": [np.mean(P)],
        "bert_score_R": [np.mean(R)],
        "bert_score_F1": [np.mean(F1)],
        "sts_score": [np.mean(sts_scores)],
        "elapsed_time": [elapsed_time],
    })
    
    print("Scores saved to scores.csv")   
    return scores_df

# Convert logs to Panda


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# delete existing folder if you want to rerun the script from scratch uncomment the following lines

if os.path.exists(f"Generated_Results/LLAMA3_1/{run_id}"):
    shutil.rmtree(f"Generated_Results/LLAMA3_1/{run_id}")

# deleting existing checkpoint file if you want to rerun the script from scratch uncomment the following lines
if os.path.exists(checkpoint_path):
    os.remove(checkpoint_path)
    
for chunk_size in chunk_sizes:
    json_file_path = f"../Yaman/Generate_Paragraphs/Results/extracted_chunks_{chunk_size}_overlap.json"

    if not os.path.exists(json_file_path):
        print(f"Missing input file: {json_file_path}, skipping.")
        continue

    with open(json_file_path, "r", encoding="utf-8") as file:
        chunk_data = json.load(file)


    for max_tokens in max_token_list:
        output_file_path = f"Generated_Results/LLAMA3_1/{run_id}/generation_log_{chunk_size}_Token_{max_tokens}_Q{questions_num}.json"
        if os.path.exists(output_file_path):
            print(f"Output already exists: {output_file_path}, skipping.")
            continue

        print(f"Starting new run: Chunk={chunk_size}, Max Tokens={max_tokens}")

        qa_results = {}
        total_chunks = 0
        success_count = 0
        fail_count = 0
        total_questions = 0
        qa_count_mismatch = 0
        batch_checkpoints = []
        batch_qa_results = {}
        start_time = time.time()

        for doc_name, chunks in chunk_data.items():
            if doc_name not in qa_results:
                qa_results[doc_name] = []
            if doc_name not in batch_qa_results:
                batch_qa_results[doc_name] = []

            for chunk_index, chunk in enumerate(chunks[:100]):  # Limit to 1000 chunks for testing
                run_key = (chunk_size, max_tokens, doc_name, chunk_index)
                if run_key in completed_set:
                    print(f"Skipping completed sample: {run_key}")
                    continue

                total_chunks += 1

                prompt = f"""
Generate {questions_num} question-answer pairs based on the following text segment. 
Return the result in valid JSON format as a list of objects.

Text Segment:

{chunk}

Response Format:
[
    {{"question": "generated question", "answer": "generated Answer"}},
]

Question answers should be at least 250 words long.

Do NOT include any explanation or preamble before or after the JSON output.
Return ONLY valid JSON output.

Answer:
                """

                inputs = tokenizer(prompt, return_tensors="pt").to(device)

                try:
                    max_context = model.config.max_position_embeddings
                    input_len = inputs['input_ids'].shape[1]
                    if input_len + max_tokens > max_context:
                        print(f"Skipping chunk (too long): input_len={input_len}")
                        continue

                    with torch.no_grad():
                        output_tokens = model.generate(**inputs, max_new_tokens=max_tokens,pad_token_id=tokenizer.eos_token_id)

                    generated_tokens = output_tokens[0][len(inputs["input_ids"][0]):]
                    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

                    try:
                        qa_pairs = json.loads(generated_text)
                        if isinstance(qa_pairs, list):
                            sample_result = {
                                "chunk": chunk,
                                "qa_pairs": qa_pairs
                            }
                            qa_results[doc_name].append(sample_result)
                            batch_qa_results[doc_name].append(sample_result)
                            success_count += 1
                            total_questions += len(qa_pairs)

                            if len(qa_pairs) != questions_num:
                                qa_count_mismatch += 1
                        else:
                            fail_count += 1
                    except json.JSONDecodeError:
                        fail_count += 1

                except Exception as e:
                    print(f"Error generating for chunk: {e}")
                    fail_count += 1
                    
                completed_set.add(run_key)
                batch_checkpoints.append(run_key)

                if len(batch_checkpoints) >= BATCH_SAVE_INTERVAL:
                    output_file_path = f"Generated_Results/LLAMA3_1/{run_id}/generation_log_{chunk_size}_Token_{max_tokens}_Q{questions_num}.json"
                    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

                    if os.path.exists(output_file_path):
                        with open(output_file_path, "r", encoding="utf-8") as f:
                            existing_results = json.load(f)
                    else:
                        existing_results = {}

                    # Merge existing with new
                    for k, v in batch_qa_results.items():
                        if k in existing_results:
                            existing_results[k].extend(v)
                        else:
                            existing_results[k] = v

                    with open(output_file_path, "w", encoding="utf-8") as out_file:
                        json.dump(existing_results, out_file, indent=4, ensure_ascii=False)

                    # Save checkpoint
                    pd.DataFrame(batch_checkpoints, columns=["chunk_size", "max_tokens", "doc_name", "chunk_index"]) \
                        .to_csv(checkpoint_path, mode='a', header=False, index=False)

                    # Save results log incrementally
                    elapsed_time = timedelta(seconds=time.time() - start_time)
                    results_row = [
                        chunk_size, questions_num, qa_count_mismatch, total_questions,
                        max_tokens, total_chunks, success_count, fail_count, str(elapsed_time)
                    ]
                    csv_output_path = f"Generated_Results/LLAMA3_1/{run_id}/results_log.csv"
                    if not os.path.exists(csv_output_path):
                        results_df.columns = [
                            "chunk_size", "questions_num", "qa_count_mismatch", "total_questions",
                            "max_tokens", "total_chunks", "success_count", "fail_count", "elapsed_time"
                        ]
                        results_df.loc[0] = results_row
                        results_df.to_csv(csv_output_path, index=False)
                    else:
                        pd.DataFrame([results_row], columns=results_df.columns).to_csv(csv_output_path, mode='a', header=False, index=False)

                    # Clear batch
                    batch_checkpoints = []
                    batch_qa_results = {}

        # Save QA Output
        if batch_checkpoints:
            if os.path.exists(output_file_path):
                with open(output_file_path, "r", encoding="utf-8") as f:
                    existing_results = json.load(f)
            else:
                existing_results = {}

            for k, v in batch_qa_results.items():
                if k in existing_results:
                    existing_results[k].extend(v)
                else:
                    existing_results[k] = v

            with open(output_file_path, "w", encoding="utf-8") as out_file:
                json.dump(existing_results, out_file, indent=4, ensure_ascii=False)

            pd.DataFrame(batch_checkpoints, columns=["chunk_size", "max_tokens", "doc_name", "chunk_index"]) \
                .to_csv(checkpoint_path, mode='a', header=False, index=False)

            elapsed_time = timedelta(seconds=time.time() - start_time)
            results_row = [
                chunk_size, questions_num, qa_count_mismatch, total_questions,
                max_tokens, total_chunks, success_count, fail_count, str(elapsed_time)
            ]
            results_columns = [
                "chunk_size", "questions_num", "qa_count_mismatch", "total_questions",
                "max_tokens", "total_chunks", "success_count", "fail_count", "elapsed_time"
            ]
            results_csv_path = f"Generated_Results/LLAMA3_1/{run_id}/results_log.csv"
            os.makedirs(os.path.dirname(results_csv_path), exist_ok=True)
            if not os.path.exists(results_csv_path):
                pd.DataFrame([results_row], columns=results_columns).to_csv(results_csv_path, index=False)
            else:
                pd.DataFrame([results_row], columns=results_columns).to_csv(results_csv_path, mode='a', header=False, index=False)

            # Power analysis logging for final flush
            scores_df = power_analysis(chunk_size, max_tokens, batch_qa_results, run_id, elapsed_time)
            timestamp_dir = f"Generated_Results/LLAMA3_1/{run_id}/scores"
            os.makedirs(timestamp_dir, exist_ok=True)
            scores_path = f"{timestamp_dir}/scores/scores.csv"
            os.makedirs(os.path.dirname(scores_path), exist_ok=True)
            if os.path.exists(scores_path):
                scores_df.to_csv(scores_path, mode='a', header=False, index=False)
            else:
                scores_df.to_csv(scores_path, index=False)
        else:
            print("No batches to save.")
        # Final power analysis
    
print("All runs completed.")

Starting new run: Chunk=128, Max Tokens=128
Error generating for chunk: 'Documents/Legal Aspects of Corporate Management and Finance.pdf'
Error generating for chunk: 'Documents/Legal Aspects of Corporate Management and Finance.pdf'
Error generating for chunk: 'Documents/Legal Aspects of Corporate Management and Finance.pdf'
Error generating for chunk: 'Documents/Legal Aspects of Corporate Management and Finance.pdf'
Error generating for chunk: 'Documents/Legal Aspects of Corporate Management and Finance.pdf'
Error generating for chunk: 'Documents/Legal Aspects of Corporate Management and Finance.pdf'
Error generating for chunk: 'Documents/Legal Aspects of Corporate Management and Finance.pdf'
Error generating for chunk: 'Documents/Legal Aspects of Corporate Management and Finance.pdf'
Error generating for chunk: 'Documents/Legal Aspects of Corporate Management and Finance.pdf'
Error generating for chunk: 'Documents/Legal Aspects of Corporate Management and Finance.pdf'
Error generating

In [None]:

# import pandas as pd

# def convert_to_seconds(time_str):
# 	try:
# 		minutes, seconds = map(float, time_str.split(":"))
# 		return minutes * 60 + seconds
# 	except ValueError:
# 		return None


# df_scores = pd.read_csv(f"elapsed_time.csv")
# df_scores["elapsed_time_seconds"] = df_scores["elapsed_time"].apply(convert_to_seconds)
# df_scores["time_per_sample"] = df_scores["elapsed_time_seconds"] / df_scores["samples"]

# grouped_estimates = df_scores.groupby(["chunk_size", "max_tokens"])["time_per_sample"].mean().reset_index()

# grouped_estimates["time_for_100"] = grouped_estimates["time_per_sample"] * 100
# grouped_estimates["time_for_1000"] = grouped_estimates["time_per_sample"] * 1000
# grouped_estimates["time_for_5000"] = grouped_estimates["time_per_sample"] * 5000
# grouped_estimates.to_csv("average_time_estimates.csv", index=False)

