In [None]:
from llama_cpp import Llama
import time
import torch
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
import psutil
import numpy as np
import matplotlib.pyplot as plt
import json
import os
from tqdm import tqdm
import ast
import re
import pandas as pd

In [None]:
# Load the two backend benchmark CSVs for Qwen2-0.5B-Instruct.
df_pytorch = pd.read_csv("3_4_Data/pytorch_Qwen2-0.5B-Instruct.csv")
df_llama = pd.read_csv('3_4_Data/llamacpp_Qwen2-0.5B-Instruct-GGUF.csv')

# Pair each frame with the label used as the key in every summary dict below.
_labelled_frames = (("PyTorch", df_pytorch), ("LLAMA", df_llama))

# One dict per column: {label: mean of that column over the whole frame}.
ram_dict = {label: frame["max_ram"].mean() for label, frame in _labelled_frames}
cpu_dict = {label: frame["cpu_usage"].mean() for label, frame in _labelled_frames}
latency_dict = {label: frame["latency"].mean() for label, frame in _labelled_frames}
throughput_dict = {label: frame["throughput"].mean() for label, frame in _labelled_frames}

# Last expression of the cell: display the four summaries together.
ram_dict, cpu_dict, latency_dict, throughput_dict

In [None]:
# Load the two batch-size benchmark CSVs (Qwen2-0.5B-Instruct and Qwen2-1.5B-Instruct).
# NOTE(review): this cell rebinds df_pytorch / df_llama and the four *_dict
# summaries assigned by the previous cell, and it still labels the 0.5B and
# 1.5B frames "PyTorch" and "LLAMA". Confirm that both are intended, because
# the dumbbell plot further down reads whichever values were assigned last.
df_pytorch = pd.read_csv("3_4_Data/batch_size_benchmark_Qwen2-0.5B-Instruct.csv")
df_llama = pd.read_csv('3_4_Data/batch_size_benchmark_Qwen2-1.5B-Instruct.csv')

# Pair each frame with the label used as the key in every summary dict below.
_labelled_frames = (("PyTorch", df_pytorch), ("LLAMA", df_llama))

# One dict per column: {label: mean of that column over the whole frame}.
ram_dict = {label: frame["max_ram"].mean() for label, frame in _labelled_frames}
cpu_dict = {label: frame["cpu_usage"].mean() for label, frame in _labelled_frames}
latency_dict = {label: frame["latency"].mean() for label, frame in _labelled_frames}
throughput_dict = {label: frame["throughput"].mean() for label, frame in _labelled_frames}

# Last expression of the cell: display the four summaries together.
ram_dict, cpu_dict, latency_dict, throughput_dict

<h1> Plotting Data </h1>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def plot_dumbbell_charts(model_names, metrics, model_data, units=None):
    """
    Draw one dumbbell chart per metric, stacked in a single column.

    Every metric gets its own subplot with an independent x-axis. The values of
    the first two entries of ``model_names`` are drawn as two markers at y=0
    joined by a grey segment, and each marker is annotated with its value
    formatted to two decimals. The figure is displayed with ``plt.show()``;
    nothing is returned.

    Parameters:
    - model_names: list of str, e.g. ['PyTorch', 'LLAMA']; only the first two
      entries are read.
    - metrics: list of str, e.g. ['Max RAM', 'CPU Utilization', 'Latency', 'Throughput'];
      one subplot is created per entry.
    - model_data: dict of dict, where model_data[model][metric] = numeric value
    - units: optional list of str; units[i] is appended in parentheses to the
      title of the i-th subplot when it exists and is non-empty.
    """
    sns.set_style("whitegrid")

    panel_count = len(metrics)

    # One row per metric, single column; height grows with the number of rows.
    fig, axes = plt.subplots(
        nrows=panel_count, ncols=1,
        figsize=(8, 2.5 * panel_count),
        dpi=350
    )

    # With a single row plt.subplots hands back a bare Axes; wrap it so the
    # loop below can treat both cases the same way.
    if panel_count == 1:
        axes = [axes]

    for idx, (ax, metric) in enumerate(zip(axes, metrics)):
        # (legend label, value, marker colour, annotation y, annotation valign)
        # The first model is annotated above the line, the second one below it,
        # so the two labels do not overlap when the values are close.
        points = (
            (model_names[0], model_data[model_names[0]][metric], "#66b3ff", 0.05, "bottom"),
            (model_names[1], model_data[model_names[1]][metric], "#ff9999", -0.05, "top"),
        )

        # Title is the metric name, plus its unit when one was supplied.
        if not units or idx >= len(units) or not units[idx]:
            title = metric
        else:
            title = f"{metric} ({units[idx]})"

        # Connector between the two values, drawn underneath the markers.
        ax.plot(
            [value for _label, value, _colour, _dy, _va in points], [0, 0],
            color="gray", linewidth=2,
            zorder=1
        )

        # Markers, one colour per model.
        for label, value, colour, _dy, _va in points:
            ax.scatter(
                value, 0,
                color=colour, edgecolor="black", s=100,
                label=label, zorder=2
            )

        # Numeric annotations, vertically offset from the line.
        for _label, value, _colour, dy, valign in points:
            ax.text(
                value, dy,
                f"{value:.2f}",
                ha="center", va=valign,
                fontsize=10
            )

        ax.set_title(title, fontsize=12, loc="left", pad=10)

        # The y-axis carries no information here, so strip it.
        ax.set_yticks([])
        ax.set_ylabel("")

        ax.tick_params(axis="x", labelsize=10)

        # A single legend on the first panel is enough for the whole figure.
        if idx == 0:
            ax.legend(loc="upper left", frameon=False, fontsize=9)

    plt.tight_layout()
    plt.show()


model_names = ["PyTorch", "LLAMA"]
metrics = ["Max RAM", "CPU Utilization", "Latency", "Throughput"]
units = ["Gb", "%", "s", "Tokens/s"]  # one unit per metric, same order

# Each metric label paired with the summary dict that holds its per-model mean.
_metric_sources = dict(zip(metrics, (ram_dict, cpu_dict, latency_dict, throughput_dict)))

# Re-shape into model_data[model][metric], the layout plot_dumbbell_charts expects.
model_data = {
    name: {metric: summary[name] for metric, summary in _metric_sources.items()}
    for name in model_names
}

# One dumbbell panel per metric, stacked vertically.
plot_dumbbell_charts(model_names, metrics, model_data, units)

In [None]:
# Display names of the nine models, listed by increasing parameter count
# (as given in the names themselves).
ordered_model_names = [
    'Qwen2 0.5B', 'Llama 3.2 1B',
    'R1-Distill-Qwen 1.5B', 'Qwen2 1.5B',
    'Llama 3.2 3B',
    'R1-Distill-Qwen 7B', 'Qwen2 7B',
    'Llama 3.1 8B', 'R1-Distill-Llama 8B',
]

# (metric label, unit) pairs; an empty unit means the metric is shown without one.
_metric_unit_pairs = [
    ('Accuracy', ""),
    ('Latency', "s"),
    ('Throughput', "tokens/s"),
    ('Max RAM', "GB"),
    ('Tokens Generated', ""),
]
metrics = [metric for metric, _unit in _metric_unit_pairs]
units = [unit for _metric, unit in _metric_unit_pairs]

In [None]:
def plot_model_metrics(model_names, metrics, units, model_data):
    """
    Plots one bar chart per metric across the provided model names.

    Models that are absent from ``model_data`` or that lack the metric are left
    out of that metric's chart. A metric for which no model has a value is
    skipped with a printed message instead of producing an empty chart.

    Parameters:
    - model_names: list of str, the names of the models to include, in bar order.
    - metrics: list of str, the metrics to plot (e.g., 'Latency', 'Throughput').
    - units: list of str, the unit for each metric (same order). An empty or
             missing entry (including ``units`` shorter than ``metrics`` or
             ``None``) means the metric is labelled without a unit.
    - model_data: dict, where keys are model names and values are dictionaries
                  containing the metric values.

    Each chart is displayed with plt.show(); nothing is returned.
    """
    # Tolerate a missing or short units list rather than silently dropping the
    # trailing metrics, which is what zip(metrics, units) would do.
    unit_list = list(units) if units else []

    for idx, metric in enumerate(metrics):
        unit = unit_list[idx] if idx < len(unit_list) else ""
        unit_str = f" ({unit})" if unit else ""

        # Keep only the models that actually report this metric.
        selected_models = [
            model for model in model_names
            if model in model_data and metric in model_data[model]
        ]
        if not selected_models:
            # An empty bar chart would look like a result; say so instead.
            print(f"No data for metric '{metric}'; skipping plot.")
            continue
        values = [model_data[model][metric] for model in selected_models]

        # New figure per metric, 350 dpi for publication quality.
        plt.figure(figsize=(8, 5), dpi=350)

        bars = plt.bar(
            selected_models,
            values,
            color='#66b3ff',   # Light blue
            width=0.6
        )

        plt.title(f"{metric}{unit_str}", fontsize=16, pad=10)
        plt.ylabel(f"{metric}{unit_str}", fontsize=14)

        # Rotate the model names so long labels do not overlap.
        plt.xticks(rotation=45, ha='right', fontsize=12)
        plt.yticks(fontsize=12)

        # Numeric label centred on each bar, anchored at the bar's top edge.
        for bar in bars:
            height = bar.get_height()
            plt.text(
                bar.get_x() + bar.get_width() / 2,
                height,
                f"{height:.2f}",
                ha='center', va='bottom',
                fontsize=12
            )

        plt.tight_layout()
        plt.show()





# Each base model followed by the R1-Distill model of the same family and size.
_base_and_distilled = [
    ('Qwen2 1.5B', 'R1-Distill-Qwen 1.5B'),
    ('Qwen2 7B', 'R1-Distill-Qwen 7B'),
    ('Llama 3.1 8B', 'R1-Distill-Llama 8B'),
]
model_names = [name for pair in _base_and_distilled for name in pair]

# (metric label, unit) pairs; an empty unit means "no unit".
_metric_unit_pairs = [
    ('Accuracy', ""),
    ('Latency', "s"),
    ('Throughput', "tokens/s"),
    ('Max RAM', "GB"),
    ('Tokens Generated', ""),
]
metrics = [metric for metric, _unit in _metric_unit_pairs]
units = [unit for _metric, unit in _metric_unit_pairs]

# NOTE(review): final_model_results is only assigned in a later cell of this
# notebook, and the per-model dicts built there use the keys "model_results",
# "accuracy" and "answer_positions" -- none of which match the metric labels
# above. Confirm where the data for these charts is supposed to come from.
plot_model_metrics(model_names, metrics, units, final_model_results)

In [None]:
# Qwen2 and Llama models only (no R1-Distill models), interleaved by size.
_qwen_llama_pairs = [
    ('Qwen2 0.5B', 'Llama 3.2 1B'),
    ('Qwen2 1.5B', 'Llama 3.2 3B'),
    ('Qwen2 7B', 'Llama 3.1 8B'),
]
model_names = [name for pair in _qwen_llama_pairs for name in pair]

# (metric label, unit) pairs; an empty unit means "no unit".
_metric_unit_pairs = [
    ('Accuracy', ""),
    ('Latency', "s"),
    ('Throughput', "tokens/s"),
    ('Max RAM', "GB"),
]
metrics = [metric for metric, _unit in _metric_unit_pairs]
units = [unit for _metric, unit in _metric_unit_pairs]

# NOTE(review): final_model_results is only assigned in a later cell of this
# notebook, and the per-model dicts built there use the keys "model_results",
# "accuracy" and "answer_positions" -- none of which match the metric labels
# above. Confirm where the data for these charts is supposed to come from.
plot_model_metrics(model_names, metrics, units, final_model_results)

<h1> Time Analysis </h1>

In [None]:
def answer_position(model_output_whole, answer_key):
    """
    Locates the first space-separated token of the model output that contains
    answer_key.

    The output is split on single spaces (so consecutive spaces yield empty
    tokens, and an empty string yields one empty token). A token matches when
    answer_key occurs anywhere inside it, i.e. this is a substring test: the
    key "42" also matches the tokens "$1,042." and "142".

    Args:
        model_output_whole (str): The full text produced by the model.
        answer_key (str): The text to look for inside each token.

    Returns:
        list: [index, token_count], where index is the position of the first
        matching token, or float('inf') when no token matches, and token_count
        is the number of tokens produced by the split.
    """
    tokens = model_output_whole.split(" ")
    first_match = next(
        (position for position, token in enumerate(tokens) if answer_key in token),
        float('inf'),
    )
    return [first_match, len(tokens)]

In [None]:
# Output files (one per model) analysed in the cells below.
list_of_model_outputs_to_study = ['DeepSeek-R1-Distill-Llama-8B-Q4_K_M_gguf.txt',
 'DeepSeek-R1-Distill-Qwen-1_5B-Q4_K_M_gguf.txt',
 'DeepSeek-R1-Distill-Qwen-7B-Q6_K_gguf.txt',
 'Llama-3_2-1B-Instruct_Q4_K_M_gguf.txt',
 'Llama-3_2-3B-Instruct-Q4_K_M_gguf.txt',
 'Meta-Llama-3_1-8B-Instruct-Q4_K_M_gguf.txt',
 'qwen2-0_5b-instruct-q4_k_m_gguf.txt',
 'qwen2-1_5b-instruct-q4_k_m_gguf.txt',
 'qwen2-7b-instruct-q4_k_m_gguf.txt']

# Output file name -> display name. These entries label the Llama files by
# quantisation level.
file_name_to_model_name = {
    'Llama-3_2-1B-Instruct_IQ1_M_gguf.txt': 'Llama 3.2 1B 1-bit',
    'Llama-3_2-1B-Instruct_Q2_K_gguf.txt': 'Llama 3.2 1B 2-bit',
    'Llama-3_2-1B-Instruct_Q4_K_M_gguf.txt': 'Llama 3.2 1B 4-bit',
    'Llama-3_2-1B-Instruct_Q8_0_gguf.txt': 'Llama 3.2 1B 8-bit',
    'Llama-3_2-3B-Instruct_IQ1_M_gguf.txt': 'Llama 3.2 3B 1-bit',
    'Llama-3_2-3B-Instruct_Q2_K_gguf.txt': 'Llama 3.2 3B 2-bit',
    'Llama-3_2-3B-Instruct_Q4_K_M_gguf.txt': 'Llama 3.2 3B 4-bit',
    'Llama-3_2-3B-Instruct_Q8_0_gguf.txt': 'Llama 3.2 3B 8-bit',
    'Meta-Llama-3-8B-Instruct_IQ1_M_gguf.txt': 'Llama 3.0 8B 1-bit',
    'Meta-Llama-3-8B-Instruct_IQ2_XS_gguf.txt': 'Llama 3.0 8B 2-bit',
    'Meta-Llama-3-8B-Instruct_Q4_K_S_gguf.txt': 'Llama 3.0 8B 4-bit',
    'Meta-Llama-3-8B-Instruct_Q8_0_gguf.txt': 'Llama 3.0 8B 8-bit'
}

# Display names for every file in list_of_model_outputs_to_study. Without these
# the results loop fails with a KeyError on the first file, and the plotting
# cells index final_model_results by exactly these names ('Qwen2 0.5B',
# 'Llama 3.1 8B', 'R1-Distill-Llama 8B', ...).
# NOTE(review): this deliberately overrides the 'Llama 3.2 1B 4-bit' label of
# 'Llama-3_2-1B-Instruct_Q4_K_M_gguf.txt', because the CDF cells look that model
# up as 'Llama 3.2 1B'. The file-to-name pairing is inferred from the file
# names -- please confirm.
file_name_to_model_name.update({
    'DeepSeek-R1-Distill-Llama-8B-Q4_K_M_gguf.txt': 'R1-Distill-Llama 8B',
    'DeepSeek-R1-Distill-Qwen-1_5B-Q4_K_M_gguf.txt': 'R1-Distill-Qwen 1.5B',
    'DeepSeek-R1-Distill-Qwen-7B-Q6_K_gguf.txt': 'R1-Distill-Qwen 7B',
    'Llama-3_2-1B-Instruct_Q4_K_M_gguf.txt': 'Llama 3.2 1B',
    'Llama-3_2-3B-Instruct-Q4_K_M_gguf.txt': 'Llama 3.2 3B',
    'Meta-Llama-3_1-8B-Instruct-Q4_K_M_gguf.txt': 'Llama 3.1 8B',
    'qwen2-0_5b-instruct-q4_k_m_gguf.txt': 'Qwen2 0.5B',
    'qwen2-1_5b-instruct-q4_k_m_gguf.txt': 'Qwen2 1.5B',
    'qwen2-7b-instruct-q4_k_m_gguf.txt': 'Qwen2 7B',
})

In [None]:
def read_model_outputs(model_name, output_dir="model_outputs_quant"):
    """
    Reads the output file for the given model and returns a list of dictionaries,
    each representing the results for one prompt.

    The file is expected to hold one Python dict literal per line. Blank lines
    are ignored; lines that cannot be parsed, or that parse to something other
    than a dict, are reported with print() and skipped.

    Args:
        model_name (str): Name of the output file inside ``output_dir``. Any "/"
            is replaced with "_" before the lookup, so a raw model identifier
            such as "some/model" maps to the file "some_model".
        output_dir (str): Directory that holds the output files. Defaults to
            "model_outputs_quant".

    Returns:
        list: The dictionaries read from the file, in file order. Empty when the
        file does not exist (a message is printed in that case).
    """
    # Sanitize the model name (replace "/" with "_") to match the file naming
    model_name_sanitized = model_name.replace("/", "_")
    file_path = os.path.join(output_dir, model_name_sanitized)

    results = []

    if not os.path.exists(file_path):
        print(f"No output file found for model '{model_name}' (looked for '{file_path}').")
        return results

    with open(file_path, "r", encoding="utf-8") as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue
            try:
                # literal_eval only evaluates Python literals, so it is safe
                # to use on file contents (unlike eval).
                result_dict = ast.literal_eval(line)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
                print(f"Error parsing line: {line}\nError: {e}")
                continue
            if not isinstance(result_dict, dict):
                # Callers index every entry by key, so a bare number or list
                # here would only fail later with a less helpful error.
                print(f"Error parsing line: {line}\nError: expected a dict, got {type(result_dict).__name__}")
                continue
            results.append(result_dict)

    return results

In [None]:
# Per-model results, keyed by display name:
#   "model_results":    estimated time (s) until the answer appears, per prompt
#                       (float('inf') when the answer never appears)
#   "accuracy":         fraction of prompts whose output contains the answer
#   "answer_positions": token index of the answer per prompt (inf when absent)
final_model_results = {}
for output_file in list_of_model_outputs_to_study:
    display_name = file_name_to_model_name[output_file]
    print(f"Model: {display_name}")

    data_out = read_model_outputs(output_file)
    if not data_out:
        # Missing or empty file: nothing to score, and accuracy would divide by zero.
        print(f"No results loaded for '{display_name}'; skipping.")
        continue

    model_results = []
    answer_positions = []
    correct_answers = 0
    for record in tqdm(data_out):
        # '.15g' rather than 'g': plain 'g' keeps only 6 significant digits and
        # switches to scientific notation from 1e6 upwards ('1e+06'), so large
        # answers could never be found in the output text.
        answer_text = "{:.15g}".format(record['answer'])
        position, n_tokens = answer_position(record['model_output'], answer_text)

        if position != float('inf'):
            correct_answers += 1
            # Assume tokens are emitted at a constant rate over the whole
            # generation, so the answer appears after position/n_tokens of
            # the total latency.
            time_to_answer = position * record['latency'] / n_tokens
        else:
            # Kept explicit so a latency of 0 cannot turn inf * 0 into NaN.
            time_to_answer = float('inf')

        model_results.append(time_to_answer)
        answer_positions.append(position)

    final_model_results[display_name] = {
        "model_results": model_results,
        "accuracy": correct_answers / len(data_out),
        "answer_positions": answer_positions,
    }

In [None]:
def plot_multiple_cdfs_line_chart(datasets, labels=None, figsize=(8, 5), max_x=None):
    """
    Draw the empirical CDF of several 'time_to_answers' datasets on one line chart.

    For every dataset the values are sorted and the k-th smallest value is given
    the height k / n, where n counts all entries, infinite ones included. Only
    the points up to the dataset's largest finite value are drawn, so a dataset
    containing infinities levels off below 1. When that largest finite value is
    smaller than the right edge of the plot, the curve is continued horizontally
    to the edge at its last height.

    Parameters:
        datasets (list of lists/arrays): Each element is a dataset of numbers.
        labels (list of str, optional): Legend labels, one per dataset. Defaults
            to 'Dataset 1', 'Dataset 2', etc.
        figsize (tuple, optional): Figure size (default is (8, 5)).
        max_x (float, optional): Right edge of the x-axis. When omitted, the
            largest finite value found across all datasets is used.

    Raises:
        ValueError: If a dataset contains no finite value.
    """
    plt.figure(figsize=figsize, dpi=350)

    # Largest finite value of each dataset; also validates every dataset.
    finite_maxima = []
    for series in datasets:
        values = np.array(series)
        finite_values = values[np.isfinite(values)]
        if finite_values.size == 0:
            raise ValueError("At least one dataset has all infinite values.")
        finite_maxima.append(np.max(finite_values))
    global_max_finite = max(finite_maxima) if finite_maxima else None

    # Right edge of the plot: caller's choice, else the overall finite maximum.
    plot_max = global_max_finite if max_x is None else max_x

    if labels is None:
        labels = [f"Dataset {number}" for number in range(1, len(datasets) + 1)]

    for series, finite_max, label in zip(datasets, finite_maxima, labels):
        values = np.array(series)
        count = len(values)

        # Ascending order puts the infinities last.
        ordered = np.sort(values)
        # Heights are relative to ALL entries, so infinities keep the curve below 1.
        heights = np.arange(1, count + 1) / count

        # Keep only the points up to the largest finite value.
        keep = ordered <= finite_max
        xs, ys = ordered[keep], heights[keep]

        # Flat continuation to the right edge of the plot.
        if finite_max < plot_max:
            xs = np.append(xs, plot_max)
            ys = np.append(ys, ys[-1])

        plt.plot(xs, ys, label=label, marker='o')

    plt.xlabel('Time (in seconds)', fontsize=14)
    plt.ylabel('Accuracy', fontsize=14)
    plt.title('Accuracy vs Inference Time', fontsize=16, pad=10)

    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    plt.grid(True)
    plt.xlim(0, plot_max)
    plt.legend(fontsize=12)
    plt.show()


In [None]:
# Llama instruct models to compare on one chart.
models_to_plot = ['Llama 3.2 1B', 'Llama 3.2 3B', 'Llama 3.1 8B']

# Legend labels are the model names; each dataset is that model's
# per-prompt "model_results" list from final_model_results.
list_of_labels = list(models_to_plot)
list_of_datasets = [final_model_results[name]["model_results"] for name in list_of_labels]
plot_multiple_cdfs_line_chart(list_of_datasets, list_of_labels, figsize=(8, 5), max_x=None)

In [None]:
# Qwen2 models to compare on one chart.
models_to_plot = ['Qwen2 0.5B', 'Qwen2 1.5B', 'Qwen2 7B']

# Legend labels are the model names; each dataset is that model's
# per-prompt "model_results" list from final_model_results.
list_of_labels = list(models_to_plot)
list_of_datasets = [final_model_results[name]["model_results"] for name in list_of_labels]
plot_multiple_cdfs_line_chart(list_of_datasets, list_of_labels, figsize=(8, 5), max_x=None)

In [None]:
# The two R1-Distill models followed by the Llama 3.1 8B and Qwen2 7B models.
models_to_plot = [
    'R1-Distill-Llama 8B',
    'R1-Distill-Qwen 7B',
    'Llama 3.1 8B',
    'Qwen2 7B',
]

# Legend labels are the model names; each dataset is that model's
# per-prompt "model_results" list from final_model_results.
list_of_labels = list(models_to_plot)
list_of_datasets = [final_model_results[name]["model_results"] for name in list_of_labels]
plot_multiple_cdfs_line_chart(list_of_datasets, list_of_labels, figsize=(8, 5), max_x=None)