In [None]:
# Folder locations used by this notebook, given relative to its working
# directory. Edit these to point at your local Figures and Tables folders.
FIG_DIR = '../Figures/'
TABLE_DIR = '../Tables/'

In [None]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
import seaborn as sns

This notebook illustrates a clear correlation between a model's benchmark performance and its base model. We will look at models on the Open LLM Leaderboard that are based on three pretrained models: Qwen2.5 0.5B, 7B, and 14B.

In [None]:
# Base models of interest
frequent_base_models_new = [
    'llama-3.1-8b',
    'qwen2.5-14b',
    'qwen2.5-7b',
    'qwen2.5-0.5b',
    'mistral-7b',
    'gemma-2-9b',
]

# Benchmark columns reported by the leaderboard
cols_to_transform_new = [
    'IFEval',
    'BBH',
    'MATH Lvl 5',
    'GPQA',
    'MUSR',
    'MMLU-PRO',
]

# Read the leaderboard table from the Tables folder into a DataFrame
df_filtered_new = pd.read_csv(f'{TABLE_DIR}open_llm_leaderboard_with_token_size.csv')

In [None]:
# Grouped bar chart: for each Qwen2.5 base-model identifier, the mean score on
# every benchmark over the leaderboard rows whose 'fullname' contains it.
models = ['qwen2.5-14b', 'qwen2.5-7b', 'qwen2.5-0.5b']
benchmarks = ['IFEval', 'BBH', 'MATH Lvl 5', 'GPQA', 'MUSR', 'MMLU-PRO']
palette = sns.color_palette("Spectral", len(benchmarks))

bar_width = 0.15  # Width of each bar
index = np.arange(len(models))  # X-axis positions for the groups

# Lower-case the names once and build one row mask per model up front instead
# of recomputing them for every (benchmark, model) pair.
# regex=False matches the identifier literally (as a regex, each '.' would
# match any character); na=False treats a missing name as "no match" so the
# mask stays purely boolean and is safe to index with.
fullname_lower = df_filtered_new['fullname'].str.lower()
model_masks = {
    model: fullname_lower.str.contains(model, regex=False, na=False)
    for model in models
}

fig, ax = plt.subplots(figsize=(12, 6))

for i, benchmark in enumerate(benchmarks):
    # One bar per model: mean of this benchmark over the model's rows
    benchmark_data = [
        df_filtered_new.loc[model_masks[model], benchmark].mean()
        for model in models
    ]
    offset = bar_width * i  # Shift each benchmark's bars within the group
    ax.bar(index + offset, benchmark_data, bar_width, label=benchmark, color=palette[i])

# Add grid lines and customize appearance
ax.grid(axis='y', linestyle='--', alpha=0.7)  # Add horizontal grid lines
ax.spines['top'].set_visible(False)  # Remove top spine
ax.spines['right'].set_visible(False)  # Remove right spine

ax.set_xlabel('Models', fontsize = 15)
ax.set_ylabel('Average Accuracy', fontsize = 15)
# ax.set_title('Average Accuracy across Benchmarks for Different Qwen Models')
# Bars are centre-aligned at index + bar_width * i for i = 0 .. len(benchmarks) - 1,
# so the middle of each group lies at index + bar_width * (len(benchmarks) - 1) / 2.
ax.set_xticks(index + bar_width * (len(benchmarks) - 1) / 2)  # Center the x-axis ticks
ax.set_xticklabels(models)
ax.legend(fontsize=12, frameon=False)

# NOTE(review): the figure is written under TABLE_DIR even though a FIG_DIR
# constant is defined in the config cell - confirm the intended location.
plt.savefig(TABLE_DIR + "Qwen_perf_compare.png", dpi = 300)