In [37]:
import pandas as pd
import os
import re

def extract_info_from_filename(filename):
    """Extract model, prompt_method, and sample_size from the file name."""
    # Extract the model directly from the start of the filename before the first '+'
    model = filename.split('_')[1].split('+')[0]
    prompt_method = filename.split('.')[0].split('+')[-1]
    ss = filename.split('+')[-2].split('_')[-1]

    return model, prompt_method, ss

def load_data(root_directory):
    """Load data from all CSV files in the directory structure."""
    all_data = []
    for root, dirs, files in os.walk(root_directory):
        for file in files:
            if file.endswith(".csv"):
                file_path = os.path.join(root, file)
                try:
                    model, prompt_method, sample_size = extract_info_from_filename(file)
                    data = pd.read_csv(file_path)
                    data['model'] = model
                    data['prompt_method'] = prompt_method
                    data['sample_size'] = int(sample_size)
                    all_data.append(data)
                except ValueError as e:
                    print(f"Skipping file due to error: {e}")

    return pd.concat(all_data, ignore_index=True)

# Usage
root_directory = '/home/fox/Desktop/ThesisLLM/Coding/gpt4/4_response_extraction/csv'  # Change this to your directory path
big_dataframe = load_data(root_directory)
big_dataframe


Unnamed: 0,target_material,target_thickness,pulse_width,energy,spot_size,intensity,power,cutoff_energy,model,prompt_method,sample_size
0,aluminium,3.278416,615.500,454.200000,4.227937,2.172828e+20,1.134512e+14,12.479524,mistral-7b,ds,50
1,gold,6.263730,507.500,328.854289,2.414053,8.712614e+21,4.314484e+13,17.315095,mistral-7b,ds,50
2,plastic,1.123144,1.834,282.677678,3.301244,5.593274e+20,1.420118e+13,3.906686,mistral-7b,ds,50
3,plastic,3.548534,1.665,267.407598,3.314452,4.385261e+20,8.188767e+13,5.712183,mistral-7b,ds,50
4,plastic,2.211846,2.310,573.240795,3.300000,4.077722e+20,2.673356e+13,3.334818,mistral-7b,ds,50
...,...,...,...,...,...,...,...,...,...,...,...
102784,aluminium,2.352000,900.000,15.293000,3.300000,2.191000e+21,8.539000e+13,29.530000,llama3-70b,gk,5
102785,gold,1.233000,1200.000,12.192000,2.552000,7.509000e+20,4.379000e+14,33.730000,llama3-70b,gk,5
102786,polypropylene,6.542000,1500.000,7.509000,3.300000,1.639000e+21,4.851000e+14,40.420000,llama3-70b,gk,5
102787,plastic,0.552000,4500.000,2.382000,3.300000,5.591000e+20,2.191000e+14,15.290000,llama3-70b,gk,5


In [38]:
counts = big_dataframe.groupby(['model','prompt_method','sample_size']).count()['target_material']
big_dataframe.groupby(['model']).nunique()

Unnamed: 0_level_0,target_material,target_thickness,pulse_width,energy,spot_size,intensity,power,cutoff_energy,prompt_method,sample_size
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
claude-3-opus-20240229,11,1047,527,1689,614,2400,2558,671,3,6
claude-3-sonnet-20240229,115,4293,2975,6482,3744,7842,9718,3785,12,6
falcon-40b,24,90,63,93,75,115,127,94,12,3
gpt-3.5-turbo-0125,187,1875,1418,7274,2157,10590,10180,1648,1,6
gpt-4o,151,7164,7560,10797,5260,11129,11563,7212,12,6
llama3-70b,15,852,587,1014,303,958,1114,796,8,1
mistral-7b,105,1755,1728,2726,1535,2640,2934,1512,12,6
phi3-medium-128k,57,1984,1387,5724,1847,3517,6919,792,12,4
phi3-mini-128k,11,69,37,68,40,67,72,55,12,6
qwen2-72b,4,33,33,33,28,33,33,33,1,1


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'big_dataframe' is your DataFrame loaded from the previous step

# Group by 'model' and 'target_material' and count the occurrences
material_count = big_dataframe.groupby(['model', 'target_material']).size().reset_index(name='counts')

# Create a count plot
plt.figure(figsize=(10, 6))
sns.barplot(data=material_count, x='model', y='counts')
plt.title('Count of Unique Target Materials by Model')
plt.xlabel('Model')
plt.ylabel('Count of Unique Materials')
plt.xticks(rotation=45)
# plt.legend(title='Target Material')
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'big_dataframe' is your DataFrame loaded from the previous step

# Select only numeric columns for boxplots (excluding known non-numeric 'model' and 'target_material')
numeric_features = big_dataframe.select_dtypes(include=[np.number]).columns.tolist()

# Remove the 1st and 99th percentile outliers for each numeric feature
filtered_dataframe = big_dataframe.copy()
for feature in numeric_features:
    lower_bound = filtered_dataframe[feature].quantile(0.05)
    upper_bound = filtered_dataframe[feature].quantile(0.95)
    filtered_dataframe = filtered_dataframe[(filtered_dataframe[feature] >= lower_bound) & (filtered_dataframe[feature] <= upper_bound)]

# Calculate the number of rows and columns for the subplot grid
num_features = len(numeric_features)
num_columns = 3  # You can change this number based on your preference or screen size
num_rows = (num_features + num_columns - 1) // num_columns  # Ensures there are enough rows in the grid

# Setting up the matplotlib figure
plt.figure(figsize=(num_columns * 6, num_rows * 5))  # Adjust the figure size as necessary

# Looping through each numeric feature to create a separate subplot for each
for i, feature in enumerate(numeric_features):
    plt.subplot(num_rows, num_columns, i + 1)  # Creating a subplot for each feature
    sns.boxplot(data=filtered_dataframe, x='model', y=feature)
    plt.title(f'Boxplot of {feature} by Model')
    plt.xlabel('Model')
    plt.ylabel(feature)
    plt.xticks(rotation=90)  # Rotate x-axis labels 90 degrees

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np  # For handling NaNs and infinities

# Assuming big_dataframe is already loaded

# Selecting numerical features for PCA
features = ['target_thickness', 'pulse_width', 'energy', 'spot_size', 'intensity', 'power', 'cutoff_energy']
X = big_dataframe[features]

# Remove infinities and replace them with NaN
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with any NaN values to clean the data
X.dropna(inplace=True)

# Removing 1st and 99th percentile outliers
for feature in features:
    # Calculate the 1st and 99th percentiles
    lower_bound = X[feature].quantile(0.01)
    upper_bound = X[feature].quantile(0.99)
    # Filter out outliers
    X = X[(X[feature] >= lower_bound) & (X[feature] <= upper_bound)]

# Standardizing the features
X_scaled = StandardScaler().fit_transform(X)

# PCA transformation
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)
principal_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# Re-integrate the original indices to merge accurately
principal_df.set_index(X.index, inplace=True)

# Concatenate with the model, prompt_method, and sample_size for plotting
final_df = pd.concat([principal_df, big_dataframe.loc[X.index, ['model', 'prompt_method', 'sample_size']]], axis=1)

# Create the PCA plot
plt.figure(figsize=(12, 8))
sns.scatterplot(x='PC1', y='PC2', data=final_df, hue='model', style='prompt_method', size='sample_size',
                sizes=(20, 200), alpha=0.7, palette='deep')
plt.title('PCA of Dataset by Model, Prompt Method, and Sample Size')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.show()
