In [None]:
import re
import os
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.errors import SettingWithCopyWarning
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler


In [None]:
from sklearn.manifold import TSNE

# Regexes that pull individual recipe parameters out of a free-text
# 'Formulation' string. Each pattern has exactly one capture group.
regex_patterns = {
    'Powderkg': r'Powderkg = (\d+)',
    'wc': r'wc = ([\d.]+)',
    'materials': r'materials = ([\d./]+)',
    'curing': r'curing = (.+)$'
}

# Naming scheme of the result files. The literal dots in '3.5' and '0.0' are
# escaped so they no longer match arbitrary characters; the four groups
# capture the numeric fields embedded in the file name.
pattern = (
    r'gpt-3\.5-turbo_Generic_prompt_experiment_(\d+)_temp_0\.0_target_(\d+)'
    r'_%_Dev_Budget_(\d+)_recursive_1_(\d+)\.csv'
)

dir_name = 'Results/ID'

# Collect (numeric fields, file name) for every file that matches the whole
# pattern. fullmatch rejects names with trailing characters after '.csv'.
keyed_names = []
for filename in os.listdir(dir_name):
    match = re.fullmatch(pattern, filename)
    if match:
        numeric_fields = tuple(int(group) for group in match.groups())
        keyed_names.append((numeric_fields, filename))

# os.listdir returns entries in arbitrary order. Sorting by the numeric
# fields (ties broken by name) keeps the position of each file in this list,
# and therefore the experiment index derived from it downstream, stable
# across runs and machines.
matching_filenames = [
    os.path.join(dir_name, name) for numeric_fields, name in sorted(keyed_names)
]

In [None]:
# NOTE(review): this is a set (not a mapping) and is not used in this cell;
# kept in case other cells read it — confirm before removing.
curing_mapping = {'Heat Curing'}


def _extract_formulation_columns(formulations, patterns, experiment_idx):
    """Parse one experiment's formulation strings into named columns.

    Parameters
    ----------
    formulations : pd.Series of str
        Free-text formulation descriptions, one per row.
    patterns : dict
        Maps a column prefix to a regex with a single capture group.
    experiment_idx : int
        Suffix appended to every column name (``'<prefix> <idx>'``).

    Returns
    -------
    dict
        Column name -> Series indexed like ``formulations``.
        'curing' values become 0 ('Heat...') or 1 ('Ambient...'), any other
        text is left unchanged; all other columns are cast to float.

    Raises
    ------
    ValueError
        If an extracted numeric field cannot be converted to float.
    """
    columns = {}
    for col, col_regex in patterns.items():
        values = formulations.str.extract(col_regex, expand=False)
        if col.startswith('curing'):
            # Both masks are computed on the extracted text, and the values
            # are replaced on the Series itself (no chained assignment).
            as_text = values.astype(str)
            values = values.mask(as_text.str.startswith('Heat'), 0)
            values = values.mask(as_text.str.startswith('Ambient'), 1)
        elif col.startswith('materials'):
            # The value is a ratio such as 'a/b': keep the complete first
            # component instead of a fixed three-character prefix.
            values = values.str.split('/').str[0].astype(float)
        else:
            # The regex yields strings; downstream statistics and plots
            # need numbers.
            values = values.astype(float)
        columns[f'{col} {experiment_idx}'] = values
    return columns


parsed_columns = {}
for i, filename in enumerate(matching_filenames):
    results_sample_df = pd.read_csv(filename)
    parsed_columns.update(
        _extract_formulation_columns(results_sample_df['Formulation'], regex_patterns, i)
    )

# Assemble the frame in a single step. Rows are aligned on the CSV row index;
# files of different length are padded with NaN rather than being cut to the
# length of the first file.
all_results_df = pd.concat(parsed_columns, axis=1) if parsed_columns else pd.DataFrame()
all_results_df.head()

In [None]:
# NOTE(review): disabled exploratory plot, nothing in this cell executes.
# It reads `results_sample_df`, which after the loading loop holds only the
# last CSV that was read; whether that frame has a 'Powderkg' column is not
# visible here — confirm before re-enabling, or delete the cell.
# plt.plot(results_sample_df.index, results_sample_df['Powderkg'])
# plt.gca().invert_yaxis()
# plt.show()

In [None]:
# For each formulation feature: compare the experiments through a correlation
# heatmap, a line plot, a one-component PCA of the correlation matrix and a
# 2-D t-SNE embedding. The per-feature value blocks are also gathered into
# `all_features_for_tsne` (one row per experiment) for later cells.
features = ['Powderkg', 'wc', 'materials']
n_experiments = len(matching_filenames)

# t-SNE requires perplexity < n_samples, and here there is one sample per
# experiment. Keep the value 5 whenever possible and lower it only when there
# are too few experiments for it.
tsne_perplexity = min(5, max(1, n_experiments - 1))

# NOTE(review): only experiments 1-3 are drawn in the line plot; index 0 is
# not — confirm this selection is intended. Indices that do not exist are
# dropped instead of raising a KeyError.
plotted_experiments = [idx for idx in (1, 2, 3) if idx < n_experiments]

feature_blocks = []  # one array of shape (n_experiments, n_rows) per feature

for feature in features:
    print(f'####### Feature {feature} #######')

    # One column per experiment. Cast to float because the extracted values
    # may still be strings, which matplotlib would plot as categories.
    experiment_columns = [f'{feature} {i}' for i in range(n_experiments)]
    df = all_results_df[experiment_columns].astype(float).reset_index(drop=True)

    # NaNs occur when the std is 0, i.e. the feature is constant within an
    # experiment. We handle this case by replacing NaN with 1.
    correlation_matrix = df.corr().round(2).fillna(1)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5, ax=ax)
    ax.set_title('Correlation Heatmap between Lineplots')
    plt.show()

    # Line plot of the feature value against the row position.
    fig, ax = plt.subplots(figsize=(8, 6))
    for idx in plotted_experiments:
        ax.plot(df.index, df[f'{feature} {idx}'], label=f'{feature} Experiment {idx}')
    ax.legend()
    ax.set_xlabel('Row index')
    ax.set_ylabel(feature)  # the feature value is on the y axis
    fig.tight_layout()
    plt.show()

    # One-component PCA fitted on the rows of the correlation matrix.
    pca = PCA(n_components=1)
    pca.fit(correlation_matrix)
    first_principal_component = pca.components_[0]

    # Proportion of variance explained by the first principal component.
    explained_variance_ratio = pca.explained_variance_ratio_
    print("Explained Variance Ratio:", explained_variance_ratio)
    print("First Principal Component (Overall Relationship):", first_principal_component)

    # Each experiment becomes one sample (row) for t-SNE.
    df_transpose = df.T
    feature_blocks.append(df_transpose.values)

    tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=0)
    X_2d = tsne.fit_transform(df_transpose)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(X_2d[:, 0], X_2d[:, 1], c='r', marker='o', label=feature)
    ax.set_title("t-SNE Visualization")
    ax.legend()
    plt.show()

# Concatenate the per-feature blocks side by side, independent of which
# feature happens to come first in `features`.
all_features_for_tsne = np.hstack(feature_blocks)

In [None]:
# Embed the rows of all_features_for_tsne (every feature concatenated) in 2-D.
tsne = TSNE(random_state=0, perplexity=5, n_components=2)
X_2d = tsne.fit_transform(all_features_for_tsne)

# Scatter of the embedding: first component on x, second component on y.
fig, ax = plt.subplots(figsize=(8, 6))
embedding_x, embedding_y = X_2d[:, 0], X_2d[:, 1]
ax.scatter(embedding_x, embedding_y, c='r', marker='o', label='All Features')
ax.set_title("t-SNE Visualization")
ax.legend()
plt.show()

In [None]:
# Transpose so that every row of all_features_for_tsne becomes a column.
# The array can be object-typed (extracted strings mixed with floats), so it
# is cast to float to give pandas and scikit-learn purely numeric data.
df = pd.DataFrame(all_features_for_tsne).T.astype(float)

# Min-max scale each column to the range [0, 1].
# NOTE(review): a column holds all features of one sample stacked on top of
# each other, so the largest-valued feature sets the scaling range — confirm
# that per-column (rather than per-feature) scaling is intended.
scaler = MinMaxScaler()
scaled_df = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

In [None]:
# Overlay the scaled series of columns 1 and 2 on one set of axes and flip
# the y axis.
# NOTE(review): the values are cast to str before plotting, so matplotlib
# treats them as categories rather than numbers — confirm this is intended.
fig, ax = plt.subplots()
for column in (1, 2):
    ax.plot(scaled_df.index, scaled_df[column].astype(str))
ax.invert_yaxis()
plt.show()

In [None]:
# Pairwise correlation between the columns of df, rounded to 8 decimals.
correlation_matrix = df.corr().round(8)
print(correlation_matrix)

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5, ax=ax)
ax.set_title('Correlation Heatmap between Lineplots')
plt.show()

# Fit a one-component PCA on the rows of the correlation matrix.
data = pd.DataFrame(correlation_matrix)
pca = PCA(n_components=1).fit(data)

# Direction of the first principal component.
first_principal_component = pca.components_[0]

# Proportion of the variance captured by that component.
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance_ratio)

# The first principal component, which represents the overall relationship.
print("First Principal Component (Overall Relationship):", first_principal_component)