In [None]:
# Dataset to analyse: only log files whose parsed dataset field equals this
# value are loaded by the log-collection loop.
# NOTE(review): the identifier is spelled "DATSET" (sic); renaming it requires
# updating every place that reads it.
DATSET_TO_ANALYSE = "pacs"

In [None]:
import os
import ast
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
import numpy as np
import re
import statsmodels.api as sm


def build_df(filename, log_dir="optuna_logs"):
    """Parse one Optuna log file into a DataFrame of trials sorted by loss.

    Only lines starting with "Current value: " are used. On such a line the
    loss is the text between that prefix and the first comma, and the
    hyper-parameters are the Python-literal dict following "Current params: ".

    Parameters
    ----------
    filename : str
        Name of the log file inside ``log_dir``.
    log_dir : str, optional
        Directory that holds the log files (default ``"optuna_logs"``).

    Returns
    -------
    pandas.DataFrame
        One row per parsed line: a "LOSS" column plus one column per parameter
        key, sorted by ascending "LOSS" with a fresh 0..n-1 index. If the file
        contains no matching line, an empty frame that still has the "LOSS"
        column is returned.

    Raises
    ------
    ValueError
        If a matching line has no "Current params: " marker or its loss is not
        a valid float. ``ast.literal_eval`` may also raise ``ValueError`` or
        ``SyntaxError`` for a malformed params literal.
    """
    value_prefix = "Current value: "
    params_marker = "Current params: "

    records = []
    # Stream the file instead of loading it whole and splitting afterwards.
    with open(os.path.join(log_dir, filename), "r") as file:
        for raw_line in file:
            line = raw_line.rstrip("\n")
            if not line.startswith(value_prefix):
                continue
            params_start = line.index(params_marker) + len(params_marker)
            params = ast.literal_eval(line[params_start:])
            loss = float(line.split(",")[0].removeprefix(value_prefix))
            record = {"LOSS": loss}
            record.update(params)
            records.append(record)

    if not records:
        # Without any record the frame would have no "LOSS" column and the
        # sort below would raise KeyError; return a well-formed empty frame.
        return pd.DataFrame(columns=["LOSS"])

    df = pd.DataFrame(records)
    return df.sort_values("LOSS").reset_index(drop=True)


# Collect one DataFrame per log file of the selected dataset and stack them.
builder = []
# Sorted so the row order of the combined frame does not depend on the
# arbitrary ordering returned by os.listdir.
for filename in sorted(os.listdir("optuna_logs")):
    # "art_painting" contains the field separator, so protect it before
    # splitting and restore the underscore afterwards.
    fields = filename.replace("art_painting", "art-painting").split("_")
    if len(fields) != 10:
        # Stray entries (e.g. ".ipynb_checkpoints") do not follow the
        # 10-field naming scheme; skip them instead of failing on unpacking.
        print(f"Skipping entry with unexpected name: {filename}")
        continue
    _, dataset_name, architecture, target_domain, _, pretrained, _, augmented, _, _ = (
        fields
    )
    target_domain = target_domain.replace("-", "_")
    if dataset_name != DATSET_TO_ANALYSE:
        continue

    trials = build_df(filename)
    # Tag every trial with the run settings encoded in the file name.
    trials["architecture"] = architecture
    trials["target_domain"] = target_domain
    trials["pretrained"] = pretrained == "True"
    trials["augmented"] = augmented == "True"
    builder.append(trials)

if not builder:
    # pd.concat([]) would raise a ValueError with an unhelpful message.
    raise ValueError(
        f"No log files found in 'optuna_logs' for dataset {DATSET_TO_ANALYSE!r}"
    )

# ignore_index gives the combined frame unique row labels; each per-file
# frame otherwise restarts its index at 0.
df = pd.concat(builder, ignore_index=True)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Column whose categories determine the colour of each point.
var_color = "architecture"

# One scatter plot per remaining column: loss on the x axis, the column's
# value on the y axis, points coloured by `var_color`.
for param in df.columns:
    # Avoid plotting LOSS against itself and the color column.
    if param == "LOSS" or param == var_color:
        continue

    # Only rows where this column has a value.
    sample = df[df[param].notna()]

    # A single factorize call yields both the integer code of every row and
    # the category labels (in order of first appearance), so the colorbar
    # ticks and labels cannot drift apart.
    color_indices, unique_values = pd.factorize(sample[var_color])

    fig, ax = plt.subplots()
    points = ax.scatter(
        sample["LOSS"].values, sample[param].values, c=color_indices, cmap="viridis"
    )

    ax.set_xlim(0, 4)
    ax.set_xlabel("Loss")
    ax.set_ylabel(param)
    ax.set_title(f"Effect of {param} on test loss")

    # Colorbar with one tick per category, labelled with the category value.
    cbar = fig.colorbar(points, ax=ax, ticks=range(len(unique_values)))
    cbar.ax.set_yticklabels(list(unique_values))

    print(param)
    plt.show()

In [None]:
# Display the column labels of the combined trials frame.
df.columns

In [None]:
# OLS regression of LOSS on the individual augmentation switches, the grid
# sizes, the pretraining flag, the learning rate and a ResNet50 indicator.
# Only trials where every regressor has a value are used.

# NOTE: this adds (or overwrites) the "resnet50" column on the shared `df`.
df["resnet50"] = df["architecture"] == "ResNet50"

feature_columns = [
    "USE_AUGMIX",
    "USE_JIGSAW",
    "USE_FOURIER",
    "USE_DLOW",
    "MIN_GRID_SIZE",
    "MAX_GRID_SIZE",
    "pretrained",
    "LEARNING_RATE",
    "resnet50",
]

# One mask selects the rows for both the regressors and the response, so the
# two can never get out of step.
complete_rows = df[feature_columns].notna().all(axis=1)

y = df.loc[complete_rows, "LOSS"]

# astype returns a new frame, so nothing is written into a slice of `df`
# (avoids pandas' chained-assignment / SettingWithCopyWarning hazard).
data = df.loc[complete_rows, feature_columns].astype(
    {
        "USE_AUGMIX": int,
        "USE_JIGSAW": int,
        "USE_FOURIER": int,
        "USE_DLOW": int,
        "pretrained": int,
        "LEARNING_RATE": float,
        "resnet50": int,
    }
)

X = sm.add_constant(data)

# Fitting the linear regression model
model = sm.OLS(y, X).fit()

model.summary()

In [None]:
# Display the column labels of `df` at this point in the notebook.
df.columns

In [None]:
# OLS regression of LOSS on the file-level "augmented" flag, the pretraining
# flag, the learning rate and a ResNet50 indicator.
# Only trials where every regressor has a value are used.

# NOTE: this adds (or overwrites) the "resnet50" column on the shared `df`.
df["resnet50"] = df["architecture"] == "ResNet50"

feature_columns = [
    "augmented",
    "pretrained",
    "LEARNING_RATE",
    "resnet50",
]

# One mask selects the rows for both the regressors and the response, so the
# two can never get out of step.
complete_rows = df[feature_columns].notna().all(axis=1)

y = df.loc[complete_rows, "LOSS"]

# astype returns a new frame, so nothing is written into a slice of `df`
# (avoids pandas' chained-assignment / SettingWithCopyWarning hazard).
data = df.loc[complete_rows, feature_columns].astype(
    {
        "augmented": int,
        "pretrained": int,
        "LEARNING_RATE": float,
        "resnet50": int,
    }
)

X = sm.add_constant(data)

# Fitting the linear regression model
model = sm.OLS(y, X).fit()

model.summary()

In [None]:
# Display the column labels of `df` before building the next design matrix.
df.columns

In [None]:
# OLS regression of LOSS on the individual augmentation switches, the
# pretraining flag, the learning rate and a ResNet50 indicator, using ALL
# trials: a missing augmentation switch is treated as 0 instead of dropping
# the row.

# NOTE: this adds (or overwrites) the "resnet50" column on the shared `df`.
df["resnet50"] = df["architecture"] == "ResNet50"

flag_columns = ["USE_AUGMIX", "USE_JIGSAW", "USE_FOURIER", "USE_DLOW"]
feature_columns = flag_columns + ["pretrained", "LEARNING_RATE", "resnet50"]

y = df["LOSS"]

# Work on an explicit copy: assigning columns onto `df[[...]]` directly is
# the classic chained-assignment case that triggers SettingWithCopyWarning.
data = df[feature_columns].copy()
for column in flag_columns:
    data[column] = data[column].fillna(0).astype(int)

# LEARNING_RATE is cast to float as in the sibling regression cells, so an
# object-typed column cannot reach statsmodels.
data = data.astype({"pretrained": int, "LEARNING_RATE": float, "resnet50": int})

X = sm.add_constant(data)

# Fitting the linear regression model
model = sm.OLS(y, X).fit()

model.summary()

In [None]:
# Per column of the most recently built `data` frame: True if it still
# contains at least one missing value.
data.isna().any()