In [9]:
# set run environment (local/colab)
# if colab install required packages and set appropriate root_path
import os

if os.getenv("COLAB_RELEASE_TAG"):
    colab = True
    !pip install transformers[torch]
    !pip install accelerate -U
    !pip install datasets
    !pip install torchinfo
    #ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U
    from google.colab import drive
    drive.mount('/content/drive')
    root_path = '/content/drive/Othercomputers/My computer/EQILLM/'
else:
    colab = False
    root_path = ''


import itertools
import pandas as pd
import openai
import datetime
import os
import csv
import matplotlib.pyplot as plt

from dotenv import load_dotenv, dotenv_values
from tqdm.notebook import tqdm_notebook
from eqillm import finetune, get_log_for_val, validate, val_metrics, yeelight_eow_notification, param_combinations, load_PolarIs, df_to_ds


# Read key/value settings from the local .env file.
dotenv_config = dotenv_values('.env')
# Falls back to False when the key is absent from .env.
# NOTE(review): values read from .env are presumably strings, so a literal
# "False" entry would still be truthy -- confirm how yeelight_notify is consumed.
yeelight_notify = dotenv_config.get('YEELIGHT_NOTIFY', False)

## Load and filter

In [19]:
# Load the training log CSV and keep only runs that are usable for plotting.
MAX_EPOCHS = 30  # runs are truncated to at most this many epochs
MIN_EPOCHS = 3   # runs with fewer logged epochs than this are dropped

training_logs = pd.read_csv(os.path.join(root_path, 'output/training_logs.csv'))

# shorten all runs to max MAX_EPOCHS epochs
training_logs = training_logs[training_logs['Epoch'] <= MAX_EPOCHS]

# drop all runs (one run = one (model, timestamp) pair) which have fewer than MIN_EPOCHS epochs
training_logs = training_logs.groupby(['model', 'timestamp']).filter(lambda run: len(run) >= MIN_EPOCHS)

# The plotting cells below read the logs through the name `df`, which was never
# assigned in this notebook and therefore failed on a fresh kernel (Restart & Run All).
# NOTE(review): assumes those cells are meant to use the filtered logs -- confirm.
df = training_logs

training_logs

## Plot all

In [11]:
# Metric column(s) to plot; also reused as the sort key when ranking runs in a later cell.
col_to_plot = ['F1']


def plot_all(df, col_to_plot, n, ascending=False):
    """Plot the metric curves of every run, highlighting the ``n`` best runs.

    A run is one ``(model, timestamp)`` group of ``df``. Runs are ranked by the
    per-run maximum of the requested column(s); the first ``n`` runs of that
    ranking are drawn with a legend entry, all other runs are drawn as faint
    dashed lines without a legend entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'model', 'timestamp', 'Epoch' and every
        column named in ``col_to_plot``.
    col_to_plot : str or list of str
        Metric column(s) to draw against 'Epoch'. A single column name is
        accepted as well as a list.
    n : int
        Number of top-ranked runs to highlight.
    ascending : bool, default False
        Sort direction used for the ranking (False = largest maximum first).
    """
    # Accept a bare column name; sort_values/enumerate below need a list.
    cols = [col_to_plot] if isinstance(col_to_plot, str) else list(col_to_plot)

    grouped = df.groupby(['model', 'timestamp'])
    groups_sorted_list = grouped[cols].max().sort_values(cols, ascending=ascending).index[:n]

    # Line styles are cycled so that more than three columns no longer raise IndexError.
    ls = ['solid', 'dashdot', 'dotted']

    plt.figure(figsize=(30,12))
    for name, group_df in grouped:
        highlighted = name in groups_sorted_list
        for i, col in enumerate(cols):
            if highlighted:
                # map(str, ...) keeps the label working when a group key is not a string
                run_label = f'{col}-{"-".join(map(str, name))}'
                plt.plot(group_df['Epoch'], group_df[col], label=run_label, linestyle=ls[i % len(ls)])
            else:
                plt.plot(group_df['Epoch'], group_df[col], label='_nolegend_', linestyle='dashed', alpha=0.1)

    # plt.ylim(ymax=1)
    plt.xlabel('Epoch')
    plt.ylabel(col_to_plot)
    plt.title(f'All models plot: {col_to_plot}')
    plt.legend()
    plt.show()
    
# Highlight the 10 runs with the highest peak value of col_to_plot; all other runs are drawn faded.
plot_all(df, col_to_plot, 10, ascending=False)

In [18]:
# Rank runs ((model, timestamp) groups) by their per-run maximum of col_to_plot
# and keep the index entries of the first 20.
grouped = df.groupby(['model', 'timestamp'])
# The first positional argument of GroupBy.max is `numeric_only`, not a column
# selection; the column list previously passed there only acted as a truthy flag.
# Spelling it out keeps the same result (max over all numeric columns).
groups_sorted_list = grouped.max(numeric_only=True).sort_values(col_to_plot, ascending=False).index[:20]
groups_sorted_list

## Check latest

In [13]:
def grouped_plot_individual(grouped, colnames, colors=['red', 'blue', 'yellow', 'green', 'purple']):
    """Draw one subplot per group, with losses on a secondary y-axis.

    Each group is plotted against its 'Epoch' column. Columns whose name
    contains 'Loss' are drawn on a twin right-hand axis, all other columns on
    the left-hand axis; both sets share one legend in the upper left corner.

    Parameters
    ----------
    grouped : pandas GroupBy
        Groups to plot; every group frame must contain 'Epoch' and all
        columns listed in ``colnames``.
    colnames : sequence of str
        Columns to plot.
    colors : sequence of str
        Line colours, paired with ``colnames`` by position. Columns beyond
        ``len(colors)`` are not drawn because the pairing uses ``zip``.
    """
    # squeeze=False keeps `axes` a 2-D array even for a single group; without it
    # plt.subplots returns a bare Axes object and iterating over it raises TypeError.
    fig, axes = plt.subplots(nrows=len(grouped.groups), ncols=1, figsize=(10, 15), squeeze=False)
    for (group_name, group_df), ax in zip(grouped, axes.ravel()):
        ax_right = ax.twinx()
        lines = []
        left_cols = []
        right_cols = []
        for col, c in zip(colnames, colors):
            if 'Loss' in col:
                lines += ax_right.plot(group_df['Epoch'], group_df[col], color=c, label=col)
                right_cols.append(col)
            else:
                lines += ax.plot(group_df['Epoch'], group_df[col], color=c, label=col)
                left_cols.append(col)

        # Axis labels and the group name were collected but never shown before,
        # which left the subplots indistinguishable from one another.
        ax.set_xlabel('Epoch')
        ax.set_ylabel(', '.join(left_cols))
        ax_right.set_ylabel(', '.join(right_cols))
        if isinstance(group_name, tuple):
            ax.set_title('-'.join(map(str, group_name)))
        else:
            ax.set_title(str(group_name))

        ax.legend(lines, [line.get_label() for line in lines], loc='upper left')
    plt.tight_layout()
    plt.show()


# Plot the most recent runs, one subplot per (model, timestamp) group.
last_n_runs = 5

# Timestamps are sorted as stored, not parsed as dates.
# NOTE(review): presumably strings shaped like "%Y-%m-%d_%H-%M", for which
# this ordering is chronological -- confirm.
timestamps_desc = df['timestamp'].sort_values(ascending=False)
last_n = timestamps_desc.unique()[:last_n_runs]

is_recent = df['timestamp'].isin(last_n)
grouped = df[is_recent].groupby(['model', 'timestamp'])
cols_to_plot = ['Accuracy', 'F1', 'Training Loss', 'Validation Loss']

grouped_plot_individual(grouped, cols_to_plot)

In [14]:
# Per-run maxima for the runs listed in groups_sorted_list.
# NOTE(review): model and timestamp are matched independently, so a row whose
# model and timestamp each appear in the list -- but not as the same pair --
# would also be selected. Confirm timestamps are unique per run.
cond1 = df['model'].isin([model for model, _ in groups_sorted_list])
cond2 = df['timestamp'].isin([timestamp for _, timestamp in groups_sorted_list])
# NOTE(review): cond3 is computed but not applied to the filter below.
cond3 = df['binary'] == True
# GroupBy.max takes `numeric_only` as its first positional argument; the 'F1'
# string previously passed there only acted as a truthy flag, so this is the
# same result (max over all numeric columns) stated explicitly.
df[cond1 & cond2].groupby(['model', 'timestamp']).max(numeric_only=True)