In [1]:
import ast
import os

import pandas as pd
from plotnine import ggplot, aes, geom_point, labs, facet_wrap, geom_vline, theme_minimal, theme, element_text

In [2]:
def get_print_df(df):
    """Build a two-column frame (``method``, ``acc``) ready for plotting.

    ``method`` is ``"linear.<n_features>"`` for rows whose ``n_layer`` is 0 and
    ``"mlp.<n_features>"`` for every other row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``n_layer``, ``n_features`` and ``acc``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with columns ``['method', 'acc']`` and the
        same index as ``df``; the caller can add columns to it freely.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Model family label derived from the layer count: 0 layers means linear.
    prefix_method = df['n_layer'].apply(lambda n_layer: 'linear' if n_layer == 0 else 'mlp')

    # Combine the family label with the feature count, e.g. "mlp.3".
    method = prefix_method + "." + df['n_features'].astype(str)

    # Start from an explicit copy so neither the input frame nor a slice of it
    # is written to (this avoids both hidden mutation of the caller's data and
    # SettingWithCopyWarning when the caller later adds columns).
    print_df = df[['acc']].copy()
    print_df.insert(0, 'method', method)
    return print_df

In [3]:
# Folds to compare, and the feature counts for which the best MLP row is kept.
FOLDS = range(1, 7)
MLP_FEATURE_COUNTS = (2, 3, 5)

for dataset_name in ['detailed', 'systematic']:
    # Input CSV and output directory for this dataset.
    figures_path = 'figures/' + dataset_name + "/"
    acc_rate_csv_path = 'acc_rate/' + dataset_name + '.csv'

    # Saving a figure fails when the target directory is missing, so create it up front.
    os.makedirs(figures_path, exist_ok=True)

    raw_df = pd.read_csv(acc_rate_csv_path)

    # 'features' stores a Python literal as text; n_features is its length plus one.
    # NOTE(review): the reason for the "+ 1" is not visible in this notebook - confirm.
    df = raw_df.assign(
        n_features=raw_df['features'].apply(
            lambda features: len(ast.literal_eval(features)) + 1
        )
    )

    # One comparison frame per fold, concatenated after the loop.
    df_fold_comparison_list = []

    for fold in FOLDS:
        # Only rows of this fold with feature engineering switched on.
        df_fold_fengineer = df[(df['fold'] == fold) & (df['f_engineer'] == 1)]

        # Every linear model (n_layer == 0) is kept as-is.
        df_fold_fengineer_linear = df_fold_fengineer[df_fold_fengineer['n_layer'] == 0]
        df_fold_fengineer_mlp = df_fold_fengineer[df_fold_fengineer['n_layer'] != 0]

        # For each feature count keep only the single most accurate MLP row
        # (an empty slice if there is no MLP row with that feature count).
        best_mlp_rows = [
            df_fold_fengineer_mlp[df_fold_fengineer_mlp['n_features'] == n_features]
            .sort_values(by='acc', ascending=False)
            .iloc[0:1]
            for n_features in MLP_FEATURE_COUNTS
        ]

        df_fold_comparison = pd.concat([df_fold_fengineer_linear, *best_mlp_rows])

        # assign() returns a new frame, so the facet label is never written
        # into a slice of another frame.
        df_fold_comparison = get_print_df(df_fold_comparison).assign(
            fold='fold' + str(fold) + '.test'
        )
        df_fold_comparison_list.append(df_fold_comparison)

    # Concatenate the per-fold frames into the single frame that is plotted.
    final_df = pd.concat(df_fold_comparison_list)

    # One facet per fold: accuracy on x, method on y, with a reference line at x = 100.
    plot_combined = (
        ggplot(final_df, aes(x='acc', y='method'))
        + geom_point(shape="D", size=2)
        + labs(title="dataset: " + dataset_name,
               x="accuracy percentage",
               y="method")
        + facet_wrap('~fold', ncol=3)
        + geom_vline(xintercept=100, color="black", size=1)
        + theme_minimal()
        + theme(legend_position='bottom', text=element_text(size=8))
        + theme(aspect_ratio=0.7)
    )

    # Write the figure in both formats.
    plot_combined.save(figures_path + "acc_comparison.pdf", width=8, height=6)
    plot_combined.save(figures_path + "acc_comparison.jpg", width=8, height=6)

