In [1]:
import ast
import os

import pandas as pd
from plotnine import ggplot, aes, geom_point, labs, facet_wrap, geom_vline, theme_minimal, theme, element_text

In [2]:
def get_print_df(df):
    """Build the two-column (method, acc) table used for plotting.

    Each row is labelled ``"<prefix>.<n_features>"``, where the prefix is
    ``'linear'`` when ``n_layer`` equals 0 and ``'mlp'`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``n_layer``, ``n_features`` and ``acc``.
        The frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding only the columns ``method`` and
        ``acc``, with the same index as ``df``.
    """
    prefix_method = df['n_layer'].apply(lambda x: 'linear' if x == 0 else 'mlp')

    # Create the method label using string concatenation
    method = prefix_method + "." + df['n_features'].astype(str)

    # assign() works on a copy, so the caller's frame gains no helper columns.
    # The trailing copy() detaches the column selection from its parent so that
    # callers can add columns to the result without a SettingWithCopyWarning.
    return df.assign(method=method)[['method', 'acc']].copy()

In [3]:
# n_features values whose best MLP run is kept for each input representation.
# NOTE(review): presumably these are the feature-subset sizes of interest — confirm.
ENGINEERED_MLP_FEATURE_COUNTS = (1, 2, 4)
RAW_MLP_FEATURE_COUNTS = (2, 3, 5)


def _select_comparison_rows(runs, mlp_feature_counts, feature_label):
    """Pick the rows of one fold/representation that go into the plot.

    Keeps every row with ``n_layer == 0`` and, for each value in
    ``mlp_feature_counts``, the single highest-``acc`` row among those with
    ``n_layer != 0`` and that ``n_features`` (nothing is kept for a value
    that has no rows).

    Parameters
    ----------
    runs : pandas.DataFrame
        Rows with at least the columns ``n_layer``, ``n_features`` and ``acc``.
    mlp_feature_counts : iterable of int
        ``n_features`` values for which the best MLP row is selected.
    feature_label : str
        Value written into the ``feature`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Output of ``get_print_df`` for the selected rows, plus a constant
        ``feature`` column.
    """
    linear_rows = runs[runs['n_layer'] == 0]
    mlp_rows = runs[runs['n_layer'] != 0]
    best_mlp_rows = [
        mlp_rows[mlp_rows['n_features'] == count]
        .sort_values(by='acc', ascending=False)
        .iloc[0:1]
        for count in mlp_feature_counts
    ]
    selected = pd.concat([linear_rows] + best_mlp_rows)
    # assign() returns a new frame, so no column is written into a slice.
    return get_print_df(selected).assign(feature=feature_label)


for dataset_name in ['detailed', 'systematic']:

    figures_path = 'figures/' + dataset_name + "/"
    acc_rate_csv_path = 'acc_rate/' + dataset_name + '.csv'

    # save() does not create missing directories, so make sure the target exists
    os.makedirs(figures_path, exist_ok=True)

    # raw_df
    raw_df = pd.read_csv(acc_rate_csv_path)

    # full df: add n_features, the number of entries in each row's 'features'
    # literal, computed in a single pass over the column
    df = raw_df.assign(
        n_features=raw_df['features'].map(lambda literal: len(ast.literal_eval(literal)))
    )

    # get plot df
    df_fold_comparison_list = []
    for fold in range(1, 7):
        df_fold = df[df['fold'] == fold]

        df_fold_engineer_filtered = _select_comparison_rows(
            df_fold[df_fold['f_engineer'] == 1],
            ENGINEERED_MLP_FEATURE_COUNTS,
            'feature engineering',
        )
        df_fold_rawdata_filtered = _select_comparison_rows(
            df_fold[df_fold['f_engineer'] != 1],
            RAW_MLP_FEATURE_COUNTS,
            'raw data',
        )

        df_fold_total = pd.concat([df_fold_rawdata_filtered, df_fold_engineer_filtered])
        df_fold_total['fold'] = 'fold' + str(fold) + '.test'

        df_fold_comparison_list.append(df_fold_total)

    final_df = pd.concat(df_fold_comparison_list)

    # plot: one facet per fold, one point per method, coloured by representation
    plot_combined = (ggplot(final_df, aes(x='acc', y='method', color='feature')) +
                    geom_point(shape="D", size=2) +
                    labs(title="dataset: " + dataset_name,
                        x="accuracy percentage",
                        y="method") +
                    facet_wrap('~fold', ncol=3) +
                    geom_vline(xintercept=100, color="black", size=1) +
                    theme_minimal() +
                    theme(legend_position='bottom', text=element_text(size=8)) +
                    # Adjust aspect ratio to decrease the distance between methods
                    theme(aspect_ratio=0.7)
                    )

    # Save the combined plot in both output formats
    for extension in ("pdf", "jpg"):
        plot_combined.save(figures_path + "feature_engineer_comparison." + extension, width=8, height=6)


