In [None]:
import pathlib
import pandas as pd

# Absolute path of the training-set CSV analysed in this notebook.
csvfile = '/dls/labxchem/data/2018/lb18145-80/processing/analysis/eugene/pandda_score/training_data_paths/training_set.csv'

# Load the whole CSV into a DataFrame; the code below reads its
# 'system' and 'remodelled' columns.
df = pd.read_csv(csvfile)



def num_remodelled_vs_not_all_systems(df):
    """Count remodelled vs. not-remodelled rows for every system in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``system`` column and a ``remodelled`` column. Rows with
        ``remodelled == 1`` are counted as remodelled and rows with
        ``remodelled == 0`` as not remodelled.

    Returns
    -------
    list of dict
        One dict per unique ``system`` value (in order of first appearance)
        with the keys ``system``, ``len_remodelled``, ``len_not_remodelled``,
        ``remodelled_percentage`` and ``not_remodelled_percentage``. The two
        "percentage" values are fractions in [0, 1]; the second is defined as
        ``1 - remodelled_percentage``.
    """
    def num_remodelled_vs_not(system):
        # Use a distinct local name here: assigning to ``df`` inside this
        # nested function would make ``df`` local to it and the read on the
        # right-hand side would raise UnboundLocalError.
        system_df = df[df.system == system]
        n_total = len(system_df)
        n_remodelled = len(system_df[system_df['remodelled'] == 1])
        n_not_remodelled = len(system_df[system_df['remodelled'] == 0])

        # A NaN system label never compares equal to itself, so its slice is
        # empty; report NaN instead of raising ZeroDivisionError.
        if n_total:
            remodelled_fraction = n_remodelled / n_total
        else:
            remodelled_fraction = float('nan')

        return {'system': system,
                'len_remodelled': n_remodelled,
                'len_not_remodelled': n_not_remodelled,
                'remodelled_percentage': remodelled_fraction,
                'not_remodelled_percentage': 1 - remodelled_fraction}

    return [num_remodelled_vs_not(system) for system in df.system.unique()]

# Build one summary dict per system, then collect them into a DataFrame
# (one row per system, one column per dict key).
lst = num_remodelled_vs_not_all_systems(df)
data = pd.DataFrame(lst)



In [None]:
# Compute FPR, FNR, TPR, TNR and error rate for every system
from analysis_notebooks.plotting import confusion_matrix

def compute_tp_fp_tn_fn_all_systems(df, threshold):
    """Tally confusion-matrix counts separately for each system in ``df``.

    For every unique value of ``df.system`` the matching rows are handed to
    ``confusion_matrix.compute_true_false_positives_and_negatives`` together
    with ``threshold``; the lengths of the four collections it returns are
    recorded alongside the number of rows for that system.

    Returns a list of dicts (one per system) with the keys ``system``,
    ``frequency``, ``true_pos``, ``true_neg``, ``false_pos`` and ``false_neg``.
    """
    def summarise(system_name):
        rows = df[df.system == system_name]
        row_count = len(rows)
        # The helper returns five values; the fifth (a frame) is not used here.
        tp, tn, fp, fn, _ = confusion_matrix.compute_true_false_positives_and_negatives(
            rows, threshold)

        summary = {'system': system_name, 'frequency': row_count}
        summary['true_pos'] = len(tp)
        summary['true_neg'] = len(tn)
        summary['false_pos'] = len(fp)
        summary['false_neg'] = len(fn)
        return summary

    return [summarise(system_name) for system_name in df.system.unique()]

def compute_tpr_fpr_tnr_fnr_from_frame(tp_fp_tn_fn_frame: pd.DataFrame) -> pd.DataFrame:
    """Add rate columns derived from confusion-matrix counts to a frame.

    Parameters
    ----------
    tp_fp_tn_fn_frame : pandas.DataFrame
        Frame with the count columns ``true_pos``, ``true_neg``,
        ``false_pos`` and ``false_neg`` (one row per group).

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place, with these columns added:

        * ``tpr`` = TP / (TP + FN)
        * ``fpr`` = FP / (FP + TN)
        * ``tnr`` = TN / (TN + FP)
        * ``fnr`` = FN / (FN + TP)
        * ``error_rate`` = (FP + FN) / (TP + TN + FP + FN)

    Rows whose denominator is zero get NaN (pandas division semantics) rather
    than raising.
    """
    tp = tp_fp_tn_fn_frame.true_pos
    tn = tp_fp_tn_fn_frame.true_neg
    fp = tp_fp_tn_fn_frame.false_pos
    fn = tp_fp_tn_fn_frame.false_neg

    # Compute every rate before touching the frame so that all of them are
    # derived from the original count columns.
    tpr = tp / (tp + fn)
    fpr = fp / (fp + tn)
    tnr = tn / (tn + fp)
    fnr = fn / (fn + tp)
    # Error rate is the share of misclassified cases among all classified
    # cases; the denominator is built from the four counts so it does not
    # depend on any other column being present.
    error_rate = (fp + fn) / (tp + tn + fp + fn)

    tp_fp_tn_fn_frame['tpr'] = tpr
    tp_fp_tn_fn_frame['fpr'] = fpr
    tp_fp_tn_fn_frame['tnr'] = tnr
    tp_fp_tn_fn_frame['fnr'] = fnr
    tp_fp_tn_fn_frame['error_rate'] = error_rate

    return tp_fp_tn_fn_frame

# Re-read the training-set CSV (same path as in the first cell).
csvfile = '/dls/labxchem/data/2018/lb18145-80/processing/analysis/eugene/pandda_score/training_data_paths/training_set.csv'
df = pd.read_csv(csvfile)
# Per-system TP/TN/FP/FN counts; 0.17 is the threshold forwarded to
# confusion_matrix.compute_true_false_positives_and_negatives.
lst = compute_tp_fp_tn_fn_all_systems(df, 0.17)
system_tp_fp_tn_fn_frame = pd.DataFrame(lst)
# Adds the tpr/fpr/tnr/fnr/error_rate columns to the frame in place and
# returns that same frame, so both names refer to one object.
system_frame = compute_tpr_fpr_tnr_fnr_from_frame(system_tp_fp_tn_fn_frame)

# Show the per-system table ordered (ascending) by each metric in turn.
print(system_frame.sort_values(by=['fpr']))
print(system_frame.sort_values(by=['fnr']))
print(system_frame.sort_values(by=['error_rate']))


In [None]:
# Compute FPR, FNR, TPR, TNR and error rate for every dataset
def compute_tp_fp_tn_fn_all_datasets(df, threshold):
    """Tally confusion-matrix counts separately for each dataset in ``df``.

    For every unique value of ``df.dtag`` the matching rows are handed to
    ``confusion_matrix.compute_true_false_positives_and_negatives`` together
    with ``threshold``; the lengths of the four collections it returns are
    recorded alongside the number of rows for that dataset.

    Returns a list of dicts (one per dataset) with the keys ``dataset``,
    ``frequency``, ``true_pos``, ``true_neg``, ``false_pos`` and ``false_neg``.
    """
    def summarise(dtag):
        rows = df[df.dtag == dtag]
        row_count = len(rows)
        # The helper returns five values; the fifth (a frame) is not used here.
        tp, tn, fp, fn, _ = confusion_matrix.compute_true_false_positives_and_negatives(
            rows, threshold)

        summary = {'dataset': dtag, 'frequency': row_count}
        summary['true_pos'] = len(tp)
        summary['true_neg'] = len(tn)
        summary['false_pos'] = len(fp)
        summary['false_neg'] = len(fn)
        return summary

    return [summarise(dtag) for dtag in df.dtag.unique()]

# Re-read the training-set CSV (same path as in the earlier cells).
csvfile = '/dls/labxchem/data/2018/lb18145-80/processing/analysis/eugene/pandda_score/training_data_paths/training_set.csv'
df = pd.read_csv(csvfile)
# Per-dataset (dtag) TP/TN/FP/FN counts; 0.17 is the threshold forwarded to
# confusion_matrix.compute_true_false_positives_and_negatives.
lst = compute_tp_fp_tn_fn_all_datasets(df, 0.17)
dataset_tp_fp_tn_fn_frame = pd.DataFrame(lst)
# Adds the tpr/fpr/tnr/fnr/error_rate columns to the frame in place and
# returns that same frame, so both names refer to one object.
dataset_frame = compute_tpr_fpr_tnr_fnr_from_frame(dataset_tp_fp_tn_fn_frame)

# Show the per-dataset table ordered (ascending) by each metric in turn.
print(dataset_frame.sort_values(by=['fpr']))
print(dataset_frame.sort_values(by=['fnr']))
print(dataset_frame.sort_values(by=['error_rate']))

