This notebook 1) aggregates and averages the five-fold results and 2) implements the model selection procedure based on the repeated cross-validation results. Final outputs are saved as "select_perf.csv."

In [None]:
#Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#embed fonts
import matplotlib
matplotlib.rc('pdf', fonttype=42)
import scipy.stats
import glob
import json
from tqdm import tqdm

## load all results

In [None]:

rdir = 'results_2020-09-14/'
long_vars = ['pred','pred_proba']
frames, frames_long = [], []
feat_frames, feat_frames_long = [], []
# Read every per-run JSON result. Files whose path contains 'Feat' are kept
# apart from the benchmark models so they can be down-selected separately.
for json_path in tqdm(glob.glob('../'+rdir+'/*/*/*.json')):
    with open(json_path,'r') as fh:
        results = json.load(fh)
    destination = feat_frames if 'Feat' in json_path else frames
    destination.append(results)


In [None]:
# One row per loaded JSON record; benchmark and FEAT runs stay in separate tables.
feat_df_results = pd.DataFrame.from_records(feat_frames)
df_results = pd.DataFrame.from_records(frames)
print('df_results columns:', df_results.columns)
print('models:', df_results['model'].unique())

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the raw FEAT results.
feat_df_results.info()

## down-select FEAT models from runs using heuristic procedure

In [None]:
from model_selection import select_feat_models, smallest_of_best_three_quartiles

# Reduce the FEAT runs with the project's select_feat_models helper, using the
# 'smallest_of_best_three_quartiles' selection rule.
feat_df_results_reduced = select_feat_models(feat_df_results, method= smallest_of_best_three_quartiles)
# Alternative selection rules that were tried earlier and are currently disabled
# (presumably also provided by model_selection -- verify before re-enabling;
# they are not imported above):
#   smallest_of_best_half, best_of_smallest_half, best, smallest
# When comparing rules, each reduced frame was tagged with a 'selection'
# column holding the rule name, e.g.
#   feat_df_results_reduced['selection'] = 'smallest_of_best_half'
# Calling select_feat_models(feat_df_results) without `method` was also tried.

In [None]:
# Show the distinct targets present in the down-selected FEAT results.
feat_df_results_reduced.target.unique()
# For comparison against the unreduced table:
# feat_df_results.target.unique()

In [None]:
# combine dataframes
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# performs the same row-wise stacking (original row indices are preserved and
# the column sets are unioned).
df_results = pd.concat([df_results, feat_df_results_reduced])

In [None]:
# NOTE: a notebook cell only displays its last expression, so the NaN check on
# the next line is evaluated but not shown; only the model list is displayed.
df_results.isna().any()
df_results.model.unique()

In [None]:
# Per-column flag: True where the combined results contain at least one missing value.
df_results.isna().any()

## make nice labels

In [None]:
# (raw model name, short display label) pairs, kept side by side so the two
# lists cannot drift out of step.
_model_label_pairs = [
    ('RandomForest', 'RF'),
    ('DecisionTree', 'DT'),
    ('Feat_boolean', 'FEAT'),
    ('Feat_boolean_L1', 'FEAT L1'),
    ('GaussianNaiveBayes', 'GNB'),
    ('LogisticRegression_L2', 'LR L2'),
    ('LogisticRegression_L1', 'LR L1'),
]
models, model_nice = (list(column) for column in zip(*_model_label_pairs))
nice_model_labels = dict(zip(models, model_nice))
# Direct dictionary lookup: a model name missing from the table raises KeyError.
df_results['model_nice'] = df_results['model'].apply(
    lambda raw_name: nice_model_labels[raw_name]
)


In [None]:

# Target (phenotype) identifiers as they appear in the results, heuristic
# labels first, then diagnosis labels.
dnames = [
    'HtnHeuri',
    'HtnHypoKHeuri',
    'ResHtnHeuri',
    'Htndx',
    'HtnHypoKdx',
    'ResHtndx',
]
# Human-readable titles, in the same order as `dnames`.
dnames_nice = [
    'HTN Heuristic',
    'HTN-Hypokalemia Heuristic',
    'Resistant HTN Heuristic',
    'HTN Diagnosis',
    'HTN-Hypokalemia Diagnosis',
    'Resistant HTN Diagnosis',
]
dnames_to_nice = dict(zip(dnames, dnames_nice))

### export df_results

In [None]:
#Save output to select_perf.csv
import os
# makedirs(exist_ok=True) avoids the check-then-create race of
# os.path.exists + os.mkdir and also creates any missing parent directories.
os.makedirs(rdir, exist_ok=True)
df_results.to_csv(os.path.join(rdir, 'select_perf.csv'))

## Average values by run (average all folds)

In [None]:
# Model size must be numeric so that it is included in the average.
df_results['size'] = df_results['size'].astype(float)
# Average the folds of each (model, target, run). numeric_only=True restricts
# the mean to numeric columns; without it pandas >= 2.0 raises a TypeError on
# string columns such as 'model' instead of silently dropping them.
df_results_ave = (
    df_results
    .groupby(['model_nice','target','RunID'], as_index=False)
    .mean(numeric_only=True)
)
df_results_ave.head()

In [None]:
# Display the run-averaged results table.
df_results_ave

## make table in docx

In [None]:
from docx import Document
from docx.shared import Inches, Pt


def _median_iqr(values):
    """Format a series as 'median (+-IQR)' with two decimals each."""
    spread = values.quantile(0.75) - values.quantile(0.25)
    return '{:0.2f} (+-{:0.2f})'.format(values.median(), spread)


document = Document()

# Header row.
table = document.add_table(rows=1, cols=5)
column_titles = (
    'Phenotype',
    'Method',
    'Median CV AUPRC (IQR)',
    'Median CV AUROC (IQR)',
    'Median Size (IQR)',
)
for header_cell, title in zip(table.rows[0].cells, column_titles):
    header_cell.text = title

# One row per (phenotype, method); the phenotype name is written only on the
# fourth of the six method rows of each phenotype.
table_models = ['GNB','DT','LR L1','LR L2','RF','FEAT']
for target in dnames:
    dft = df_results_ave.loc[df_results_ave.target == target]
    for position, model in enumerate(table_models, start=1):
        dftm = dft.loc[dft.model_nice == model]
        cells = table.add_row().cells
        if position == 4:
            cells[0].text = dnames_to_nice[target]
        cells[1].text = model
        cells[2].text = _median_iqr(dftm['average_precision_score_test'])
        cells[3].text = _median_iqr(dftm['roc_auc_score_test'])
        cells[4].text = _median_iqr(dftm['size'])

document.add_page_break()
if not os.path.exists('tables'):
    os.mkdir('tables')
document.save('tables/Table_1_model_performance.docx')


In [None]:
# NOTE: only the last expression of a cell is displayed, so the model list on
# the next line is evaluated but not shown.
df_results_ave.model_nice.unique()
# Targets for which run-averaged FEAT results exist.
df_results_ave[df_results_ave.model_nice=='FEAT'].target.unique()

## make boxplots

In [None]:
import os 
import math
stat_to_nice = {
    'average_precision_score_train':'AUPRC, Train',
    'average_precision_score_test':'AUPRC, Test',
    'size':'Model Size'
}

def make_boxplot(df, stats, targets=None, order=None, hue=None, hue_order=None, name=None):
    """Draw a grid of per-target box plots of `stats` by model and save it.

    Parameters
    ----------
    df : DataFrame
        Results to plot. Must contain the columns 'target' and 'model_nice',
        every column named in `stats`, and `hue` when given.
    stats : str or list of str
        Column(s) to plot on the y axis. A 'size' column is drawn on a log
        scale.
    targets : list of str, optional
        Targets to plot, one panel per target. Defaults to the module-level
        `dnames`, laid out three panels per row when a single stat is given.
        Otherwise the grid has one row per stat and one column per target.
    order : list of str, optional
        Order of the models on the x axis.
    hue, hue_order : optional
        Passed through to seaborn. Without `hue` the boxes are drawn white;
        with it the 'colorblind' palette is used.
    name : str, optional
        File stem for the saved figure. Defaults to
        'boxplot_<targets>_<stats>'.

    The figure is written to figs/<rdir>/ as .png, .pdf, .eps and .svg.
    """
    if not isinstance(stats, list):
        stats = [stats]
    if targets is None:
        targets = dnames
    targets = list(targets)
    if len(stats) == 1 and targets == list(dnames):
        # default overview figure: wrap the targets three per row
        ncols = 3
        nrows = math.ceil(len(targets) / ncols)
    else:
        nrows = len(stats)
        ncols = len(targets)
    if order is None:
        order = ['GNB','DT', 'LR L2','LR L1','RF','FEAT']
    if hue is None:
        color = 'w'
        palette = None
    else:
        color = None
        palette = 'colorblind'

    sns.set_style('whitegrid')
    fig = plt.figure(figsize=(5*ncols,4*nrows))

    n_panels = nrows * ncols
    panel = 0
    for stat in stats:
        for target in targets:
            panel += 1
            # Plot the rows of the frame that was passed in, not a global.
            df_target = df[df.target == target]
            ax = fig.add_subplot(nrows, ncols, panel)
            sns.boxplot(x="model_nice",
                        y=stat,
                        data=df_target,
                        color=color,
                        ax=ax,
                        order=order,
                        notch=True,
                        hue=hue,
                        hue_order=hue_order,
                        palette=palette
                       )
            # make box edges black
            # NOTE(review): newer matplotlib/seaborn versions may store the
            # boxes in ax.patches rather than ax.artists, in which case this
            # loop does nothing -- confirm against the installed versions.
            for j, box in enumerate(ax.artists):
                box.set_edgecolor('black')
                # iterate over whiskers and median lines
                for k in range(6*j, 6*(j+1)):
                    ax.lines[k].set_color('black')

            # TODO:
                # rename titles of datasets
                # same y scaling?
                # smaller names of models?
            # keep a legend (if any) on the last panel only
            if panel != n_panels and ax.legend_ is not None:
                ax.legend_.remove()

            # y label only on the first panel of each row
            if (panel - 1) % ncols == 0:
                if stat in stat_to_nice:
                    ax.set_ylabel(stat_to_nice[stat])
            else:
                ax.set_ylabel('')
            # model names only under the bottom row
            if panel <= ncols * (nrows - 1):
                ax.set_xticklabels([])
            else:
                ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=0)
            ax.set_xlabel('')
            if stat == 'size':
                ax.set_yscale('log')
            ax.set_title(dnames_to_nice[target])
    # save figure
    fig.tight_layout()
    out_dir = os.path.join('figs', rdir)
    # also creates 'figs' itself when it does not exist yet
    os.makedirs(out_dir, exist_ok=True)
    if name is not None:
        stem = name
    else:
        stem = 'boxplot_' + '_'.join(targets + stats)
    for filetype in ['.png','.pdf','.eps','.svg']:
        fig.savefig(os.path.join(out_dir, stem + filetype), dpi=400)

In [None]:
# hue_order = ['best','smallest_of_best_half','best_of_smallest_half','smallest']
# make_boxplot(df_results_ave.loc[df_results_ave.model_nice == 'FEAT'], 'average_precision_score_test',
#              order=['FEAT'], hue='selection', 
#              hue_order = hue_order, 
#              name = 'boxplot_FEAT_selection_comparison_aps')
# make_boxplot(df_results_ave.loc[df_results_ave.model_nice == 'FEAT'], 'size', 
#              order=['FEAT'], hue='selection',
#              hue_order=hue_order,
#              name = 'boxplot_FEAT_selection_comparison_size'
#             )

In [None]:
df_to_plot = df_results_ave
# One figure per statistic, each covering the default set of targets.
for stat_name in ('average_precision_score_train',
                  'average_precision_score_test',
                  'size'):
    make_boxplot(df_to_plot, stat_name)

### make specific subplot boxplots

In [None]:
# Pass the run-averaged table explicitly so these panels show the same
# per-run values as the overview figures (one point per model/target/RunID).
make_boxplot(df_results_ave, 'average_precision_score_test', targets = dnames[:3])
make_boxplot(df_results_ave, 'size', targets = dnames[:3])

In [None]:
# Run-averaged values again: test AUPRC on the top row, model size below.
make_boxplot(df_results_ave, ['average_precision_score_test','size'], targets = dnames[1:3])

## Pareto tradeoffs

In [None]:
stat_to_nice = {
    'average_precision_score_train':'AUPRC, Train',
    'average_precision_score_test':'AUPRC, Test',
    'size':'Model Size'
}
order = ['GNB','DT', 'LR L2','LR L1','RF','FEAT']
sns.set_style('whitegrid')
fig = plt.figure(figsize=(12,6))
# NOTE: `markers` and `marker_choice` are defined but not passed to seaborn
# below; the default marker cycle is used.
markers = ('s','^','h','p','D','o')
marker_choice = {
    'GNB':'s',
    'DT':'^',
    'LR L2':'h',
    'LR L1':'p',
    'RF':'D',
    'FEAT':'o'
}
i = 1
for target in dnames:
    df = df_results_ave[df_results_ave.target==target]
    print('df models:',df.model_nice.unique())
    df = df[df.model_nice.isin(order)]
    print('df models order filter:',df.model_nice.unique())

    ax = fig.add_subplot(2,3,i)

    # style_order pins the marker assigned to each model so that the single
    # legend on the last panel is valid for every panel, even when a model is
    # missing from one target.
    g = sns.scatterplot(data=df,
                        x='size',
                        y='average_precision_score_test',
                        hue='model_nice',
                        hue_order=order,
                        palette='colorblind',
                        style='model_nice',
                        style_order=order,
                        ax=ax,
                       )
    ax.set_xscale('log')
    if i != len(dnames):
        # seaborn draws no legend for an empty panel
        if ax.legend_ is not None:
            ax.legend_.remove()
    else:
        # Rebuild the legend without a title, pairing each handle with the
        # label seaborn gave it instead of assuming positional order.
        handles, labels = ax.get_legend_handles_labels()
        entries = [(h, l) for h, l in zip(handles, labels) if l in order]
        if entries:
            ax.legend(*zip(*entries))

    if i in [1,4]:
        ax.set_ylabel(stat_to_nice['average_precision_score_test'])
    else:
        ax.set_ylabel('')
    if i > 3:
        ax.set_xlabel(stat_to_nice['size'])
    else:
        ax.set_xlabel('')
    ax.set_title(dnames_to_nice[target])
    i += 1
# save figure
fig.tight_layout()
out_dir = os.path.join('figs', rdir)
# also creates 'figs' itself when it does not exist yet
os.makedirs(out_dir, exist_ok=True)
for filetype in ['.png','.pdf','.eps','.svg']:
    fig.savefig(os.path.join(out_dir, 'pareto_plot_all' + filetype), dpi=400)

## Pareto tradeoffs, both objectives minimized

In [None]:
stat_to_nice = {
    'average_precision_score_train':'AUPRC, Train',
    'average_precision_score_test':'AUPRC, Test',
    'size':'Model Size'
}
order = ['GNB','DT', 'LR L2','LR L1','RF','FEAT']
sns.set_style('whitegrid')
fig = plt.figure(figsize=(12,6))
# NOTE: `markers` and `marker_choice` are defined but not passed to seaborn
# below; the default marker cycle is used.
markers = ('s','^','h','p','D','o')
marker_choice = {
    'GNB':'s',
    'DT':'^',
    'LR L2':'h',
    'LR L1':'p',
    'RF':'D',
    'FEAT':'o'
}
i = 1
for target in dnames:
    df = df_results_ave[df_results_ave.target==target]
    print('df models:',df.model_nice.unique())
    # .copy() so the new column below is added to an independent frame rather
    # than to a slice of df_results_ave (avoids SettingWithCopyWarning).
    df = df[df.model_nice.isin(order)].copy()
    print('df models order filter:',df.model_nice.unique())

    ax = fig.add_subplot(2,3,i)

    # Plot 1 - AUPRC so that both axes are "lower is better".
    df['neg_average_precision_score_test'] = 1 - df['average_precision_score_test']
    # style_order pins the marker assigned to each model so that the single
    # legend on the last panel is valid for every panel.
    g = sns.scatterplot(data=df,
                        x='size',
                        y='neg_average_precision_score_test',
                        hue='model_nice',
                        hue_order=order,
                        palette='colorblind',
                        style='model_nice',
                        style_order=order,
                        ax=ax,
                       )
    ax.set_xscale('log')
    if i != len(dnames):
        # seaborn draws no legend for an empty panel
        if ax.legend_ is not None:
            ax.legend_.remove()
    else:
        # Rebuild the legend without a title, pairing each handle with the
        # label seaborn gave it instead of assuming positional order.
        handles, labels = ax.get_legend_handles_labels()
        entries = [(h, l) for h, l in zip(handles, labels) if l in order]
        if entries:
            ax.legend(*zip(*entries))

    if i in [1,4]:
        ax.set_ylabel('1 - ' +stat_to_nice['average_precision_score_test'])
    else:
        ax.set_ylabel('')
    if i > 3:
        ax.set_xlabel(stat_to_nice['size'])
    else:
        ax.set_xlabel('')
    ax.set_title(dnames_to_nice[target])
    i += 1
# save figure
fig.tight_layout()
out_dir = os.path.join('figs', rdir)
# also creates 'figs' itself when it does not exist yet
os.makedirs(out_dir, exist_ok=True)
for filetype in ['.png','.pdf','.eps','.svg']:
    fig.savefig(os.path.join(out_dir, 'pareto_plot_all_minimized' + filetype), dpi=400)

## statistical tests

In [None]:
#Statistical test for AUPRCs modeling Heuristics (Wilcoxon signed-rank, paired)
from scipy.stats import wilcoxon
from statannot import add_stat_annotation
import pdb
sns.set_style('whitegrid')
def stats_test_box(df, datasetses, comparisons):
    """Box-plot each metric by model and annotate FEAT-vs-model test results.

    One panel is drawn per (dataset group, metric) pair, all in a single row.
    Inside a panel the models are ranked within every (target, RunID) on the
    metric, and FEAT's ranks are compared with each other model's ranks using
    scipy.stats.wilcoxon (two-sided). That test is the paired signed-rank
    test: samples are matched by position, so `df` must hold the same
    (target, RunID) rows in the same order for every model.

    Parameters
    ----------
    df : DataFrame
        Must contain 'model_nice', 'target', 'RunID' and every column named
        in `comparisons`.
    datasetses : list of list of str
        Groups of target names; each group is pooled into one panel.
    comparisons : list of str
        Metric columns. 'size' is ranked ascending (smallest gets rank 1);
        every other metric is ranked descending.

    Returns
    -------
    dict
        Maps ('FEAT', model) to the p-value for the LAST panel drawn only.
        The p-values are not corrected for multiple comparisons. Empty when
        no panel was drawn.

    The figure is written to figs/<rdir>/ as .png, .pdf, .eps and .svg.
    """
    fig = plt.figure(figsize=(14,7))
    n_panels = len(datasetses) * len(comparisons)
    p_adjusted = {}
    plot_no = 0
    for datasets in datasetses:
        for comparison in comparisons:
            plot_no += 1
            ax = fig.add_subplot(1, n_panels, plot_no)

            rank_name = comparison+'_rank'
            # filter to selected datasets; copy so that adding the rank column
            # does not write into a slice of the caller's frame
            df_filt = df[df.target.isin(datasets)].copy()
            # get ranks
            ascending = comparison == 'size'
            df_filt[rank_name] = (
                df_filt.groupby(['target','RunID'])[comparison].rank(ascending=ascending)
            )

            # numeric_only keeps pandas >= 2.0 from raising on string columns
            df_median = (
                df_filt.groupby(['model_nice','target','RunID'])
                .median(numeric_only=True)
                .reset_index()
            )

            # get FEAT ranks
            is_feat = df_filt.model_nice == 'FEAT'
            rank1 = df_filt.loc[is_feat, rank_name]
            score1 = df_median[df_median.model_nice == 'FEAT'][comparison]

            p = {}
            for model, dfm2 in df_filt[~is_feat].groupby('model_nice'):
                rank2 = dfm2[rank_name]
                score2 = df_median[df_median.model_nice == model][comparison]
                # Wilcoxon signed-rank test; arrays make the positional
                # pairing explicit (no index alignment)
                _, p[('FEAT', model)] = wilcoxon(rank1.to_numpy(), rank2.to_numpy(),
                                                 alternative='two-sided')
                # print the difference in mean score relative to FEAT's mean
                print(datasets,'difference in',comparison,', Feat vs',model,':',
                      (score1.mean()-score2.mean())/(score1.mean())
                     )

            # No multiple-comparison (e.g. Bonferroni) correction is applied.
            p_adjusted = p

            order = ['GNB','DT','LR L2','LR L1','FEAT','RF']
            g = sns.boxplot(data=df_filt, x='model_nice', y=comparison,
                            order=order,
                            fliersize=0,  # hide outlier markers
                            ax=ax,
                            color='w'
                           )
            # make box edges black
            # NOTE(review): newer matplotlib/seaborn versions may store the
            # boxes in ax.patches rather than ax.artists, in which case this
            # loop does nothing -- confirm against the installed versions.
            for j, box in enumerate(ax.artists):
                box.set_edgecolor('black')
                # iterate over whiskers and median lines
                for k in range(6*j, 6*(j+1)):
                    ax.lines[k].set_color('black')
            ax.set_xlabel('')
            ax.set_ylabel(comparison.replace('_',' ').title())
            if comparison == 'size':
                ax.set_yscale('log')
            if len(datasets) == 3 and all('Heuri' in d for d in datasets):
                ax.set_title('Heuristics')
            elif len(datasets) == 3 and all('dx' in d for d in datasets):
                ax.set_title('Chart Review')
            else:
                ax.set_title(','.join(datasets))

            # Annotate every tested pair whose boxes are actually drawn; a
            # pair naming a model outside `order` has nothing to attach to.
            drawable = {bp: pv for bp, pv in p_adjusted.items()
                        if all(m in order for m in bp)}
            if drawable:
                box_pairs, pvalues = zip(*drawable.items())
                add_stat_annotation(g, data=df_median, x='model_nice', y=comparison,
                                    box_pairs=box_pairs,
                                    pvalues=pvalues,
                                    perform_stat_test=False,
                                    order=order,
                                    text_format='simple',
                                    show_test_name=False,
                                    pvalue_thresholds = [
                                                         [1e-6, "1e-6 ****"],
                                                         [1e-5, "1e-5 ***"],
                                                         [1e-4, "1e-4 **"],
                                                         [1e-3, "1e-3 *"],
                                                         [0.01, "1e-2 "],
                                                         [0.1, "1e-1"],
                                                         [1, "1 (ns)"],
                                                        ],
                                   )
    out_dir = os.path.join('figs', rdir)
    # also creates 'figs' itself when it does not exist yet
    os.makedirs(out_dir, exist_ok=True)
    fig.tight_layout()
    stem = ('rankings_boxplot_' + '-'.join(comparisons)
            + '_'.join('-'.join(d) for d in datasetses))
    for filetype in ['.png','.pdf','.eps','.svg']:
        fig.savefig(os.path.join(out_dir, stem + filetype), dpi=400)

    return p_adjusted


In [None]:
#Statistical test for AUPRCs modeling Heuristics (Wilcoxon signed-rank, paired)
from scipy.stats import wilcoxon
from statannot import add_stat_annotation
import pdb
sns.set_style('whitegrid')
def stats_test_bar(df, datasetses, comparisons):
    """Bar-plot each metric by model and annotate FEAT-vs-model test results.

    One panel is drawn per (dataset group, metric) pair: one row per dataset
    group and one column per metric. Inside a panel the models are ranked
    within every (target, RunID) on the metric, and FEAT's ranks are compared
    with each other model's ranks using scipy.stats.wilcoxon (two-sided).
    That test is the paired signed-rank test: samples are matched by
    position, so `df` must hold the same (target, RunID) rows in the same
    order for every model.

    Parameters
    ----------
    df : DataFrame
        Must contain 'model_nice', 'target', 'RunID' and every column named
        in `comparisons`.
    datasetses : list of list of str
        Groups of target names; each group is pooled into one panel.
    comparisons : list of str
        Metric columns. 'size' is ranked ascending (smallest gets rank 1);
        every other metric is ranked descending.

    Returns
    -------
    dict
        Maps ('FEAT', model) to the p-value for the LAST panel drawn only.
        The p-values are not corrected for multiple comparisons. Empty when
        no panel was drawn.

    The figure is written to figs/<rdir>/ as .png, .pdf, .eps and .svg.
    """
    fig = plt.figure(figsize=(7,7))
    p_adjusted = {}
    plot_no = 0
    for datasets in datasetses:
        for comparison in comparisons:
            plot_no += 1
            # Panels are filled row by row with the metric varying fastest,
            # so the grid needs one row per dataset group.
            ax = fig.add_subplot(len(datasetses), len(comparisons), plot_no)

            rank_name = comparison+'_rank'
            # filter to selected datasets; copy so that adding the rank column
            # does not write into a slice of the caller's frame
            df_filt = df[df.target.isin(datasets)].copy()
            # get ranks
            ascending = comparison == 'size'
            df_filt[rank_name] = (
                df_filt.groupby(['target','RunID'])[comparison].rank(ascending=ascending)
            )

            # numeric_only keeps pandas >= 2.0 from raising on string columns
            df_median = (
                df_filt.groupby(['model_nice','target','RunID'])
                .median(numeric_only=True)
                .reset_index()
            )

            # get FEAT ranks
            is_feat = df_filt.model_nice == 'FEAT'
            rank1 = df_filt.loc[is_feat, rank_name]
            score1 = df_median[df_median.model_nice == 'FEAT'][comparison]

            p = {}
            for model, dfm2 in df_filt[~is_feat].groupby('model_nice'):
                rank2 = dfm2[rank_name]
                score2 = df_median[df_median.model_nice == model][comparison]
                # Wilcoxon signed-rank test; arrays make the positional
                # pairing explicit (no index alignment)
                _, p[('FEAT', model)] = wilcoxon(rank1.to_numpy(), rank2.to_numpy(),
                                                 alternative='two-sided')
                # print the difference in mean score relative to FEAT's mean
                print(datasets,'difference in',comparison,', Feat vs',model,':',
                      (score1.mean()-score2.mean())/(score1.mean())
                     )

            # No multiple-comparison (e.g. Bonferroni) correction is applied.
            p_adjusted = p

            order = ['GNB','DT','LR L2','LR L1','FEAT','RF']
            g = sns.barplot(data=df_filt, x='model_nice', y=comparison,
                            order=order,
                            edgecolor=(0,0,0),
                            ax=ax,
                            color='w'
                           )
            ax.set_xlabel('')
            if 'precision' in comparison.lower():
                ax.set_ylabel('AUPRC, Test')
            else:
                ax.set_ylabel(comparison.replace('_',' ').title())
            if comparison == 'size':
                ax.set_yscale('log')
            if len(datasets) == 3 and all('Heuri' in d for d in datasets):
                ax.set_title('Heuristics')
            elif len(datasets) == 3 and all('dx' in d for d in datasets):
                ax.set_title('Chart Review')
            else:
                ax.set_title(','.join(datasets))

            # Annotate every tested pair whose bars are actually drawn; a
            # pair naming a model outside `order` has nothing to attach to.
            drawable = {bp: pv for bp, pv in p_adjusted.items()
                        if all(m in order for m in bp)}
            if drawable:
                box_pairs, pvalues = zip(*drawable.items())
                add_stat_annotation(g, data=df_median, x='model_nice', y=comparison,
                                    box_pairs=box_pairs,
                                    pvalues=pvalues,
                                    perform_stat_test=False,
                                    order=order,
                                    text_format='simple',
                                    show_test_name=False,
                                    pvalue_thresholds = [
                                                         [1e-6, "1e-6 ****"],
                                                         [1e-5, "1e-5 ***"],
                                                         [1e-4, "1e-4 **"],
                                                         [1e-3, "1e-3 *"],
                                                         [0.01, "1e-2 "],
                                                         [0.1, "1e-1"],
                                                         [1, "1 (ns)"],
                                                        ],
                                   )
    out_dir = os.path.join('figs', rdir)
    # also creates 'figs' itself when it does not exist yet
    os.makedirs(out_dir, exist_ok=True)
    fig.tight_layout()
    stem = ('rankings_barplot_' + '-'.join(comparisons)
            + '_'.join('-'.join(d) for d in datasetses))
    for filetype in ['.png','.pdf','.eps','.svg']:
        fig.savefig(os.path.join(out_dir, stem + filetype), dpi=400)

    return p_adjusted


In [None]:
import pandas as pd
# Target groups: heuristic labels vs. diagnosis labels.
heuristics = ['HtnHeuri', 'HtnHypoKHeuri', 'ResHtnHeuri']
diagnoses = ['Htndx', 'HtnHypoKdx', 'ResHtndx']
datasetses = [heuristics, diagnoses]
comparisons = ['average_precision_score_test', 'size']
pvalues = []
# Split the run-averaged results into FEAT rows and all other models.
is_feat_row = df_results_ave.model_nice == 'FEAT'
df_feat_ave = df_results_ave.loc[is_feat_row]
df_other_ave = df_results_ave.loc[~is_feat_row]
print(datasetses, comparisons)
# Bar plots with both target groups pooled per panel.
result = stats_test_bar(df_results_ave, datasetses, comparisons)

# Box plots with one panel per individual diagnosis target.
stats_test_box(df_results_ave, [[d] for d in diagnoses], comparisons)
# pdict['Comparison'] = comparison
# pdict['Datasets'] = ', '.join(datasets)
# pdict.update(result)
# pvalues.append(pdict)
# print(p_df)