In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
#pd.set_option('display.max_columns', None)

### Deliverables
PN vs RN: <br>
    - test hypothesis: indexing “partial nephrectomy”, “deep margin”, “renal mass” from "Final Diagnosis" can reliably classify PN vs RN
        - find out how many PNs (according to gross description keywords) don't have these terms

### Workflow
1. Load path_to_class_022524.csv, transposed pathology_report with gross description, final diagnosis and clinical history columns
2. Classify keep vs. not with "nephrectomy, kidney tumor, renal mass" in "Final Diagnosis"
3. Turn into flow with csv input after finalizing classification rules

### Functions

In [2]:
def sample(df, purpose_str, keep_ind, keeps_per_class, nokeeps_per_class, random_state=None):
    """Draw a per-class validation sample and export it to CSV.

    Rows are split on the indicator column ``keep_ind``: ``keeps_per_class``
    rows are drawn from the ``== 1`` subset and ``nokeeps_per_class`` rows from
    the ``== 0`` subset, both using the same ``random_state``. The combined
    sample (positives first, index renumbered from 0) is restricted to the
    review columns plus ``keep_ind`` and written to
    ``'sample_' + purpose_str + str(random_state) + '.csv'`` in the current
    working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``keep_ind`` and the review columns listed below.
    purpose_str : str
        Tag inserted into the output file name.
    keep_ind : str
        Name of the 0/1 indicator column to stratify on.
    keeps_per_class, nokeeps_per_class : int
        Number of rows to draw from the ``1`` and ``0`` subsets respectively.
    random_state : optional
        Passed to ``DataFrame.sample``; also becomes part of the file name.

    Returns
    -------
    None
        The sample is only written to disk.

    Raises
    ------
    ValueError
        If either class holds fewer rows than requested.
    """
    keep_df = df[df[keep_ind] == 1]  # Subset where keep_ind is 1
    nokeep_df = df[df[keep_ind] == 0]  # Subset where keep_ind is 0

    # Fail before sampling with a message that names the class and the counts;
    # pandas' own error does not say which subset was too small.
    for label, subset, requested in ((1, keep_df, keeps_per_class),
                                     (0, nokeep_df, nokeeps_per_class)):
        if requested is not None and requested > len(subset):
            raise ValueError(
                f"Cannot sample {requested} rows where {keep_ind} == {label}: "
                f"only {len(subset)} available."
            )

    review_columns = ['procedure_name', 'final diagnosis_10950', 'clinical history_9935',
                      'gross description_1230000151', 'gross description_9997', keep_ind]

    sample_1 = keep_df.sample(keeps_per_class, random_state=random_state)  # random_state for reproducibility
    sample_0 = nokeep_df.sample(nokeeps_per_class, random_state=random_state)

    sample_full = pd.concat([sample_1, sample_0])[review_columns].reset_index(drop=True)

    # Export; the renumbered index is written as the first CSV column.
    sample_full.to_csv('sample_' + purpose_str + str(random_state) + '.csv')

In [3]:
def should_keep(row, columns_to_check, keywords_to_keep):
    """Return 1 if any keyword occurs in any of the given columns of ``row``, else 0.

    Matching is a case-insensitive substring test: both the cell text and the
    keywords are lowercased before comparison.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record indexed by column name.
    columns_to_check : iterable of str
        Columns of ``row`` to search.
    keywords_to_keep : iterable of str
        Substrings that mark the row as a keeper.

    Returns
    -------
    int
        1 when at least one keyword is found, otherwise 0.
    """
    # Lowercase once up front so mixed-case keywords can still match the
    # lowercased cell text.
    needles = [keyword.lower() for keyword in keywords_to_keep]
    for column in columns_to_check:
        value = row[column]
        # Missing cells would otherwise be stringified to "nan"/"none"/"nat" and
        # could spuriously match short keywords such as "na".
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        text = str(value).lower()
        if any(needle in text for needle in needles):
            return 1
    return 0

In [4]:
def exclude(row, columns_to_check, keywords_to_exclude):
    """Return 0 if any exclusion keyword occurs in the given columns of ``row``, else 1.

    Matching is a case-insensitive substring test: both the cell text and the
    keywords are lowercased before comparison.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record indexed by column name.
    columns_to_check : iterable of str
        Columns of ``row`` to search.
    keywords_to_exclude : iterable of str
        Substrings that disqualify the row.

    Returns
    -------
    int
        0 when at least one exclusion keyword is found, otherwise 1.
    """
    # Lowercase once up front so mixed-case keywords can still match the
    # lowercased cell text.
    needles = [keyword.lower() for keyword in keywords_to_exclude]
    for column in columns_to_check:
        value = row[column]
        # Missing cells would otherwise be stringified to "nan"/"none"/"nat" and
        # could wrongly exclude a row on short keywords such as "na".
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        text = str(value).lower()
        if any(needle in text for needle in needles):
            return 0
    return 1

In [5]:
def keep_excl(value, keep_keywords=None, exclude_keywords=None):
    """Return 1 if ``value`` contains a keep keyword and no exclusion keyword, else 0.

    Matching is a plain substring test on ``value`` as given (no lowercasing).

    Parameters
    ----------
    value : str
        Text to classify. Missing values (None/NaN) are classified as 0.
    keep_keywords : iterable of str, optional
        Substrings that qualify the text. When omitted, the notebook-level
        ``keywords_to_keep`` is used.
    exclude_keywords : iterable of str, optional
        Substrings that disqualify the text. When omitted, the notebook-level
        ``keywords_exclude`` is used.

    Returns
    -------
    int
        1 when a keep keyword is present and no exclusion keyword is, otherwise 0.
    """
    # NOTE(review): the global fallbacks are not defined in the visible part of
    # this notebook; pass the lists explicitly or define them before calling.
    if keep_keywords is None:
        keep_keywords = keywords_to_keep
    if exclude_keywords is None:
        exclude_keywords = keywords_exclude

    # A NaN cell is a float, and ``keyword in value`` would raise TypeError on it.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return 0

    if any(keyword in value for keyword in keep_keywords):
        if not any(exclude_keyword in value for exclude_keyword in exclude_keywords):
            return 1
    return 0

### 1. Load path reports with all external_names, retain rows where value contains "kidney/renal"

In [6]:
# Load the pathology-report table to classify; the first CSV column becomes the index.
path_to_class_pnrn = pd.read_csv("path_to_class_022524.csv", index_col=0)

In [7]:
# Report how many distinct pat_id values the loaded table contains.
print(f"{path_to_class_pnrn.pat_id.nunique()} patients with pathology reports to classify")

1158 patients with pathology reports to classify


### 2. Classify PN vs. RN with 
partial nephrectomy: indexed with “partial nephrectomy”, “deep margin”, “renal mass” in "Final Diagnosis" <br>
How many pn=1 (according to gross description keywords) don't have these terms in Final Diagnosis?

3a. Define Keywords

In [8]:
# Keyword lists are lowercase because should_keep lowercases the cell text
# before doing its substring test.
# fd: final diagnosis -- substrings searched for in the final diagnosis column.
pn_fd = ['partial nephrectomy', 'deep margin', 'renal mass'] #fd: final diagnosis 
# gd: gross description -- word stems searched for in the gross description columns.
pn_gd = ["partial nephrect", "heminephrect", "hemi-nephrect"] #gd: gross description 

Classify based on Gross Description

In [9]:
# PN indicator from the gross description: 1 when either gross description
# column contains one of the pn_gd keywords.
path_to_class_pnrn['pn_gd_ind'] = path_to_class_pnrn.apply(
    should_keep,
    axis=1,
    args=(['gross description_9997', 'gross description_1230000151'], pn_gd),
)
# PN indicator from the final diagnosis: 1 when the final diagnosis column
# contains one of the pn_fd keywords.
path_to_class_pnrn['pn_fd_ind'] = path_to_class_pnrn.apply(
    should_keep,
    axis=1,
    args=(['final diagnosis_10950'], pn_fd),
)

Instances of pn_gd_ind = 1 but "final diagnosis" does not contain ['partial nephrectomy', 'deep margin', 'renal mass']

In [10]:
# Rows flagged as PN by the gross description keywords but not by the
# final diagnosis keywords, renumbered from 0.
path_pngd_nofdkeys = (
    path_to_class_pnrn
    .loc[lambda frame: frame['pn_gd_ind'].eq(1) & frame['pn_fd_ind'].eq(0)]
    .reset_index(drop=True)
)

In [11]:
# Summarise the disagreement: number of rows and number of distinct pat_id values
# where pn_gd_ind is 1 but pn_fd_ind is 0.
print(f"{len(path_pngd_nofdkeys)} observations concerning {path_pngd_nofdkeys.pat_id.nunique()} patients are classified as PN under pn_gd but not pn_fd.")

12 observations concerning 10 patients are classified as PN under pn_gd but not pn_fd.


### Validate: sample of 100 using final diagnosis to classify

In [13]:
# Only rows with a non-null final diagnosis can be validated against pn_fd_ind.
path_to_class_pnrn_valid_fd = path_to_class_pnrn[path_to_class_pnrn['final diagnosis_10950'].notna()]
# Draw 50 rows with pn_fd_ind == 1 and 50 with pn_fd_ind == 0 for manual review;
# sample() writes them to 'sample_pn_fd_8.csv' (purpose tag + random_state).
sample(path_to_class_pnrn_valid_fd, "pn_fd_", 'pn_fd_ind', 50, 50, random_state=8)
# To read the exported sample back:
# sample_pn_fd = pd.read_csv("sample_pn_fd_8.csv", index_col=0)

All correct!