In [1]:
import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

Helper functions that apply the keep and exclude keyword lists to a single record

In [2]:
def keep(row, columns_to_check, keywords_to_keep):
    """Flag a record whose text mentions at least one keep keyword.

    Matching is a case-insensitive substring test: both the cell text and the
    keywords are lower-cased before comparison.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record to inspect; it is indexed with every name in ``columns_to_check``.
    columns_to_check : iterable of str
        Names of the fields whose text is searched.
    keywords_to_keep : iterable
        Keywords to look for. Missing entries (``None``/NaN) are ignored.

    Returns
    -------
    int
        1 if any keyword occurs in any checked field, otherwise 0.
    """
    # Normalise once per call: lower-case so the comparison is truly
    # case-insensitive, and skip missing keywords (a NaN keyword would
    # otherwise raise TypeError in the ``in`` test).
    needles = [
        str(keyword).lower()
        for keyword in keywords_to_keep
        if not (pd.api.types.is_scalar(keyword) and pd.isna(keyword))
    ]

    for column in columns_to_check:
        value = row[column]
        # A missing cell would stringify to "nan"/"none" and could match a
        # keyword by accident, so it is never treated as text.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        text = str(value).lower()
        if any(needle in text for needle in needles):
            return 1
    return 0

In [3]:
def exclude(row, columns_to_check, keywords_to_exclude):
    """Return a keep-indicator that is cleared when a record mentions an exclude keyword.

    Matching is a case-insensitive substring test: both the cell text and the
    keywords are lower-cased before comparison.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record to inspect; it is indexed with every name in ``columns_to_check``.
    columns_to_check : iterable of str
        Names of the fields whose text is searched.
    keywords_to_exclude : iterable
        Keywords that disqualify a record. Missing entries (``None``/NaN) are ignored.

    Returns
    -------
    int
        0 if any keyword occurs in any checked field, otherwise 1.
    """
    # Normalise once per call: lower-case so the comparison is truly
    # case-insensitive, and skip missing keywords (a NaN keyword would
    # otherwise raise TypeError in the ``in`` test).
    needles = [
        str(keyword).lower()
        for keyword in keywords_to_exclude
        if not (pd.api.types.is_scalar(keyword) and pd.isna(keyword))
    ]

    for column in columns_to_check:
        value = row[column]
        # A missing cell would stringify to "nan"/"none"; without this guard an
        # exclude keyword contained in that string would reject every record
        # that has a blank field.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        text = str(value).lower()
        if any(needle in text for needle in needles):
            return 0
    return 1

Workflow function

In [4]:
def _load_keywords(csv_path):
    """Read a header-less keyword CSV and return its cells as a flat list of lower-case strings."""
    raw = pd.read_csv(csv_path, header=None, index_col=False)
    keywords = []
    for cell in raw.values.flatten().tolist():
        if pd.isna(cell):
            continue  # blank cell, e.g. padding when rows have different lengths
        keywords.append(str(cell).lower())
    return keywords


def _flag_rows(frame, row_func):
    """Call ``row_func`` on every row and return the integer results aligned to ``frame``'s index."""
    # Built from a list rather than DataFrame.apply(axis=1) so that an empty
    # frame still produces an (empty) Series that can be assigned as a column.
    flags = [row_func(row) for _, row in frame.iterrows()]
    return pd.Series(flags, index=frame.index, dtype='int64')


def filter(keep_csv_path, excl_csv_path, path_to_class_path):
    """Classify records with a keep keyword list, then an exclude keyword list.

    NOTE: the name shadows the built-in ``filter``; it is kept because callers
    use it.

    Steps
    -----
    1. Every record gets ``keep_ind`` = 1 if any checked column mentions a keep
       keyword (see ``keep``), else 0. The two groups are written to
       ``output/path_keep=1.csv`` and ``output/path_keep=0.csv``.
    2. Records with ``keep_ind`` == 1 get ``keep_ind_post_excl`` = 0 if any
       checked column mentions an exclude keyword (see ``exclude``), else 1.
       The two groups are written to ``output/path_keep=1_post_excl_list.csv``
       and ``output/path_keep=0_post_excl_list.csv``.

    Parameters
    ----------
    keep_csv_path : str
        Header-less CSV of keep keywords.
    excl_csv_path : str
        Header-less CSV of exclude keywords.
    path_to_class_path : str
        CSV (with header) of the records to classify; it must contain every
        column named in ``columns_to_check`` below.

    Returns
    -------
    None
        Results are only written to disk.
    """
    # Keywords are lower-cased (and blank cells dropped) so they can match the
    # lower-cased record text used by keep()/exclude().
    keep_l = _load_keywords(keep_csv_path)
    excl_l = _load_keywords(excl_csv_path)
    path_to_class = pd.read_csv(path_to_class_path, index_col=False)

    columns_to_check = ['Clinical History', 'Gross Description','Final Diagnosis', 
           'Addendum Reason: Additional Interpretive Information',
           'Addendum Reason: Additional Tissue Samples',
           'Addendum Reason: Electron Microscopy Results',
           'Addendum Reason: Hematopathology Consultation',
           'Addendum Reason: Immunohistochemistry Analysis',
           'Addendum Reason: Outside Consultation',
           'Addendum Reason: See Addendum Text Below',
           'Addendum Reason: Send Out Test',
           'Addendum Reason: Special Stains Results',
           'Additional interpretation/findings', 'Chromosome Analysis',
           'Clinical Indication', 'Clinical Information',
           'Clinicopathologic Correlation', 'Code Of Sections', 'Comments',
           'Correction Reason', 
           'Internal Examination', 'Intraoperative Consultation',
           'Microscopic Description', 'Specimen A', 'Specimen B', 'Specimen C',
           'Specimen Information', 'Specimen Source']

    # All exports go under output/; create it so a fresh checkout does not fail.
    os.makedirs('output', exist_ok=True)

    # --- Step 1: keep keywords -------------------------------------------
    path_to_class['keep_ind'] = _flag_rows(
        path_to_class, lambda row: keep(row, columns_to_check, keep_l)
    )

    # .copy() because path_keep receives a new column below; writing to a
    # filtered view would be chained assignment.
    path_keep = path_to_class[path_to_class['keep_ind']==1].copy()
    path_not_keep = path_to_class[path_to_class['keep_ind']==0]
    # Sanity check: every record must land in exactly one of the two groups.
    if len(path_keep) + len(path_not_keep) == len(path_to_class):
        path_keep.to_csv('output/path_keep=1.csv',index=False)
        path_not_keep.to_csv('output/path_keep=0.csv',index=False)
    else:
        print("Error classifying with keep_keywords!")

    # --- Step 2: exclude keywords, applied only to the kept records --------
    path_keep['keep_ind_post_excl'] = _flag_rows(
        path_keep, lambda row: exclude(row, columns_to_check, excl_l)
    )
    path_keep_post_excl_l = path_keep[path_keep['keep_ind_post_excl']==1]
    path_excl_post_excl_l = path_keep[path_keep['keep_ind_post_excl']==0]
    # Sanity check: the two post-exclusion groups must partition path_keep.
    if len(path_keep_post_excl_l) + len(path_excl_post_excl_l) == len(path_keep):
        path_keep_post_excl_l.to_csv('output/path_keep=1_post_excl_list.csv', index=False)
        path_excl_post_excl_l.to_csv('output/path_keep=0_post_excl_list.csv', index=False)
    else:
        print("Error  classifying with exclude_keywords!")

In [7]:
# Apply the keep and exclude keyword lists to the records in
# path_to_classify_full_021524.csv; the results are written as CSVs under output/.
filter(
    keep_csv_path="keywords_keep.csv",
    excl_csv_path="keywords_exclude.csv",
    path_to_class_path="path_to_classify_full_021524.csv",
)

Sampling function: draw a random sample of kept and not-kept records and export it to CSV

In [8]:
def sample(df, purpose_str, keep_ind, keeps_per_class, nokeeps_per_class, random_state=None):
    """Export a random sample of flagged and unflagged records to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to sample from; must contain the ``keep_ind`` column and the
        four text columns listed in ``export_columns``.
    purpose_str : str
        Label inserted into the output file name.
    keep_ind : str
        Name of the 0/1 indicator column used to split ``df``.
    keeps_per_class : int
        Number of rows to draw where the indicator equals 1.
    nokeeps_per_class : int
        Number of rows to draw where the indicator equals 0.
    random_state : optional
        Passed to ``DataFrame.sample`` for both draws; also appended to the
        output file name.

    Returns
    -------
    None
        The sample is written to ``sample_<purpose_str><random_state>.csv``
        in the current working directory.
    """
    export_columns = ['procedure_name', 'Gross Description', 'Clinical History', 'Final Diagnosis', keep_ind]

    # Split by indicator value, then draw from each side with the same seed.
    flagged_rows = df[df[keep_ind] == 1]
    unflagged_rows = df[df[keep_ind] == 0]
    flagged_draw = flagged_rows.sample(keeps_per_class, random_state=random_state)
    unflagged_draw = unflagged_rows.sample(nokeeps_per_class, random_state=random_state)

    # Flagged rows first, then unflagged; renumber the index from 0.
    stacked = pd.concat([flagged_draw, unflagged_draw])
    sample_out = stacked[export_columns].reset_index(drop=True)

    out_name = ''.join(['sample_', purpose_str, str(random_state), '.csv'])
    sample_out.to_csv(out_name)

Example

In [9]:
# Reload the keep=1 export produced by filter().
# NOTE(review): that file is written with index=False, so index_col=0 turns its
# first data column into the index — confirm this is intended.
path_keep = pd.read_csv('output/path_keep=1.csv', index_col=0)

# Draw 100 rows with keep_ind == 1 and none with keep_ind == 0, seeded with 1.
sample(
    df=path_keep,
    purpose_str="keep_pos",
    keep_ind='keep_ind',
    keeps_per_class=100,
    nokeeps_per_class=0,
    random_state=1,
)   #set keeps_per_class=0 if sampling from keep=0

# Read the exported sample back; its first column is the index sample() wrote.
sample_keep_pos_1 = pd.read_csv('sample_keep_pos1.csv', index_col=0)