In [120]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import re
import difflib as dl
import nltk
from sklearn.model_selection import train_test_split

In [121]:
data = pd.read_csv("Mammogram Path Reports.csv")
data.columns = ["Path Report", "Label"]
numPatients = data.shape[0]
path_reports = pd.read_csv("Path Reports Complete.csv")
split_reps = pd.read_csv("Path Reports (By Specimens).csv").drop("Unnamed: 0", axis=1)

In [122]:
# natural language processing tools

# Negation Processing
def negate_sequence(tokens):
    """
    Detects negations and transforms negated words into "not_" form.
    """
    negation = False
    delims = "?.,!:;"
    result = []
    
    for token in tokens:
        if any(c == token for c in delims):
            negation = False
        
        negated = "not_" + token if negation else token
        result.append(negated)

        if any(neg == token.lower() for neg in ["not", "n't", "no"]):
            negation = not negation   

    return result

In [123]:
re.compile(r'(\.|\;|\-|\,)').sub(' ', 'hello.my;name-is')

'hello my name is'

## Classify Each Biopsy according to Path Tree

In [124]:
#split_reps["metastasis" in split_reps["Single Label"]]

In [163]:
def classify_regex(pathRep, organ):
    pathRep = ' '.join(negate_sequence(nltk.word_tokenize(pathRep.lower())))
    obs = re.split(re.compile(r"[0-9]\. "), pathRep)
    if len(obs) > 1:
        obs = obs[1:]
    
    obs = [re.compile(r'(\.|\;|\-|\,)').sub(' ', ' ' + ob + ' ') for ob in obs] 
    # pad with spaces to match markers that come at beginning of strings and remove dashes and commas
    classes = []
    for ob in obs:
        classes.append(classify_breast_ob(ob, organ))
    return classes

atyp_markers = ["flat epithelial atypia", 
                "atypical ductal hyperplasia", 
                "atypical lobular hyperplasia"]

fibro_markers = ["fibroadenoma", "phyllodes"]

ben_markers = ["papilloma", "usual ductal hyperplasia", 
                "apocrine metaplasia", "radial scar",
                "sclerosing adenosis", 
                "pseudoangiomatous stromal hyperplasia",
                "cyst", "mastitis"]

def classify_breast_ob(ob, organ):
    label = ["na", "na", "na"]

    # Lymphoma
    if re.search(" lymphoma", ob):
        label[0] = "lymphoma"
        return label
    
    

    # Breast Cancer and Metastases
    if re.search(" lcis", ob):
        label = ["breast cancer", "lobular", "in situ"]
        return label
    elif re.search(" dcis", ob):
        label = ["breast cancer", "ductal", "in situ"]
        return label
    elif re.search(" idc", ob):
        label = ["breast cancer", "ductal", "invasive"]
        return label
    elif re.search(" ilc", ob):
        label = ["breast cancer", "ductal", "invasive"]
        return label
        
    
    if re.search(" (adeno)?carcinoma| breast cancer", ob):
        label[0] = "breast cancer"
        if re.search(" metastati(c|s)", ob):
            #if organ == "breast":
            
            label[1] = "metastasis"
            #else:
            #    label[0] = "metastasis from non-bc"
        else:
            if re.search(re.compile("[ -]invasive|[ -]infiltrating"), ob):
                label[1] = "invasive"
            elif re.search(re.compile("in[ -]situ"), ob):
                label[1] = "in situ"

            if re.search(" duct(al)?", ob):
                label[2] = "ductal"
            elif re.search("lobular", ob):
                label[2] = "lobular"
        return label
    
    

    # Atypical
    for marker in atyp_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "atypical"
            label[1] = marker
            return label
    
    # Fibroepithelial
    for marker in fibro_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "fibroepithelial"
            label[1] = marker
            return label
            
    # Benign
    for marker in ben_markers:
        if re.search(" " + marker + " ", ob):
            label[0] = "benign"
            label[1] = marker
            return label
            
    return label
            
    #lymph_marker = re.compile(" lymphoma")
    #mets_marker = re.compile(" metastasis")
    #inv_breast_marker = [re.compile(marker) for marker in []]
    

In [164]:
split_reps[["Biopsy Source", "Path Report"]].apply(lambda x: print(type(x)))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


Biopsy Source    None
Path Report      None
dtype: object

In [165]:
labels = []
for i in range(split_reps.shape[0]):
    labels.append(classify_regex(split_reps.iloc[i]["Path Report"], split_reps.iloc[i]["Biopsy Source"]))
split_reps["All Labels"] = labels
split_reps

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,All Labels,Single Label
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast,"[[breast cancer, invasive, lobular], [breast c...","(breast cancer, invasive, lobular)"
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]","(na, na, na)"
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]","(na, na, na)"
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]","(na, na, na)"
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast,"[[benign, papilloma, na], [benign, radial scar...","(na, na, na)"
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast,"[[breast cancer, invasive, ductal], [breast ca...","(breast cancer, invasive, ductal)"
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast,"[[na, na, na]]","(na, na, na)"
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus,"[[na, na, na]]","(na, na, na)"
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin,"[[na, na, na]]","(na, na, na)"
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast,"[[na, na, na], [na, na, na]]","(na, na, na)"


In [166]:
split_reps.iloc[5]["Path Report"]

'1. Invasive ductal carcinoma; see comment. 2. Focal ductal carcinoma in situ, intermediate nuclear grade, solid pattern; see comment.'

In [167]:
split_reps.iloc[5]["All Labels"]

[['breast cancer', 'invasive', 'ductal'],
 ['breast cancer', 'in situ', 'ductal']]

In [168]:
def get_single_label(obs_labels):
    label = ["na", "na", "na"]
    first_level = [labels[0] for labels in obs_labels]
    second_level = [labels[1] for labels in obs_labels]
    
    try: 
        if "lymphoma" in first_level:
            label[0] = "lymphoma"
        #elif "metastasis from non-bc" in first_level:
        #    label[0] = "metastasis from non-bc"
        elif "breast cancer" in first_level:
            label[0] = "breast cancer"

            third_level = [labels[2] for labels in obs_labels]
            if "metastasis" in second_level:
                label[1] = "metastasis"
            else:
                if "invasive" in second_level:
                    label[1] = "invasive"
                    third_level = [third_level[ind] for ind in range(len(obs_labels)) 
                                   if second_level[ind] == "invasive"]
                elif "in situ" in second_level:
                    label[1] = "in situ"
                    third_level = [third_level[ind] for ind in range(len(obs_labels)) 
                                   if second_level[ind] == "in situ"]

                if "ductal" in third_level:
                    label[2] = "ductal"
                elif "lobular" in third_level:
                    label[2] = "lobular"
        elif "atypical" in first_level:
            label[0] = "atypical"
            for marker in atyp_markers:
                if marker in second_level:
                    label[1] = marker
        elif "fibroepithelial" in first_level:
            label[0] = "atypical"
            for marker in fibro_markers:
                if marker in second_level:
                    label[1] = marker
        elif "benign" in first_level:
            label[0] = "benign"
            for marker in ben_markers:
                if marker in second_level:
                    label[1] = marker
    except:
        print(obs_labels)
                
    return tuple(label)

In [169]:
split_reps["Single Label"] = split_reps["All Labels"].apply(get_single_label)
split_reps

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,All Labels,Single Label
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast,"[[breast cancer, invasive, lobular], [breast c...","(breast cancer, invasive, lobular)"
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]","(na, na, na)"
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]","(na, na, na)"
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,"[[na, na, na]]","(na, na, na)"
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast,"[[benign, papilloma, na], [benign, radial scar...","(benign, radial scar, na)"
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast,"[[breast cancer, invasive, ductal], [breast ca...","(breast cancer, invasive, ductal)"
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast,"[[na, na, na]]","(na, na, na)"
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus,"[[na, na, na]]","(na, na, na)"
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin,"[[na, na, na]]","(na, na, na)"
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast,"[[na, na, na], [na, na, na]]","(na, na, na)"


In [170]:
labeled_split_reps = split_reps.drop("Rad Label", axis=1).rename(index=str, 
    columns={"Patient": "Patient ID", 
             "Biopsy Description": "Path Report I",
             "Path Report": "Path Report II",
             "Laterality": "Laterality [Derived]",
             "Biopsy Source": "Organ [Derived]",
             "All Labels": "All Labels [Derived]",
             "Single Label": "Single Label [Derived]"})

In [171]:
labeled_split_reps[labeled_split_reps["Patient ID"] == 10]

Unnamed: 0,Patient ID,Path Report I,Path Report II,Laterality [Derived],Organ [Derived],All Labels [Derived],Single Label [Derived]
30,10,"Right breast, needle core biopsy",1. Stromal fibrosis. 2. No evidence of in situ...,right,breast,"[[na, na, na], [na, na, na]]","(na, na, na)"
31,10,"Left breast, needle core biopsy","1. Invasive ductal carcinoma, SBR grade 2. 2. ...",left,breast,"[[breast cancer, invasive, ductal], [na, na, na]]","(breast cancer, invasive, ductal)"


In [172]:
def get_binary_label(patient_data):
    laterality = []
    for i, specimen in patient_data.iterrows():
        if specimen["Single Label [Derived]"][0] == "breast cancer" or specimen["Single Label [Derived]"][0] == "lymphoma":
            laterality.append(specimen["Laterality [Derived]"])
    if "right" in laterality and "left" in laterality:
        return "Bilateral Positive"
    elif "right" in laterality:
        return "Right Positive"
    elif "left" in laterality:
        return "Left Positive"
    elif laterality:
        return "Positive NOS"
    else:
        return "Negative"

def flatten_list(nestedl):
    [item for sublist in nestedl for item in sublist]

patient_labels, binary_labels = [], []
for patID in range(numPatients):
    reps = labeled_split_reps[(labeled_split_reps["Patient ID"] == patID)]
    labels = reps["Single Label [Derived]"].tolist()
    single_label = get_single_label(labels)
    patient_labels.append(single_label)
    binary_labels.append(get_binary_label(reps))
    
    

labeled_data = data
labeled_data["Single Label"] = patient_labels
labeled_data["Binary Label [Derived]"] = binary_labels
labeled_data

Unnamed: 0,Path Report,Label,Single Label,Binary Label [Derived]
0,"A. Breast, left, simple mastectomy: 1. Invasiv...",Left Positive,"(breast cancer, invasive, lobular)",Left Positive
1,"A. Left breast, ""mass at 12 o'clock 3 cm from ...",Left Positive,"(breast cancer, invasive, ductal)",Left Positive
2,"A. Right axillary contents, excision: No carci...",Negative,"(na, na, na)",Negative
3,"Right breast, excision of mammographic lesion:...",Right Positive,"(breast cancer, na, na)",Right Positive
4,"A. Sentinel lymph node #1, left axilla, biopsy...",Left Positive,"(breast cancer, invasive, ductal)",Left Positive
5,"A. Left breast, biopsy: 1. Infiltrating ductal...",Left Positive,"(breast cancer, invasive, ductal)",Left Positive
6,"Left breast, 9:30, needle core biopsy: Invasiv...",Left Positive,"(breast cancer, invasive, ductal)",Left Positive
7,"A. Left breast, mastectomy: 1. Ductal carcinom...",Left Positive,"(breast cancer, in situ, ductal)",Left Positive
8,"A. Lymph node, right axillary, sentinel node #...",Right Positive,"(breast cancer, in situ, ductal)",Right Positive
9,"Breast, left, ""12 o'clock,"" biopsy: Pleomorphi...",Left Positive,"(breast cancer, in situ, lobular)",Left Positive


In [173]:
labeled_data[labeled_data["Label"] != labeled_data["Binary Label [Derived]"]]

Unnamed: 0,Path Report,Label,Single Label,Binary Label [Derived]
28,"A. Breast, needle-localization excision biopsy...",Right Positive,"(breast cancer, invasive, ductal)",Positive NOS
667,"Skin, right breast, excision: 1. 1 mm microsco...",Right Positive,"(na, na, na)",Negative
669,CONSULT SLIDE FROM WESTERN PATHOLOGY CONSULTAN...,Negative,"(breast cancer, invasive, na)",Left Positive
1105,"Right Breast, Fine Needle Aspiration: Atypical...",Right Positive,"(na, na, na)",Negative
1160,"BREAST, RIGHT, FINE NEEDLE ASPIRATION: Monoton...",Right Positive,"(na, na, na)",Negative
1178,"Right Breast, Fine Needle Aspiration: Atypical...",Right Positive,"(na, na, na)",Negative
1225,"Left breast, segmental resection: 1. Infiltrat...",Left Positive,"(benign, cyst, na)",Negative
1256,"A. Left breast, outer central, biopsy. 1. Lobu...",Left Positive,"(na, na, na)",Negative
1471,"Left breast, biopsy: Breast tissue with a minu...",Negative,"(breast cancer, na, lobular)",Left Positive
1630,"A. Right breast, radical mastectomy: 1. Previo...",Negative,"(breast cancer, metastasis, na)",Right Positive


In [177]:
ob = labeled_data.iloc[3243]["Path Report"]

classify_breast_ob(ob, "breast")

['fibroepithelial', 'phyllodes', 'na']

In [178]:
labeled_split_reps.to_csv("Labeled Path Reports (Specimen).csv")
labeled_data.to_csv("Labeled Path Reports (Entire Report).csv")

In [176]:
split_reps.groupby("Single Label").agg(['count'])

Unnamed: 0_level_0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,All Labels
Unnamed: 0_level_1,count,count,count,count,count,count,count
Single Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
"(atypical, atypical ductal hyperplasia, na)",76,76,76,76,76,76,76
"(atypical, atypical lobular hyperplasia, na)",90,90,90,90,90,90,90
"(atypical, fibroadenoma, na)",320,320,320,320,320,320,320
"(atypical, flat epithelial atypia, na)",35,35,35,35,35,35,35
"(atypical, phyllodes, na)",11,11,11,11,11,11,11
"(benign, apocrine metaplasia, na)",176,176,176,176,176,176,176
"(benign, cyst, na)",236,236,236,236,236,236,236
"(benign, mastitis, na)",5,5,5,5,5,5,5
"(benign, papilloma, na)",50,50,50,50,50,50,50
"(benign, pseudoangiomatous stromal hyperplasia, na)",35,35,35,35,35,35,35


In [106]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    display(x)
    pd.reset_option('display.max_rows')
split_reps[split_reps["Single Label"] == ("benign", "cyst", "na")]#["Path Report"][4023]

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,All Labels,Single Label


In [107]:
obs = re.split(re.compile(r"[0-9]\."), split_reps[split_reps["Single Label"] == ("na", "na", "na")]["Path Report"][21].lower())

KeyError: 21

In [None]:
obs = [(' ' + ob).replace('-', ' ').replace(',', ' ') for ob in obs] 
classify_breast_ob(obs[0], "breast")

In [None]:
data