In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import re
import difflib as dl
import nltk
from sklearn.model_selection import train_test_split

In [2]:
path_reports = pd.read_csv("Path Reports.csv")
split_reps = pd.read_csv("Chunked Path Reports.csv").drop("Unnamed: 0", axis=1)

In [3]:
# natural language processing tools

# Negation Processing
def negate_sequence(tokens):
    """
    Detects negations and transforms negated words into "not_" form.
    """
    negation = False
    delims = "?.,!:;"
    result = []
    
    for token in tokens:
        if any(c == token for c in delims):
            negation = False
        
        negated = "not_" + token if negation else token
        result.append(negated)

        if any(neg == token.lower() for neg in ["not", "n't", "no"]):
            negation = not negation   

    return result

## Classify Each Biopsy as Invasive or Not

In [4]:
split_reps

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast


In [5]:
def classify_invasive(pathRep):
    pathRep = ' '.join(negate_sequence(nltk.word_tokenize(pathRep.lower())))
    obs = re.split(re.compile(r"[0-9]\."), pathRep)
    if len(obs) > 1:
        obs = obs[1:]
    
    obs = [' ' + ob for ob in obs] # pad with spaces to match markers that come at beginning of strings
    
    classes = []
    for ob in obs:
        inv_marker = re.compile(" invasive| infiltrating")
        ben_marker = re.compile(" in situ| benign| fibroadenoma")
        if re.search(inv_marker, ob):
            classes.append("invasive")
        elif re.search(ben_marker, ob):
            classes.append("benign")
        else:
            classes.append("normal")
    return classes

In [6]:
re.split(re.compile(r"[0-9]\."), "3. hello")

['', ' hello']

In [7]:
split_reps["Invasive/Benign"] = split_reps["Path Report"].apply(classify_invasive)
split_reps

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,Invasive/Benign
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast,"[invasive, benign, benign, normal, normal]"
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,[normal]
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,[normal]
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,[normal]
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast,"[normal, normal, normal]"
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast,"[invasive, benign]"
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast,[benign]
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus,[normal]
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin,[normal]
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast,"[normal, normal]"


In [8]:
split_reps.iloc[0]["Path Report"]

'1. Invasive lobular carcinoma in a background of lobular carcinoma in situ, negative margins; see comment. 2. Ductal carcinoma in situ, grade 2, negative margins; see comment. 3. Hyalinized fibroadenoma, radial scar and fibrocystic changes. 4. Biopsy site changes. 5. Nipple with no significant pathologic abnormality.'

In [9]:
num_inv = sum(["invasive" in split_rep for split_rep in split_reps["Invasive/Benign"]])
num_ben = sum(["benign" in split_rep for split_rep in split_reps["Invasive/Benign"]])
num_norm = sum(["normal" in split_rep for split_rep in split_reps["Invasive/Benign"]])
num_inv, num_ben, num_norm

(692, 2961, 5253)

In [10]:
def classify_distribution(pathRep):
    pathRep = ' '.join(negate_sequence(nltk.word_tokenize(pathRep.lower())))
    obs = re.split(re.compile(r"[0-9]\."), pathRep)
    if len(obs) > 1:
        obs = obs[1:]
    obs = [' ' + ob for ob in obs] # pad with spaces to match markers that come at beginning of strings
    
    classes = []
    
    lob_marker = re.compile(" lobul[a-zA-Z]*")
    duct_marker = re.compile(" duct[a-zA-Z]*")
    for ob in obs:
        
        if re.search(lob_marker, ob) and re.search(duct_marker, ob):
            classes.append("lobular/ductal")
        elif re.search(lob_marker, ob):
            classes.append("lobular")
        elif re.search(duct_marker, ob):
            classes.append("ductal")
        else:
            classes.append("none")
    return classes

In [11]:
split_reps["Distribution"] = split_reps["Path Report"].apply(classify_distribution)
split_reps

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,Invasive/Benign,Distribution
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast,"[invasive, benign, benign, normal, normal]","[lobular, ductal, none, none, none]"
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,[normal],[none]
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,[normal],[none]
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,[normal],[none]
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast,"[normal, normal, normal]","[none, none, none]"
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast,"[invasive, benign]","[ductal, ductal]"
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast,[benign],[none]
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus,[normal],[none]
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin,[normal],[none]
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast,"[normal, normal]","[ductal, none]"


In [12]:
duct_marker = re.compile(" duct[a-zA-Z]*")
re.search(duct_marker, 'ductal carcinoma in situ, nuclear grade 2-3; see comment.')#, split_reps.iloc[7017]["Invasive/Benign"]

In [13]:
def summarize_cancer_label(type_lst, dist_lst):
    if "invasive" in type_lst:
        inds = [i for i in range(len(type_lst)) if type_lst[i] == "invasive"]
        dists = [dist_lst[i] for i in inds if dist_lst[i] != "none"]
        labels = ["invasive " + dist + " carcinoma" for dist in dists]
        return set(labels)
    elif "benign" in type_lst:
        inds = [i for i in range(len(type_lst)) if type_lst[i] == "benign"]
        dists = [dist_lst[i] for i in inds if dist_lst[i] != "none"]
        labels = ["benign " + dist + " carcinoma" for dist in dists]
        return set(labels)
    else:
        return set(["normal"])

In [14]:
labels = []
for i in range(split_reps.shape[0]):
    type_lst = split_reps.iloc[i]["Invasive/Benign"]
    dist_lst = split_reps.iloc[i]["Distribution"]
    labels.append(summarize_cancer_label(type_lst, dist_lst))
split_reps["Summarized Labels"] = labels
split_reps

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source,Invasive/Benign,Distribution,Summarized Labels
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast,"[invasive, benign, benign, normal, normal]","[lobular, ductal, none, none, none]",{invasive lobular carcinoma}
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,[normal],[none],{normal}
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,[normal],[none],{normal}
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node,[normal],[none],{normal}
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast,"[normal, normal, normal]","[none, none, none]",{normal}
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast,"[invasive, benign]","[ductal, ductal]",{invasive ductal carcinoma}
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast,[benign],[none],{}
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus,[normal],[none],{normal}
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin,[normal],[none],{normal}
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast,"[normal, normal]","[ductal, none]",{normal}


In [17]:
ts = []
for rep in split_reps["Path Report"]:
    ts.append(re.search("metast", rep))
split_reps[ts]

KeyError: '[None None None ..., None None None] not in index'