In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from matplotlib import style
import re
import difflib as dl
import nltk
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("Mammogram Path Reports.csv")
data.columns = ["Path Report", "Label"]
numPatients = data.shape[0]

path1 = data["Path Report"][0]

In [3]:
letters = "A[.:] | B[.:] | C[.:] | D[.:] | E[.:] | F[.:] | G[.:] | H[.:] | I[.:] | J[.:] | K[.:] "

In [4]:
splitReps = []
for pathRep in data["Path Report"]:
    splitRep = re.split(letters, pathRep)
    if len(splitRep) > 1:
        splitRep = splitRep[1:]
    splitReps.append(splitRep)

In [6]:
data["Split Path Report"] = splitReps
data

Unnamed: 0,Path Report,Label,Split Path Report
0,"A. Breast, left, simple mastectomy: 1. Invasiv...",Left Positive,"[Breast, left, simple mastectomy: 1. Invasive ..."
1,"A. Left breast, ""mass at 12 o'clock 3 cm from ...",Left Positive,"[Left breast, ""mass at 12 o'clock 3 cm from ni..."
2,"A. Right axillary contents, excision: No carci...",Negative,"[Right axillary contents, excision: No carcino..."
3,"Right breast, excision of mammographic lesion:...",Right Positive,"[Right breast, excision of mammographic lesion..."
4,"A. Sentinel lymph node #1, left axilla, biopsy...",Left Positive,"[Sentinel lymph node #1, left axilla, biopsy: ..."
5,"A. Left breast, biopsy: 1. Infiltrating ductal...",Left Positive,"[Left breast, biopsy: 1. Infiltrating ductal c..."
6,"Left breast, 9:30, needle core biopsy: Invasiv...",Left Positive,"[Left breast, 9:30, needle core biopsy: Invasi..."
7,"A. Left breast, mastectomy: 1. Ductal carcinom...",Left Positive,"[Left breast, mastectomy: 1. Ductal carcinoma ..."
8,"A. Lymph node, right axillary, sentinel node #...",Right Positive,"[Lymph node, right axillary, sentinel node #1,..."
9,"Breast, left, ""12 o'clock,"" biopsy: Pleomorphi...",Left Positive,"[Breast, left, ""12 o'clock,"" biopsy: Pleomorph..."


In [8]:
biopData = pd.DataFrame(columns = ["Patient", "Biopsy Description", "Path Report", "Rad Label", "Laterality"])

In [9]:
patIds = range(data.shape[0])
bioType, pathRep, patients, labels = [], [], [], []

for patId in patIds:
    patient = data.iloc[patId]
    for rep in patient[2]:
        #split report into biopsy description and path report
        splitRep = re.split(re.compile(r"\: |\. "), rep)
        
        if len(splitRep) > 1:
            bioType.append(splitRep[0])
            pathRep.append('. '.join(splitRep[1:]))
            patients.append(patId)
            labels.append(patient["Label"])
            
biopData["Patient"] = patients
biopData["Biopsy Description"] = bioType
biopData["Path Report"] = pathRep
biopData["Rad Label"] = labels

In [10]:
# Extracting laterality, biopsy source, and labels
lats, organs = [], []
for biop in biopData["Biopsy Description"]:
    biop = nltk.word_tokenize(biop.lower())

    if len(dl.get_close_matches("left", biop)) > 0:
        lats.append("left")
    elif len(dl.get_close_matches("right", biop)) > 0:
        lats.append("right")
    else:
        lats.append("na")

    if len(dl.get_close_matches("breast", biop)) > 0:
        organs.append("breast")
    elif len(dl.get_close_matches("nipple", biop)) > 0:
        organs.append("breast")
    elif len(dl.get_close_matches("lymph", biop)) > 0:
        organs.append("lymph node")
    elif len(dl.get_close_matches("skin", biop)) > 0:
        organs.append("skin")
    elif len([word for wordList in [dl.get_close_matches(br, biop) 
            for br in ["axilla", "uterus", "fallopian", "ovary", "adnexa"]] 
              for word in wordList]) > 0:
        organs.append("uterus")
    else:
        organs.append("na")
biopData["Laterality"] = lats
biopData["Biopsy Source"] = organs

numSamples = biopData.shape[0]

biopData

Unnamed: 0,Patient,Biopsy Description,Path Report,Rad Label,Laterality,Biopsy Source
0,0,"Breast, left, simple mastectomy",1. Invasive lobular carcinoma in a background ...,Left Positive,left,breast
1,0,"Lymph node, sentinel node #1, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
2,0,"Lymph node, sentinel node #2, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
3,0,"Lymph node, nonsentinel node #3, biopsy",No tumor in one lymph node (0/1).,Left Positive,na,lymph node
4,0,"Breast, right, simple mastectomy",1. Sclerosing papilloma; see comment. 2. Radia...,Left Positive,right,breast
5,1,"Left breast, ""mass at 12 o'clock 3 cm from nip...",1. Invasive ductal carcinoma; see comment. 2. ...,Left Positive,left,breast
6,1,"Right breast, 10 o'clock 4 cm from nipple, nee...",Dense sclerotic fibrous tissue with scant beni...,Left Positive,right,breast
7,2,"Right axillary contents, excision",No carcinoma in 10 lymph nodes (0/10).,Negative,right,uterus
8,2,"Skin, right axilla, excision",Intradermal nevus.,Negative,right,skin
9,2,"Right nipple, excision",1. Skin of nipple with underlying lactiferous ...,Negative,right,breast


In [13]:
data.to_csv("Path Reports Complete.csv")
biopData.to_csv("Path Reports (By Specimens).csv")