In [None]:
## this notebook creates a fake dataframe containing randomly generated text (one sentence per record) concerning
## tumors and their respective sizes.  The sentences are parsed to extract measurement data, and the output
## dataframe contains columns intended to mimic data abstracted from the VA Corporate Data Warehouse
## Output dataframe is stored using %store and can be analyzed using the accompanying notebook, plot_nodule_size.ipynb
## J Smith
## Feb 2023

In [None]:
from random import random, randint
import datetime
import re
import pandas as pd


In [None]:
def mknods(exdt = None):
    """Build one random, radiology-style sentence describing 1-3 findings.

    Each finding (tumor / nodule / mass / lesion) gets 1-3 dimensions in
    mm or cm, written in one of several formats ("3cm x 4.5cm", "3 x 2 cm",
    "12 by 8 mm", ...), optionally numbered or lettered, optionally followed
    by a location ("in the upper left lobe", "in the ULL", ...).

    Parameters
    ----------
    exdt : datetime.date, optional
        Exam date quoted in the sentence when a date phrase is included.
        A random date between 2018 and 2022 is drawn when omitted.

    Returns
    -------
    str
        A single sentence with collapsed whitespace, a capitalised first
        character and exactly one terminating period.
    """
    nnods = randint(1, 3)

    if not exdt:
        exdt = datetime.date(randint(2018, 2022), randint(1, 12), randint(1, 28))
    nods = []
    isnumbered = (random() < 0.15)
    # the letter lookup below only covers a-d, hence the nnods guard
    islettered = (random() < 0.15) and (nnods < 5)
    if isnumbered:
        septype = randint(1, 3)
    elif islettered:
        septype = randint(1, 2)

    prevloc = None
    for nnum in range(1, nnods + 1):
        # number of dimensions (1 is most likely), unit and the "x"/"by" separator
        nd = [1, 1, 1, 1, 2, 2, 2, 3, 3][randint(0, 8)]
        u = ['cm', 'mm', 'mm'][randint(0, 2)]
        xb = ['x', 'x', 'x', 'x', 'x', 'by', 'by'][randint(0, 6)]
        # uall: repeat the unit after every dimension instead of only the last one
        uall = (random() < 0.3)
        sp0 = ['', ' ', ' '][randint(0, 2)] # between N and unit
        sp1 = ['', ' ', ' '][randint(0, 2)] # between unit and 'x'
        sp2 = ['', ' ', ' '][randint(0, 2)] # between 'x' and N
        if uall:
            sp1 = ' '
            if sp0 == ' ':
                sp2 = ' '
        else:
            sp0 = ''
            sp2 = sp1
        if xb == 'by':
            # "by" must not run into the number that follows it
            sp2 = ' '

        dims = []
        for i in range(nd):
            if u == 'cm':
                # 1.0 - 6.0 cm, occasionally truncated to a whole number
                d = randint(10, 60) / 10
                if random() < 0.3:
                    d = int(d)
            else:
                d = randint(8, 40)
            dims.append(str(d))

        if nd > 1:
            # 3cm x 4.5cm x 2cm  (cm x )... 3 x 2 cm ( x ) ... 3x2cm (x)
            if uall:
                sep = sp0 + u + sp1 + xb + sp2
            else:
                sep = sp1 + xb + sp2
            dimstr = sep.join(dims)
        else:
            dimstr = dims[0]

        # the final (or only) dimension always carries the unit
        dimstr = dimstr + sp0 + u

        tnm = ['tumor', 'nodule', 'mass', 'lesion'][randint(0, 3)]
        descrips = ['%s measuring ' % tnm, '%s of approx. ' % tnm, '%s approximately ' % tnm, ' %s' % tnm]
        descriptype = randint(0, len(descrips) - 1)
        descrip = descrips[descriptype]
        if descriptype in (0, 1, 2):
            # ...a [descriptor] tumor measuring 3cm x 1.5cm
            if nnods == 1:
                sing = ['a single ', 'a singular ', 'one '][randint(0, 2)]
            else:
                sing = 'a '
            descriptors = ['a ', 'a ', 'a ', 'an irregular ', 'a concerning ',
                           'a poorly defined ', 'a well-defined ', 'a fuzzy ', 'a spiculated ',
                           'a suspicious ', 'a likely benign ', 'a possibly benign ',
                           'a pre-existing ', 'a new ', '%s' % sing]
            descrip = descriptors[randint(0, len(descriptors) - 1)] + descrip
            dimstr = descrip + dimstr
        else:
            # ...a 3cm x 1.5cm tumor [+/- descriptor]
            posslike = ['possibly', 'likely', 'assumed to be'][randint(0, 2)]
            descriptors = ['', '', '', ' with defined borders', ' with irregular borders',
                           ' %s cancerous' % posslike, ' %s of no concern' % posslike,
                           ' %s benign' % posslike, ' not previously seen', ' not seen in prior scans']
            descrip = descrip + descriptors[randint(0, len(descriptors) - 1)]
            dimstr = 'a ' + dimstr + descrip

        # prefix: list marker ("1.", "2)", "(c)") or a plain ", " / " and " joiner
        s = ''
        if isnumbered:
            if septype == 1:
                s = ' %s. ' % str(nnum)
            elif septype == 2:
                s = ' %s) ' % str(nnum)
            else:
                s = ' (%s) ' % str(nnum)
        elif islettered:
            letter = 'abcd'[nnum - 1]
            if septype == 1:
                s = ' %s) ' % letter
            else:
                s = ' (%s) ' % letter
        elif nnum > 1:
            if nnum < nnods:
                s = ', '
            else:
                s = ' and '

        dimstr = s + dimstr

        # optional location, sometimes abbreviated (upper left lobe -> UL lobe -> ULL)
        qspec = (random() < 0.3)
        if qspec:
            qloc = '%s%s %s' % (['upper', 'lower'][randint(0, 1)], [' left', ' right'][randint(0, 1)],
                              ['lobe', 'lobe', 'lobe', 'quadrant', 'quadrant', 'quad.', 'corner'][randint(0, 6)])
            # compare the unabbreviated form against the last location that was generated
            sameloc = (qloc == prevloc)
            prevloc = qloc
            if random() < 0.6:
                qloc = qloc.replace('upper', 'U')
                qloc = qloc.replace('lower', 'L')
                qloc = qloc.replace(' left', 'L')
                qloc = qloc.replace(' right', 'R')
                if 'lobe' in qloc:
                    if random() < 0.3:
                        qloc = qloc.replace(' lobe', 'L')

            dimstr = dimstr + ' in the %s' % qloc
            if sameloc:
                if random() < 0.7:
                    dimstr = dimstr + ', %s the previous %s' % (['above', 'below', 'adjacent to',
                                                                'to the right of', 'to the left of',
                                                                'anterior to', 'posterior to',
                                                                'contiguous with'][randint(0, 7)],
                                                               ['mass', 'finding', 'nodule', 'lesion'][randint(0, 3)])

        nods.append(dimstr)

    imtypes = ['PET scan', 'PET', 'CT', 'CT', 'CT scan', 'CT exam', 'imaging', 'imaging exam', 'imaging study']
    imtype = imtypes[randint(0, len(imtypes) - 1)]
    if 'CT' in imtype or 'imaging' in imtype:
        if random() < 0.3:
            if random() < 0.5:
                imtype = 'contrast ' + imtype
            elif random() < 0.5:
                imtype = imtype + ' with contrast'
            else:
                # leading space was missing, which produced e.g. "CTw/o contrast"
                imtype = imtype + ' w/o contrast'
            if 'with' in imtype and random() < 0.3:
                imtype = imtype.replace('with', 'w.')
    fdt = ''
    fdt2 = ''
    if random() < 0.3:
        fdt = 'from %s' % str(exdt)
        fdt2 = ' performed on %s' % str(exdt)
        if random() < 0.5:
            fdt2 = ' performed %s' % str(exdt)
    imtype = imtype + fdt2 + [' revealed', ' reveals', ' showed', ' shows'][randint(0, 3)]
    intros = ['Findings %swere as follows: ' % (' ' + fdt + ' '), imtype]

    nods_sent = intros[randint(0, len(intros) - 1)] + ' ' + ''.join(nods) + '.'
    nods_sent = nods_sent[0].upper() + nods_sent[1:]
    nods_sent = re.sub(r'\s+', ' ', nods_sent)
    # "quad." + sentence period gives ".."; collapse to one period.  The
    # replacement must be a plain '.', because '\.' in a replacement string
    # is emitted literally as backslash + dot.
    nods_sent = re.sub(r'\.\.', '.', nods_sent)

    return nods_sent

In [None]:
# smoke test: generate one random sentence and eyeball it
t = mknods()
print(t)

In [None]:
def parseMeasure(searchpattern, insent, maxdims = 3):
    """Extract size measurements from a sentence and convert them to millimetres.

    Parameters
    ----------
    searchpattern : str or compiled pattern
        Regex handed to ``re.findall``; it must yield plain strings (i.e.
        contain at most one capturing group), each holding one measurement
        such as ``'3cm x 4.5cm'``, ``'3 x 2 cm'`` or ``'12 by 8 mm'``.
    insent : str
        Sentence to scan.
    maxdims : int, optional
        Number of dimension slots per measurement.  When a measurement has
        more dimensions than this, only the last ``maxdims`` are kept.

    Returns
    -------
    list of list
        One ``[item_number, dim1, ..., dim<maxdims>]`` list per match.
        ``item_number`` counts matches from 1; dimensions are floats in mm
        and unused trailing slots are ``None``.

    Raises
    ------
    ValueError
        If a dimension has no unit and none follows it within the same
        measurement, or if a dimension is not a valid number.
    """
    allmms = []
    for n, raw in enumerate(re.findall(searchpattern, insent)):
        # normalise "3 cm by 4.5cm" -> "3cmx4.5cm", then split into dimensions
        compact = re.sub(r'[^a-zA-Z0-9.]+', '', raw).replace('by', 'x')
        parts = compact.split('x')
        if len(parts) > maxdims:
            # keep the trailing maxdims entries (slice form is also correct for maxdims == 0)
            parts = parts[len(parts) - maxdims:]

        mmlist = [n + 1] + [None] * maxdims

        # Walk right-to-left: in "3 x 2 cm" only the last dimension carries a
        # unit, and the ones before it inherit that unit.  The unit is reset
        # for every measurement so it can never leak in from an earlier match.
        unit = None
        for i in range(len(parts) - 1, -1, -1):
            part = parts[i]
            for suffix in ('mm', 'cm'):
                if part.endswith(suffix):
                    unit = suffix
                    part = part[:-len(suffix)]
                    break
            if unit is None:
                raise ValueError('no mm/cm unit found for measurement %r' % raw)
            value = float(part)
            if unit == 'cm':
                value = value * 10
            mmlist[i + 1] = value
        allmms.append(mmlist)
    return allmms

In [None]:
# testing the sentence generating function (mknods) and the sentence parsing function (parseMeasure)...

# regex pattern for finding measurements: zero or more "<number>[unit] x|by" groups
# followed by a final "<number><unit>".  Written as a raw string so that \d, \. and \s
# reach the regex engine unchanged (in a normal string they are invalid escape sequences).
findmmcm = re.compile(r'((?:\d*\.?\d+\s*(?:mm|cm)*\s*(?:x|by)\s*)*\d*\.?\d+\s*(?:mm|cm))')

s = mknods()
print(s)

# first item in each sub-list is the item (tumor) number, the last 3 are measurements for that tumor
parseMeasure(findmmcm, s, maxdims = 3)

In [None]:

# Regex helpers.  Only findmmcm is used by the loop below; the other patterns are
# kept as they were (NOTE(review): no use of them is visible in this notebook).
# All patterns are raw strings so backslash escapes reach the regex engine unchanged.
findmass = re.compile(r'tumor|nodule|lesion|\bmass\b|spiculate|opacity')
nonlung = re.compile(r'liver|hepat|kidney|\brenal\b')
replxwalk = [(r'centimeters?', 'cm'), (r'millimeters?', 'mm'), (r'\r\n|\n|\r', ' '),
            (',', ''), (r'\s+', ' ')]
anymeasure = re.compile(r'\d+\s?(mm|cm)')

findmmcm = re.compile(r'((?:\d*\.?\d+\s*(?:mm|cm)*\s*(?:x|by)\s*)*\d*\.?\d+\s*(?:mm|cm))')

# number of patients to create (people can have multiple reports and multiple sentences of interest)
npts = 10000

cols = ['patientICN', 'examDateTime', 'radsrc', 'radNucMedReportSID', 'lungdxdt_minus_repdt',
       'age', 'sentID', 'item', 'dim1mm', 'dim2mm', 'dim3mm', 'sent']

# one row per (patient, report, sentence, measured item); column order follows `cols`
lcdata = []

# report SIDs handed out so far; a set keeps the uniqueness check O(1) per report
usedSIDs = set()

# set when no unused SID could be drawn, which ends generation early
stopoutput = False

for ptnum in range(1, npts + 1):
    if stopoutput:
        break
    # per-patient values: diagnosis date, number of reports, identifier
    lungdxdt = datetime.datetime(2018, 3, 1) + datetime.timedelta(days = randint(0, 1640))
    nreps = randint(1, 10)
    ptICN = str(1000000000 + ptnum)
    for rnum in range(1, nreps + 1):
        nsents = randint(1, 5)
        edt = datetime.datetime(2017, 9, 1) + datetime.timedelta(days = randint(0, 1640))
        radsrc = 'imp' if random() < 0.4 else 'rad'

        # draw a report SID that has not been used yet; give up after 3 attempts
        for attempt in range(3):
            rnmSID = 1000000000 + randint(1, 10000000)
            if rnmSID not in usedSIDs:
                usedSIDs.add(rnmSID)
                break
        else:
            # all 3 draws collided: stop generating (this break leaves the report loop)
            stopoutput = True
            print ('stopped at pt # %s' % ptnum)
            break

        # days from the exam to the lung diagnosis (negative when the exam is later)
        lmr = (lungdxdt - edt).days
        # NOTE(review): age is re-drawn for every report, so it can differ between
        # reports of the same patient -- confirm this is intended for the fake data
        age = randint(55, 80)
        for snum in range(1, nsents + 1):
            sent = mknods(edt.date())
            sdata = parseMeasure(findmmcm, sent, maxdims = 3)
            for nod in sdata:
                inum, d1, d2, d3 = nod
                lcrow = [ptICN, edt, radsrc, rnmSID, lmr, age, snum, inum, d1, d2, d3, sent]
                lcdata.append(lcrow)

print ('...done')
print ('...lcdata contains %s rows' % len(lcdata))

In [None]:
import pandas as pd

In [None]:
# build the output dataframe: each inner list of lcdata becomes one row,
# labelled with the column names in cols

lcdf = pd.DataFrame(data = lcdata, columns = cols)

In [None]:
# A cell only renders its last expression, so a bare lcdf.head() followed by
# another statement is never shown; display() renders it explicitly.
display(lcdf.head())
lcdf.describe()

In [None]:
%store lcdf

# ^^ %store persists the dataframe so it survives beyond this kernel; another notebook
#    (e.g. plot_nodule_size.ipynb) can load it with `%store -r lcdf`.
#    NOTE(review): presumably both notebooks must share the same IPython profile -- confirm.

In [None]:

### ALL GARBAGE BELOW THIS POINT


In [None]:
# scratch: adding a timedelta to a datetime, and what a datetime difference looks like
dt = datetime.datetime(2020, 5, 27)
dtplus3 = dt + datetime.timedelta(days = 3)
diff = (dtplus3 - dt)

# shifted date, type of the difference, its attributes, and the whole-day count
print(dtplus3, type(diff), dir(diff), diff.days, sep = '\n')

In [None]:
# scratch: random diagnosis date, 0-1640 days after 2018-03-01
lungdxdt = datetime.timedelta(days = randint(0, 1640)) + datetime.datetime(2018, 3, 1)

# date part only (drops the midnight time component)
lungdxdt.date()

In [None]:
# scratch: number of rows collected in lcdata by the generation cell
len(lcdata)

In [None]:
# scratch: first generated row (a list whose values follow the order of `cols`)
lcdata[0]

In [None]:
# scratch: first six rows, one per line, fields separated by tabs
for record in lcdata[:6]:
    print(*record, sep = '\t')

In [None]:

# scratch: a list mixing a string, an int and a datetime
c = ['Tara', 18898918, datetime.datetime(year = 2017, month = 5, day = 17)]

In [None]:
# scratch: string form of the second element of c
str(c[1])

In [None]:
c2 = ['ab', 'cde', 'fg', 6]

# two equivalent ways to join the elements of c2 into one hyphen-separated string;
# every element has to be converted to str first because c2 contains an int

## with an explicit generator expression
c2s = '-'.join(str(item) for item in c2)

## with map(), which applies str to each element
c2s2 = '-'.join(map(str, c2))

print(c2s, c2s2, sep = '\n')

In [None]:
# example function

def getmm(m):
    '''
    Return the number of millimetres described by a measurement string.

    m is a number followed by a unit of mm or cm, for example
    '15 mm' or '3.6 cm'.  Whitespace around the number or the unit is
    optional ('15mm' and ' 3.6  cm ' work too) and the unit is matched
    case-insensitively.

    Returns a float.  Raises ValueError when the unit is not mm or cm,
    or when the numeric part cannot be converted to a float.
    '''

    mm_per_unit = {'mm': 1.0, 'cm': 10.0}

    text = m.strip()
    # the unit is always the final two characters of the trimmed string
    unit = text[-2:].lower()
    if unit not in mm_per_unit:
        raise ValueError('expected a measurement in mm or cm, got %r' % m)

    # float() tolerates the whitespace left between the number and the unit
    n = float(text[:-2])

    return n * mm_per_unit[unit]


# the return value is an ordinary float, so it can be used in arithmetic
s = '14.7 cm'
print(getmm(s) / 2 + 37)

ms = ['112 cm', '22 mm', '13.7 cm']

# convert the whole list in one go...
print([getmm(entry) for entry in ms])

# ...or one measurement at a time
for entry in ms:
    print(getmm(entry))

# show the docstring
help(getmm)
        

In [None]:
# `median` is not defined anywhere in this notebook, so help(median) raised NameError.
# NOTE(review): assuming the standard-library statistics.median was meant -- confirm.
from statistics import median
help(median)