In [1]:
import numpy
import pandas
import nltk
pandas.set_option('display.max_columns', 50)

In [2]:
# Read the incident records from the comma-separated file into a DataFrame.
fake = pandas.read_csv('fake_data.csv', sep=',')
# Preview the first five rows.
fake.head(5)

Unnamed: 0,incident_id,event_type,incident_description,descriptor,coordinator_comments,investigation_narrative,acute_medical_harm,dosimetric_impact,latent_medical_harm,functional_work_area,date_incident_detected,time_period_detected,date_incident_occurred,time_period_occurred,individual_detected,individual_involved,patient_month_birth,patient_year_birth,patient_gender,diagnosis,process_step_detected,process_step_occurred,problem_type,contributing_factors,number_patients_affected,radiation_treatment_techniques,total_dose_prescribed,number_fractions_prescribed,number_fractions_incorrect,hardware_manufacturer_model,software_manufacturer_model,body_region_treated,treatment_intent,ameliorating_actions,safety_barriers_failed,safety_barriers_prevented,actions_reduce_risk
0,123488,Actual incident,Patient was scheduled for initial treatment fr...,Plan was an hour and a half late,,Not enough time was provided for planning to c...,,,No,TrueBeam 3,2017-09-14,14:00-15:59,2017-09-14,12:00-15:59,Radiation Therapist,Treatment planner or dosimetrist,January,1950,Female,Breast cancer,Treatment delivery,Treatment planning,Other,Human resources inadequate & Not followed,1,External beam photon radiotherapy - 3D conformal,40.0,15.0,0.0,,,Thorax,Curative (Radical),Other,,,Increase staffing levels or decrease workload
1,123489,Actual incident,A patient was to be treated for cord compressi...,Lost mask led to postponing treatment. Patient...,,Patient's replan should have been made higher ...,Severe,,No,STx 1,2017-09-14,10:00-11:59,2017-09-01,12:00-15:59,Patient & Family member,Radiation therapist & Radiation oncologist,January,1950,Female,Central nervous system tumors,On-treatment quality management,Treatment delivery,Wrong treatment accessories,Human resources inadequate & Capital resources...,1,External beam photon radiotherapy - Modulated ...,60.0,30.0,0.0,,,Spine,Palliative,Medical management of patient injury & Radiati...,Verification of treatment accessories,,Process standardization & Increase staffing le...
2,123490,Actual incident,Imaging guidelines were not respected: (1) KV/...,Imaging guidelines not followed: excessive ima...,,IGRT guidelines were not followed. Technologis...,,,No,TrueBeam 4,2017-08-23,16:00-19:59,2017-08-23,12:00-15:59,Radiation Therapist,Radiation therapist,January,1950,Male,Gastrointestinal cancer,On-treatment quality management,Treatment delivery,Other,Not followed,1,External beam photon radiotherapy - Modulated ...,25.0,5.0,3.0,,,Pelvis,Curative (Radical),Education or training,Review of portal or CBCT images,,Improved compliance with existing policies or ...
3,123491,Near miss,CT sim setup sheet was missing information per...,Information missing from ctsim s/u sheet,,Presence of a mattress was not indicated on th...,,,,TrueBeam 3,2017-09-02,12:00-13:59,2017-08-30,08:00-11:59,Radiation Therapist,Radiation therapist,January,1950,Female,Breast cancer,On-treatment quality management,Imaging for radiotherapy planning,Wrong treatment accessories,"Documentation poor, incomplete, unclear or mis...",0 (Reportable circumstance / Near miss),External beam photon radiotherapy - 3D conformal,,,,,,Thorax,,Staff debriefing or counselling,Verification of treatment accessories,Image-based patient position verification,
4,123492,Actual incident,"During chart QA, it was noticed that a 0.3cm b...",Bolus forgotten 2/10 fx,,Importance of verifying all treatment accessor...,,Minor,No,TrueBeam 5,2017-08-16,16:00-19:59,2017-08-15,12:00-15:59,Radiation Therapist,Radiation therapist,January,1950,Female,Breast cancer,On-treatment quality management,Treatment delivery,Wrong treatment accessories,"Not followed & Documentation poor, incomplete,...",1,External beam photon radiotherapy - 3D conformal,50.0,25.0,2.0,,,Thorax,Curative (Radical),Education or training,Verification of treatment accessories,,Improved compliance with existing policies or ...


In [3]:
# Show the column labels of the loaded DataFrame.
fake.columns

Index(['incident_id', 'event_type', 'incident_description', 'descriptor',
       'coordinator_comments', 'investigation_narrative', 'acute_medical_harm',
       'dosimetric_impact', 'latent_medical_harm', 'functional_work_area',
       'date_incident_detected', 'time_period_detected',
       'date_incident_occurred', 'time_period_occurred', 'individual_detected',
       'individual_involved', 'patient_month_birth', 'patient_year_birth',
       'patient_gender', 'diagnosis', 'process_step_detected',
       'process_step_occurred', 'problem_type', 'contributing_factors',
       'number_patients_affected', 'radiation_treatment_techniques',
       'total_dose_prescribed', 'number_fractions_prescribed',
       'number_fractions_incorrect', 'hardware_manufacturer_model',
       'software_manufacturer_model', 'body_region_treated',
       'treatment_intent', 'ameliorating_actions', 'safety_barriers_failed',
       'safety_barriers_prevented', 'actions_reduce_risk'],
      dtype='object')

In [4]:
# Select the 'incident_description' column; the cells below analyse this Series.
description = fake['incident_description']
# Display the selected Series.
description

0    Patient was scheduled for initial treatment fr...
1    A patient was to be treated for cord compressi...
2    Imaging guidelines were not respected: (1) KV/...
3    CT sim setup sheet was missing information per...
4    During chart QA, it was noticed that a 0.3cm b...
5    During physics chart check it was noticed that...
6    Called patient to give appointment for January...
7    No stent was indicated on the MRI consent form...
8    Patient booked for iso/rx, but treatment cance...
9          Patient was treated with their dentures in.
Name: incident_description, dtype: object

In [5]:
def avg_wc(column):
    """Return the mean number of whitespace-separated words per entry.

    Args:
        column: iterable of strings (each entry must support ``.split()``).

    Returns:
        The result of ``numpy.mean`` over the per-entry word counts.
    """
    word_counts = []
    for event in column:
        # str.split() with no argument splits on any run of whitespace.
        word_counts.append(len(event.split()))
    return numpy.mean(word_counts)

# Mean number of whitespace-separated words per incident description.
avg_wc(description)

33.200000000000003

In [6]:
def tag_all(column):
    """Part-of-speech tag every word of every entry in ``column``.

    Each entry is split on whitespace (so punctuation stays attached to the
    neighbouring word), the words of all entries are concatenated into one
    flat list, and that list is passed to ``nltk.pos_tag`` in a single call.

    Args:
        column: iterable of strings.

    Returns:
        Whatever ``nltk.pos_tag`` returns for the flat word list.
    """
    tokens = []
    for sentence in column:
        tokens.extend(sentence.split())
    return nltk.pos_tag(tokens)

# Show the first ten (word, tag) pairs for the incident descriptions.
tag_all(description)[:10]

[('Patient', 'NNP'),
 ('was', 'VBD'),
 ('scheduled', 'VBN'),
 ('for', 'IN'),
 ('initial', 'JJ'),
 ('treatment', 'NN'),
 ('fraction', 'NN'),
 ('at', 'IN'),
 ('3:00', 'CD'),
 ('pm.', 'NNS')]

In [9]:
def get_tag(column, tag_list):
    """Return the words in ``column`` whose POS tag is one of ``tag_list``.

    Args:
        column: iterable of strings, forwarded to ``tag_all``.
        tag_list: container of tag strings to keep (tested with ``in``).

    Returns:
        A list of the matching words, in the order ``tag_all`` yields them.
    """
    matches = []
    for word, tag in tag_all(column):
        if tag in tag_list:
            matches.append(word)
    return matches

# First ten words of the descriptions that were tagged 'NN' or 'NNS'.
get_tag(description, ['NN', 'NNS'])[:10]

['treatment',
 'fraction',
 'pm.',
 'plan',
 'Patients',
 'patient',
 'cord',
 'compression',
 'mask.',
 'day']

In [8]:
def composition(column, tag_list):
    """Return the fraction of words in ``column`` tagged with any of ``tag_list``.

    The numerator is the number of words returned by ``get_tag``; the
    denominator is the total number of whitespace-separated words across all
    entries of ``column``.

    Args:
        column: iterable of strings.
        tag_list: container of tag strings to count.

    Returns:
        matched word count divided by total word count.
    """
    matched = len(get_tag(column, tag_list))
    total = sum(len(sentence.split()) for sentence in column)
    return matched / total

# Fraction of all words in the descriptions that were tagged 'NN' or 'NNS'.
composition(description, ['NN', 'NNS'])

0.2319277108433735

In [14]:
# Words of the descriptions tagged 'VBS' or 'VB'.
# NOTE(review): 'VBS' does not appear to be a Penn Treebank tag (the verb tags
# are VB, VBD, VBG, VBN, VBP, VBZ), so it likely never matches -- was 'VBZ'
# intended? Confirm before relying on this result.
get_tag(description, ['VBS', 'VB'])

['delay.', 'be', 'be', 'be', 'check', 'scan', 'mid', 'give', 'make', 'have']

In [16]:
# Fraction of all words in the descriptions tagged 'VBS' or 'VB'.
# NOTE(review): 'VBS' does not appear to be a Penn Treebank tag (the verb tags
# are VB, VBD, VBG, VBN, VBP, VBZ), so it likely never matches -- was 'VBZ'
# intended? Confirm before relying on this ratio.
composition(description, ['VBS', 'VB'])

0.030120481927710843

In [45]:
def txt(cl, nm):
    """Write every entry of ``cl`` to the text file ``nm``.

    Each entry is written followed by a single space, so the file ends with a
    trailing space. An existing file at ``nm`` is overwritten.

    Args:
        cl: iterable of strings to write (e.g. a DataFrame text column).
        nm: path of the output file.
    """
    # Iterate over the ``cl`` argument rather than a notebook-level global, so
    # the function writes whatever column the caller actually passes in.
    # The ``with`` block closes the file even if a write raises.
    with open(nm, 'w') as f:
        for row in cl:
            f.write(row + ' ')
# Write the incident descriptions to 'f.txt', each followed by a space.
txt(fake['incident_description'], 'f.txt')

In [48]:
def nltk_ob(nm):
    """Build an ``nltk.text.Text`` from the words of corpus file ``nm``.

    The words are read through ``nltk.corpus.gutenberg.words``.

    NOTE(review): the Gutenberg corpus reader presumably resolves ``nm``
    against its own corpus directory, not the current working directory --
    confirm that the file is available there.

    Args:
        nm: file id passed to the Gutenberg corpus reader.

    Returns:
        An ``nltk.text.Text`` wrapping the words of that file.
    """
    words = nltk.corpus.gutenberg.words(nm)
    return nltk.text.Text(words)

# Load 'f.txt' as an nltk Text object.
des = nltk_ob('f.txt')
# Display its repr.
des

<Text: Patient was scheduled for initial treatment fraction at...>

In [58]:
def freq(ob):
    """Return (item, count) pairs for ``ob`` as given by ``FreqDist.most_common()``.

    Args:
        ob: iterable of hashable items (e.g. an nltk Text).

    Returns:
        The list produced by ``nltk.FreqDist(ob).most_common()``.
    """
    distribution = nltk.FreqDist(ob)
    return distribution.most_common()

# First ten (token, count) pairs from the frequency listing of the text.
freq(des)[:10]

[('.', 25),
 ('was', 21),
 ('the', 17),
 ('to', 12),
 (',', 11),
 ('a', 9),
 ('for', 8),
 ('patient', 7),
 ('and', 5),
 ('indicated', 5)]

In [69]:
def bigram(ob):
    """Return the bigrams of ``ob`` as a list.

    Args:
        ob: sequence of tokens (e.g. an nltk Text).

    Returns:
        A list holding everything yielded by ``nltk.bigrams(ob)``.
    """
    pairs = nltk.bigrams(ob)
    return list(pairs)

# First ten bigrams of the text.
bigram(des)[:10]

[('Patient', 'was'),
 ('was', 'scheduled'),
 ('scheduled', 'for'),
 ('for', 'initial'),
 ('initial', 'treatment'),
 ('treatment', 'fraction'),
 ('fraction', 'at'),
 ('at', '3'),
 ('3', ':'),
 (':', '00')]