# Preliminary Analysis

In [1]:
import csv
import pandas
import spacy
import numpy
from collections import Counter
from py4j.java_gateway import JavaGateway

In [2]:
# Load the spaCy pipeline registered under the name 'en'; it is used later for
# POS tagging and sentence splitting.
nlp = spacy.load('en')
# py4j gateway created with default settings, and its entry point object.
# NOTE(review): presumably the Java side exposes a MetaMap wrapper; neither
# `gateway` nor `metamap` is used in the cells visible here - confirm they are needed.
gateway = JavaGateway()
metamap = gateway.entry_point

In [3]:
# Read the cleaned incident table. Missing cells are replaced by empty strings
# so that the text processing and the `!= ''` filters further down treat
# every column uniformly.
combined = pandas.read_csv(
    '05 Combined Cleaned.csv',
    sep=',',
    encoding='latin-1',
).fillna('')
combined.head(1)

Unnamed: 0,ID,Process Step,Problem Type,Contributing Factors,Overall Severity,Incident Description,Language,Translated,Neat Cleaned,Bony Cleaned
0,2511,Treatment delivery,"Wrong, missing, mislabeled, or damaged treatme...",Distraction or diversions involving staff,,Non prescribed bolus . Bolus was not prescribe...,En,Non prescribed bolus . Bolus was not prescribe...,non prescribed bolus . bolus was not prescribe...,non prescribed bolus bolus prescribe md pt rec...


### Process Step

In [4]:
def data_element_composition(dataframe, column):
    """Return the percentage share of each distinct value found in ``column``.

    Rows are grouped by ``column`` and, within each group, the non-null
    entries of the frame's first column are counted. Every count is divided
    by the total number of rows, turned into a percentage rounded to two
    decimals, and the result is sorted from the largest share to the smallest.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to analyse.
    column : str
        Name of the column whose values are tallied.

    Returns
    -------
    pandas.Series
        Percentages indexed by the distinct values of ``column``.
    """
    first_column = dataframe.columns[0]
    group_counts = dataframe.groupby(column)[first_column].count()
    percentages = group_counts / len(dataframe) * 100
    return percentages.round(2).sort_values(ascending=False)

In [5]:
# Share of incidents per process step; the blank label groups the rows whose
# step was missing in the CSV (filled with '' on load).
data_element_composition(dataframe=combined, column='Process Step')

Process Step
Treatment delivery                                       34.10
                                                         20.28
Contouring and planning                                  18.54
Imaging for treatment planning                           13.39
Patient medical consultation and physician assessment     6.33
Pre-treatment quality assurance                           4.22
Radiation treatment prescription scheduling               1.48
On-treatment quality assurance                            1.06
Interventional procedure for planning and/or delivery     0.33
Post-treatment completion                                 0.26
Name: ID, dtype: float64

In [6]:
# Keep only the incidents whose process step is filled in.
nm_ps = combined.loc[combined['Process Step'].ne('')]

In [7]:
# Distinct process steps that remain once the blank entries are removed.
set(nm_ps['Process Step'].unique())

{'Contouring and planning',
 'Imaging for treatment planning',
 'Interventional procedure for planning and/or delivery',
 'On-treatment quality assurance',
 'Patient medical consultation and physician assessment',
 'Post-treatment completion',
 'Pre-treatment quality assurance',
 'Radiation treatment prescription scheduling',
 'Treatment delivery'}

In [8]:
# "nm" stands for "no missing": incidents with a process step recorded.
nm_ps = combined.loc[combined['Process Step'].ne('')]
nm_ps.to_csv(
    '06 NM PS.csv',
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)

### Problem Type

In [9]:
# Share of incidents per problem type; the blank label groups the rows with
# no problem type recorded.
data_element_composition(dataframe=combined, column='Problem Type')

Problem Type
                                                                  39.05
Wrong patient position, setup point, or shift                     15.77
Radiation therapy scheduling error                                10.95
Wrong, missing, mislabeled, or damaged treatment accessories       9.76
Excess imaging dose                                                3.36
Systematic hardware/software (including dose-volume) error         3.10
Wrong target or OAR contours or wrong planning (Retired Value)     2.51
Wrong prescription dose-fractionation or calculation error         1.91
Failure to perform on-treatment imaging as per instructions        1.65
Inadequate coordination of combined modality care                  1.39
Wrong anatomical site (excluding laterality)                       1.39
Wrong patient                                                      1.32
Wrong side (laterality)                                            1.29
Fall or other patient injury or medical condition  

In [10]:
# Incidents with a problem type recorded ("nm" = no missing).
nm_pt = combined.loc[combined['Problem Type'].ne('')]
nm_pt.to_csv(
    '06 NM PT.csv',
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)

### Contributing Factors

In [11]:
# An incident can list several contributing factors separated by '|'.
# Flatten them into one list so the percentages below are shares of all
# factor mentions rather than shares of incidents.
cf_list = [
    factor
    for cfs in combined['Contributing Factors']
    for factor in cfs.split('|')
]
cf_df = pandas.DataFrame({'Contributing Factors': cf_list})
data_element_composition(cf_df, 'Contributing Factors')

Contributing Factors
Policies and/or procedures not followed                                                       19.69
Distraction or diversions involving staff                                                     14.47
Expectation bias involving staff                                                              13.17
Communication or documentation inadequate (patient specific)                                  12.42
Staff behaviour                                                                                9.63
Policies and/or procedures non-existent or inadequate                                          8.09
                                                                                               7.17
Equipment software or hardware design, including 'human factors' design, inadequate            3.89
Failure to identify potential risks                                                            3.83
Patient or family member medical condition, preference or behaviour            

In [12]:
# Incidents with at least one contributing factor recorded ("nm" = no missing).
nm_cf = combined.loc[combined['Contributing Factors'].ne('')]
nm_cf.to_csv(
    '06 NM CF.csv',
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)

### Overall Severity

In [13]:
# Share of incidents per severity level; the blank label groups the rows with
# no severity recorded, which is distinct from the literal level 'None'.
data_element_composition(dataframe=combined, column='Overall Severity')

Overall Severity
            46.90
None        43.54
Mild         8.67
Moderate     0.76
Severe       0.13
Name: ID, dtype: float64

In [14]:
# Incidents with an overall severity recorded ("nm" = no missing). Only empty
# strings are dropped; the literal severity level 'None' is kept.
# A dedicated name is used so that the Problem Type subset held in `nm_pt`
# is not silently overwritten by the severity subset.
nm_os = combined[combined['Overall Severity']!='']
nm_os.to_csv('06 NM OS.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)

### Incident Description

##### n-gram document frequency (1-, 2- and 3-grams)

In [15]:
# Document frequency of single words: set() keeps each word at most once per
# incident, so the counter holds the number of incidents containing the word.
one_grams = [
    word
    for entry in combined['Bony Cleaned']
    for word in set(entry.split())
]
one_gram_counter = Counter(one_grams)
', '.join(
    ' '.join((word, str(count)))
    for word, count in one_gram_counter.most_common()
)



In [16]:
# Document frequency of adjacent word pairs: each distinct pair is counted at
# most once per incident.
two_grams = []
for entry in combined['Bony Cleaned']:
    words = entry.split()
    two_grams.extend(set(zip(words[:-1], words[1:])))
two_gram_counter = Counter(two_grams)
', '.join(
    ' '.join((first, second, str(count)))
    for (first, second), count in two_gram_counter.most_common()
)



In [17]:
# Document frequency of adjacent word triples. The counter keys keep the
# nested shape ((first, second), third); each distinct triple is counted at
# most once per incident.
three_grams = []
for entry in combined['Bony Cleaned']:
    words = entry.split()
    leading_pairs = zip(words, words[1:])
    three_grams.extend(set(zip(leading_pairs, words[2:])))
three_gram_counter = Counter(three_grams)
', '.join(
    ' '.join((first, second, third, str(count)))
    for ((first, second), third), count in three_gram_counter.most_common()
)



Optimal thresholds for n-gram inclusion

|     | TF-IDF | Topic modeling |
|-----|--------|----------------|
| Max | 1000   | 600            |
| Min | 5      | 5              |

##### POS tagging

In [18]:
# Coarse POS tags (compared against spaCy's token.pos_) that are tallied and
# that make up the word count.
parts_of_speech = [
    'ADJ',   # adjective
    'ADV',   # adverb
    'NOUN',  # noun
    'NUM',   # numeral
    'VERB',  # verb
]

In [19]:
# Column layout of the POS table: the tracked POS tags followed by the
# word count (WC) and the sentence count (SC).
columns = ['ADJ', 'ADV', 'NOUN', 'NUM', 'VERB'] + ['WC', 'SC']

In [20]:
def parts_of_speech_string(string):
    """Count tracked parts of speech, words and sentences in one text.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    whose coarse tag (``token.pos_``) is not listed in ``columns`` are ignored.

    Parameters
    ----------
    string : str
        Text to analyse.

    Returns
    -------
    list of float
        One value per entry of the module-level ``columns`` list, in that
        order: the per-tag token counts, then ``WC`` (the sum of the counts
        of the tags in ``parts_of_speech``, so untracked tokens such as
        punctuation are not part of it) and ``SC`` (number of sentences
        yielded by ``doc.sents``).
    """
    doc = nlp(string)

    # Tally the tags in plain Python. The previous implementation incremented
    # `row.loc[0][tag]`, a chained assignment that writes to a temporary
    # Series and silently does nothing under pandas Copy-on-Write; it also
    # relied on a blanket `except` to skip untracked tags.
    tag_counts = Counter(token.pos_ for token in doc)

    # Counter returns 0 for tags that never occurred.
    values = {name: tag_counts[name] for name in columns}
    values['WC'] = sum(values[pos] for pos in parts_of_speech)
    values['SC'] = len(list(doc.sents))
    return [float(values[name]) for name in columns]

In [21]:
def parts_of_speech_column(column):
    """Build the POS/word/sentence count table for a column of texts.

    Parameters
    ----------
    column : pandas.Series
        Texts to analyse; they are processed in positional order.

    Returns
    -------
    pandas.DataFrame
        One row per text, indexed 0..n-1, with the module-level ``columns``
        as column labels and the values returned by
        ``parts_of_speech_string`` for that text.
    """
    # Collect all rows first and build the frame once; assigning
    # `counts.loc[i] = ...` inside the loop reallocates the frame for every row.
    rows = [parts_of_speech_string(string) for string in column]
    return pandas.DataFrame(rows, columns=columns)

In [22]:
# POS, word and sentence counts for every incident description; show the
# first five rows as a sanity check.
pos_counts = parts_of_speech_column(column=combined['Neat Cleaned'])
pos_counts.head()

Unnamed: 0,ADJ,ADV,NOUN,NUM,VERB,WC,SC
0,3.0,1.0,6.0,0.0,3.0,13.0,3.0
1,5.0,2.0,25.0,0.0,15.0,47.0,6.0
2,1.0,3.0,7.0,0.0,6.0,17.0,1.0
3,2.0,1.0,7.0,0.0,2.0,12.0,2.0
4,0.0,1.0,7.0,0.0,2.0,10.0,3.0


In [23]:
def summary(dataframe):
    """Return descriptive statistics of ``dataframe`` without the count row.

    The statistics come from ``DataFrame.describe()``, are rounded to two
    decimals, and the first row (``count``) is dropped.
    """
    statistics = dataframe.describe().round(2)
    return statistics.iloc[1:]

In [24]:
# Descriptive statistics of the POS counts, rounded to two decimals, without
# the `count` row. They are computed inline rather than through the
# summary() helper: this cell binds the name `summary` to the resulting
# DataFrame, which replaces the helper, so `summary = summary(pos_counts)`
# raises a TypeError the second time the cell is run.
summary = pos_counts.describe().round(2).iloc[1:]
summary

Unnamed: 0,ADJ,ADV,NOUN,NUM,VERB,WC,SC
mean,2.94,1.82,11.14,0.18,6.92,23.0,3.07
std,3.86,2.44,13.11,1.0,8.78,26.82,3.1
min,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,1.0,0.0,4.0,0.0,2.0,9.0,1.0
50%,2.0,1.0,7.0,0.0,4.0,15.0,2.0
75%,4.0,2.0,13.0,0.0,8.0,27.0,4.0
max,53.0,26.0,161.0,32.0,98.0,294.0,38.0


In [25]:
# Keep the index: it carries the statistic names (mean, std, min, ...).
# With index=False the exported rows cannot be told apart.
summary.to_csv('06 POS Summary.csv', encoding='utf-8', index=True, quoting=csv.QUOTE_NONNUMERIC)