# Data Vectorization

In [1]:
import csv
import pandas
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def return_tfidf_matrix(column, mx, mn, ng):
    tfidf = TfidfVectorizer(max_df=mx, min_df=mn, ngram_range=ng)
    X = tfidf.fit_transform(column)
    X = pandas.DataFrame.from_records(X.todense().tolist(), columns=tfidf.get_feature_names())
    return X

In [3]:
def return_one_hot(column, penalty):
    option_set = list(set(list(column)))
    return pandas.DataFrame([[1 if value == option else penalty for option in option_set] for value in column], columns = option_set)

### NM PS

In [4]:
dropna_ps = pandas.read_csv('06 NM PS.csv', delimiter=',', encoding='latin-1').fillna('')
dropna_ps[:1]

Unnamed: 0,ID,Process Step,Problem Type,Contributing Factors,Overall Severity,Incident Description,Language,Translated,Neat Cleaned,Bony Cleaned
0,2511,Treatment delivery,"Wrong, missing, mislabeled, or damaged treatme...",Distraction or diversions involving staff,,Non prescribed bolus . Bolus was not prescribe...,En,Non prescribed bolus . Bolus was not prescribe...,non prescribed bolus . bolus was not prescribe...,non prescribed bolus bolus prescribe md pt rec...


In [5]:
dropna_ps_tfidf = return_tfidf_matrix(dropna_ps['Bony Cleaned'], 1000, 5, (1,3))
dropna_ps_tfidf.to_csv('07 NM PS TFIDF.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)
dropna_ps_tfidf[:3]

Unnamed: 0,_____________________,_____________________ follow,able,acceptable,access,accessory,accessory require,accidentally,accord,account,...,wrong ct,wrong info,wrong info mri,wrong patient,wrong pt,wrong set,wrong tattoo,xrt,xrt date,xrt pt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Since process steps are mutually exclusive, we set penalty to -1.
dropna_ps_ohe = return_one_hot(dropna_ps['Process Step'], 0)
dropna_ps_ohe.to_csv('07 NM PS OHE.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)
dropna_ps_ohe[:3]

Unnamed: 0,Post-treatment completion,Treatment delivery,Interventional procedure for planning and/or delivery,Contouring and planning,Patient medical consultation and physician assessment,Imaging for treatment planning,Radiation treatment prescription scheduling,On-treatment quality assurance,Pre-treatment quality assurance
0,0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1


### NM PT

In [7]:
dropna_pt = pandas.read_csv('06 NM PT.csv', delimiter=',', encoding='latin-1').fillna('')
dropna_pt[:1]

Unnamed: 0,ID,Process Step,Problem Type,Contributing Factors,Overall Severity,Incident Description,Language,Translated,Neat Cleaned,Bony Cleaned
0,2511,Treatment delivery,"Wrong, missing, mislabeled, or damaged treatme...",Distraction or diversions involving staff,,Non prescribed bolus . Bolus was not prescribe...,En,Non prescribed bolus . Bolus was not prescribe...,non prescribed bolus . bolus was not prescribe...,non prescribed bolus bolus prescribe md pt rec...


In [8]:
dropna_pt_tfidf = return_tfidf_matrix(dropna_pt['Bony Cleaned'], 1000, 5, (1,3))
dropna_pt_tfidf.to_csv('07 NM PT TFIDF.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)
dropna_pt_tfidf[:3]

Unnamed: 0,____________________,_____________________,_____________________ follow,able,acceptable,access,accessory,accessory require,accidentally,accommodate,...,work,would,write,writer,wrong,wrong patient,wrong tattoo,xrt,xrt date,xrt pt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Since process steps are mutually exclusive, we set penalty to -1.
dropna_pt_ohe = return_one_hot(dropna_pt['Problem Type'], 0)
dropna_pt_ohe.to_csv('07 NM PT OHE.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)
dropna_pt_ohe[:3]

Unnamed: 0,Untimely access to medical care or radiotherapy,Treatment not delivered - personnel/hardware/software failure,Wrong prescription dose-fractionation or calculation error,Wrong patient,Allergic reaction,Wrong side (laterality),Inadequate coordination of combined modality care,Infection,Interventional procedure error (Retired value),Wrong plan dose (Retired value),...,Treatment plan acceptable but not physically deliverable,"Wrong patient position, setup point, or shift",Wrong target or OAR contours,Fall or other patient injury or medical condition,Treatment plan (isodose distribution) unacceptable,Radiation therapy scheduling error,"Wrong, missing, mislabeled, or damaged treatment accessories",Excess imaging dose,Wrong planning margins,Failure to perform on-treatment imaging as per instructions
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### NM CF

In [10]:
dropna_cf = pandas.read_csv('06 NM CF.csv', delimiter=',', encoding='latin-1').fillna('')
dropna_cf[:1]

Unnamed: 0,ID,Process Step,Problem Type,Contributing Factors,Overall Severity,Incident Description,Language,Translated,Neat Cleaned,Bony Cleaned
0,2511,Treatment delivery,"Wrong, missing, mislabeled, or damaged treatme...",Distraction or diversions involving staff,,Non prescribed bolus . Bolus was not prescribe...,En,Non prescribed bolus . Bolus was not prescribe...,non prescribed bolus . bolus was not prescribe...,non prescribed bolus bolus prescribe md pt rec...


In [11]:
dropna_cf_tfidf = return_tfidf_matrix(dropna_cf['Bony Cleaned'], 1000, 5, (1,3))
dropna_cf_tfidf.to_csv('07 NM CF TFIDF.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)
dropna_cf_tfidf[:3]

Unnamed: 0,____________________,_____________________,_____________________ follow,able,acceptable,access,accessory,accessory require,accidentally,accommodate,...,wrong ct,wrong info,wrong info mri,wrong patient,wrong pt,wrong set,wrong tattoo,xrt,xrt date,xrt pt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
column = dropna_cf['Contributing Factors']
option_set = set([cf for cfs in column for cf in cfs.split('|')])
dropna_cf_ohe = pandas.DataFrame([[1 if option in value else 0 for option in option_set] for value in column], columns = option_set)
dropna_cf_ohe.to_csv('07 NM CF OHE.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)
dropna_cf_ohe[:3]

Unnamed: 0,Unnamed: 1,Communication or documentation inadequate (patient specific),Failure to identify potential risks,Equipment quality assurance and/or maintenance inadequate,"Equipment software or hardware design, including 'human factors' design, inadequate",Staff behaviour,"Equipment software or hardware commissioning, calibration or acceptance testing inadequate",Handoffs inadequate,Distraction or diversions involving staff,Policies and/or procedures non-existent or inadequate,...,"Patient or family member medical condition, preference or behaviour",Staff education or training inadequate,Patient or family member medical condition preference or behaviour,Change management,Organizational and/or workspace resources inadequate (excluding human resources),Policies and/or procedures not followed,Patient education inadequate,Unfamiliar treatment approach or radiation treatment technique,Expectation bias involving staff,Human resources inadequate
0,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### NM OS

In [13]:
dropna_os = pandas.read_csv('06 NM OS.csv', delimiter=',', encoding='latin-1').fillna('')
dropna_os[:1]

Unnamed: 0,ID,Process Step,Problem Type,Contributing Factors,Overall Severity,Incident Description,Language,Translated,Neat Cleaned,Bony Cleaned
0,2511,Treatment delivery,"Wrong, missing, mislabeled, or damaged treatme...",Distraction or diversions involving staff,,Non prescribed bolus . Bolus was not prescribe...,En,Non prescribed bolus . Bolus was not prescribe...,non prescribed bolus . bolus was not prescribe...,non prescribed bolus bolus prescribe md pt rec...


In [14]:
dropna_os_tfidf = return_tfidf_matrix(dropna_cf['Bony Cleaned'], 1000, 5, (1,3))
dropna_os_tfidf.to_csv('07 NM OS TFIDF.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)
dropna_os_tfidf[:3]

Unnamed: 0,____________________,_____________________,_____________________ follow,able,acceptable,access,accessory,accessory require,accidentally,accommodate,...,wrong ct,wrong info,wrong info mri,wrong patient,wrong pt,wrong set,wrong tattoo,xrt,xrt date,xrt pt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
column = dropna_cf['Overall Severity']
option_set = set([cf for cfs in column for cf in cfs.split('|')])
dropna_cf_ohe = pandas.DataFrame([[1 if option in value else 0 for option in option_set] for value in column], columns = option_set)
dropna_cf_ohe.to_csv('07 NM OS OHE.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)
dropna_cf_ohe[:3]

Unnamed: 0,Unnamed: 1,None,Moderate,Severe,Mild
0,1,1,0,0,0
1,1,0,0,0,1
2,1,0,0,0,0
