# Analysing the training data:

The training data was taken from the CT-RATE reports.
We can create synthetic data by finding which items we can change.

We want a set of items that is just error-free. This must be analysed to check whether they are error-free or not, or maybe modified such that they are error-free. 

We also want 4 datasets that contain errors. These errors are internal inconsistencies, extraneous statements, transcription errors and omissions.

In [70]:
import pandas as pd
from pprint import pprint
import numpy as np
import nltk

df = pd.read_csv("datasets/training_data1.csv")

itemsToChange = df["Correct Items"]

print(f"Current number of data to change: {len(itemsToChange)}")

RANDOM_SEED = 42

Current number of data to change: 23695


In [63]:
# Define what words can be used to find errors:

internalInconsistency = [
    ["anterior", "posterior"],
    ["medial", "lateral"],
    ["superior", "inferior"],
    [
        "anterolateral",
        "posterolateral",
        "supralateral",
        "infralateral",
        "anterosuperior",
        "posterosuperior",
        "anteroposterior",
    ],
    [
        "anteromedial",
        "posteromedial",
        "supramedial",
        "inframedial",
        "anteroinferior",
        "posteroinferior",
        "posteroanterior",
    ],
    [
        "anterior-lateral",
        "posterior-lateral",
        "superior-lateral",
        "inferior-lateral",
        "anterior-superior",
        "posterior-superior",
    ],
    [
        "anterior-medial",
        "posterior-medial",
        "superior-medial",
        "inferior-medial",
        "anterior-inferior",
        "posterior-inferior",
    ],
    ["anterior-posterior", "medial-lateral", "superior-inferior"],
    ["dorsal-ventral", "transverse", "craniocaudal"],
    ["cranial", "caudal"],
    ["hepatopedal", "hepatofugal"],
    ["dorsal", "ventral"],
    ["proximal", "distal"],
    ["long axis", "short axis"],
    ["peripheral", "central"],
    ["superficial", "deep"],
    ["metaphysis", "diaphysis", "epiphysis"],
    ["ascending", "descending"],
    ["increase", "decrease"],
    ["increased", "decreased"],
    ["basal", "apical"],
    ["hyperdense", "hypodense"],
    ["solid", "cystic"],
    ["dependent", "non-dependent"],
    ["upper", "lower"],
]

transcription = [
    ["abscess", "access", "assess"],
    ["achalasia", "atelectasis", "epistaxis"],
    ["adrenal", "renal"],
    ["alveolar", "valcular", "lobular", "tubular"],
    ["aneurysm", "anaplasia", "anemia"],
    ["anterolisthesis", "retrolisthesis", "spondylolisthesis"],
    ["ascites", "cystitis", "bursitis", "colitic"],
    ["aspiration", "eventration"],
    ["atheroma", "myxoma", "osteoma", "lipoma"],
    ["borderline", "baseline"],
    ["bronchiectasis", "bronchitis", "bronchiolitis", "bronchi"],
    ["bronchogenic", "bronchiolitic", "bronchoscopic"],
    ["bullous", "mucous"],
    ["calcified", "ossified", "classified"],
    ["carcinomatosis,sarcomatosis", "carcinosis", "sarcoidosis"],
    ["cm", "mm", "m"],
    ["consolidation", "accumulation", "congestion", "compaction", "obstruction"],
    ["consolidative", "accumulative", "congestive", "obstructive"],
    ["coronary", "coronal", "coronoid", "coracoid", "corneal"],
    ["corpuscles", "corpus", "corvus", "corpse"],
    ["cortical", "corticoid", "corticate"],
    ["craniocaudal", "craniocervical", "craniobasal"],
    ["cyst", "gist", "list", "fist"],
    ["cystic", "systolic", "caustic", "cyclic", "plastic"],
    ["degenerative", "regenerative", "destructive"],
    ["diaphragm", "diagram", "diaphysis"],
    ["edematous", "erythematous", "emphysematous"],
    ["effusion", "confusion", "diffusion", "perfusion", "occlusion"],
    ["empyema", "emphysema", "haematoma", "endothelium"],
    ["endobronchial", "endotracheal"],
    ["esophagogastric", "esophagocolic"],
    ["esophagus", "esophagitis"],
    [
        "fibrosis",
        "stenosis",
        "sclerosis",
        "synostosis",
        "cyanosis",
        "thrombosis",
        "necrosis",
        "nephrosis",
        "silicosis",
        "cirrhosis",
        "asbestosis",
        "aspergillosis",
        "kyphosis",
        "lordosis",
        "mycosis",
    ],
    [
        "fibrotic",
        "stenotic",
        "sclerotic",
        "cyanotic",
        "thrombotic",
        "necrotic",
        "nephrotic",
        "cirrhotic",
    ],
    ["fissure", "fixture", "fisher", "fistula", "fossa"],
    ["fluid", "flutter", "fluctuant", "florid"],
    ["fracture", "friction", "contracture", "rapture"],
    ["fusiform", "reniform"],
    ["gastroesophageal", "gastroduodenal", "gastrojejunal", "gastroepiploic"],
    ["ground glass/ground-glass", "ground grass", "brown glass", "brown brass"],
    ["hemorrhagic", "hemostatic", "hemolytic"],
    ["hernia", "fistula", "myalgia"],
    ["herniation", "fistulation"],
    ["hilar", "hyoid", "hilum"],
    [
        "hypertension",
        "hypotension",
        "hyperextension",
        "hyperattenuation",
        "hypoattenuation",
    ],
    ["indeterminant", "intermittent"],
    ["inflammatory", "informatory", "inspiratory"],
    ["intrapulmonary", "intraperitoneal", "intramedullary", "intravascular"],
    ["lobular", "lobar"],
    ["lymphangitis", "pancreatitis", "adenitis"],
    ["lymphatic", "hepatic"],
    ["marrow", "narrow", "macro", "micro"],
    ["medullary", "modular"],
    ["metastasis", "metaphysis", "metanalysis", "metastases"],
    ["metastatic", "metaplastic", "myoclonic", "metabolic", "hyperplastic"],
    ["millimetric", "metric"],
    ["myocardial", "myocardium", "endocardial", "endocardium", "pericardium"],
    ["nodule", "module", "tuber"],
    ["non-specific", "non-systemic", "non-selective"],
    ["occlusive", "conclusive", "inclusive"],
    ["osteopenia", "sarcopenia"],
    ["osteopenic", "osteoporotic", "osteolytic", "osteopathic"],
    ["paratracheal", "paraoesophageal", "parabronchial", "pericardial"],
    ["parenchyma", "pneumonia"],
    ["pathological", "physiological", "psychological"],
    [
        "pericarditis",
        "endocarditis",
        "pleuritis",
        "perichondritis",
        "peritonitis",
        "pneumonitis",
    ],
    ["perivascular", "perihilar", "peribronchial"],
    ["plaque", "black", "plug"],
    ["pneumothorax", "hemothorax"],
    ["portal", "total", "pedal"],
    ["previous", "pervious", "pylorus"],
    ["pulmonary", "voluntary"],
    ["reticular", "auricular", "trabecular", "vesicular", "articular"],
    [
        "reticulation",
        "recirculation",
        "recalculation",
        "regulation",
        "strangulation",
        "ventilation",
        "speculation",
        "stipulation",
    ],
    ["retropulsion", "retroversion", "retroflexion", "reflexion", "expulsion"],
    ["sequela", "sclera", "stella"],
    ["sequelae", "sequestrae"],
    ["significant", "malignant", "magnificant", "consistent"],
    ["subphrenic", "subpleural", "subhepatic", "subdural"],
    ["suspicious", "surreptitious"],
    ["traction", "fraction", "action", "contraction", "reaction"],
    ["vascular", "valvular", "muscular", "vestibular", "molecular"],
    ["vocal", "focal", "vagal", "local"],
    ["lymphadenopathy", "adenopathy", "radiculopathy"],
    ["pleurodesis", "pleurocentesis"],
]

omission = [
    "no",
    "cannot",
    "clear",
    "clearly",
    "exclude",
    "excluded",
    "increase",
    "decrease",
    "significant",
    "more",
    "greater",
    "less",
]

extraneous = [
    "the total",
    "quina",
    "management",
    "office",
    "staircase",
    "hesitation",
    "umbrella",
    "keyboard",
    "carriage",
]

# Convert side confusion and near_homonym into dictionaries.
sideConfusionDict = {}

for mistakeWords in internalInconsistency:
    for word in mistakeWords:
        # Create a set from the mistake words
        mistakeSet = set(mistakeWords)
        wordSet = {word}
        # print(f"Current word = {wordSet} - {mistakeSet - wordSet}")
        sideConfusionDict[word] = mistakeSet - wordSet

nearHomonymDict = {}

for homonyms in transcription:
    #   print(homonyms)
    for currentHomonym in homonyms:
        # print(currentHomonym)
        closeHomonymSet = set(homonyms)
        # print(f"Current word = {currentHomonym} - {closeHomonymSet - {currentHomonym}}")
        nearHomonymDict[currentHomonym] = closeHomonymSet - {currentHomonym}

# pprint(sideConfusionDict)
# pprint(nearHomonymDict)

In [68]:
# Look through the data and check whether we can add a sentence based on a containing word


THRESHOLD = 250


# Split a sentence by whitespace.
def containingError(sent: str, itemSet: set) -> bool:
    """Boolean function that checks whether there is an intersection between key words of errors - used as a Boolean mask"""
    sentList = set(sent.split())
    if set(itemSet).intersection(sentList):
        return True
    return False
    # Append to a set.


los = [
    ("Omission", omission),
    ("Internal Inconsistency", set(sideConfusionDict.keys())),
    ("Transcription Error", set(nearHomonymDict.keys())),
]

for name, itemSet in los:
    filt = itemsToChange.apply(lambda row: containingError(row, itemSet))
    filteredDF = itemsToChange[filt].sample(THRESHOLD, random_state= RANDOM_SEED)
    print(f"Filtered by {name}:\n", filteredDF)
    # Remove items from itemsToChange
    itemsToChange.drop(filteredDF.index, inplace=True)
    filteredDF = filteredDF.to_frame("Original")
    filteredDF["Changed"] = np.nan
    filteredDF["ErrorType"] = np.nan
    filteredDF["ErrorExplanation"] = np.nan
    filteredDF["ErrorPhrases"] = np.nan
    filteredDF.to_csv(f"datasets/training_{name}.csv")



Filtered by Omission:
 4952     Clinical Information:\nCough, sore throat, fev...
4075     Clinical Information:\nNot given.\nTechnique:\...
5388     Clinical Information:\nNot given.\nTechnique:\...
11228    Clinical Information:\nNot given.\nTechnique:\...
2890     Clinical Information:\nNot given.\nTechnique:\...
                               ...                        
11422    Clinical Information:\nNot given.\nTechnique:\...
7193     Clinical Information:\nChronic cough.\nTechniq...
16275    Clinical Information:\nNot given.\nTechnique:\...
6009     Clinical Information:\nHeadache, weakness, mal...
16329    Clinical Information:\nfever, joint pain, mala...
Name: Correct Items, Length: 250, dtype: object
Filtered by Internal Inconsistency:
 6190     Clinical Information:\nNot given.\nTechnique:\...
17638    Clinical Information:\nNot given.\nTechnique:\...
4278     Clinical Information:\nOperated RCC, adrenal m...
3266     Clinical Information:\ncovid.\nTechnique:\nNon...
6445   

In [67]:
toAnnotate = ["errorFree", "Extraneous Statement"]
# These need to be hand-annotated as they may be harder to computationally add.

for error in toAnnotate:
    filteredDF = itemsToChange.sample(THRESHOLD, random_state=RANDOM_SEED)
    print(f"Filtered by {error}:\n", filteredDF)
    filteredDF = filteredDF.to_frame("Original")
    # Remove items from itemsToChange
    itemsToChange.drop(filteredDF.index, inplace=True)
    filteredDF["Changed"] = np.nan
    filteredDF["ErrorType"] = np.nan
    filteredDF["ErrorExplanation"] = np.nan
    filteredDF["ErrorPhrases"] = np.nan
    filteredDF.to_csv(f"datasets/training_{error}.csv")
itemsToChange.shape

Filtered by errorFree:
 5484     Clinical Information:\nNot given.\nTechnique:\...
12692    Clinical Information:\nNot given.\nTechnique:\...
6783     Clinical Information:\npneumonia?.\nTechnique:...
1582     Clinical Information:\nNot given.\nTechnique:\...
11776    Clinical Information:\nNot given.\nTechnique:\...
                               ...                        
10364    Clinical Information:\nCough, chest pain.\nTec...
8889     Clinical Information:\nNot given.\nTechnique:\...
13119    Clinical Information:\nNot given.\nTechnique:\...
12704    Clinical Information:\nNot given.\nTechnique:\...
15043    Clinical Information:\nHeadache, weakness and ...
Name: Correct Items, Length: 250, dtype: object
Filtered by Extraneous Statement:
 2633     Clinical Information:\nColon Ca, infection in ...
3350     Clinical Information:\nNot given.\nTechnique:\...
899      Clinical Information:\nNot given.\nTechnique:\...
10749    Clinical Information:\ndyspnea\nTechnique:\nNo...
6839    

(21445,)

In [None]:
ii_dataset = pd.read_csv("datasets/training_Internal Inconsistency.csv")

dataToChange = ii_dataset["Original"]

nltk.sent_tokenize(dataToChange[0])

['Clinical Information:\nNot given.', 'Technique:\n1.5 mm thick non-contrast sections were taken in the axial plane.', 'Findings:\nTrachea and lumen of both main bronchi are open.', 'No occlusive pathology was detected in the trachea and lumen of both main bronchi.', 'Mediastinal structures were evaluated as suboptimal since the examination was unenhanced.', 'As far as can be observed: A lymph node of 18x13 mm was observed in the anterior mediastinum.', 'Calibration of thoracic main vascular structures is natural.', 'No dilatation was detected in the thoracic aorta.', 'Heart size increased.', 'Pericardial thickening-effusion was not detected.', 'Thoracic esophagus calibration was normal and no significant pathological wall thickening was detected in the examination borders.', 'No lymph node was detected in mediastinal and bilateral hilar pathological size and appearance.', 'When evaluated in the parenchyma window of both lungs: A mosaic attenuation pattern was observed in both lung par