# Analysing the training data:

The training data was taken from the CT-RATE reports.
We can create synthetic data by finding which items we can change.

We want a set of items that is just error-free. This must be analysed to check whether they are error-free or not, or maybe modified such that they are error-free. 

We also want 4 datasets that contain errors. These errors are internal inconsistencies, extraneous statements, transcription errors and omissions.

In [2]:
import pandas as pd
from pprint import pprint
import numpy as np
import nltk
import random
from schema import RadiologyErrors, RadiologyError, ErrorType

# Detokenises the sentence back together
twd = nltk.TreebankWordDetokenizer()
df = pd.read_csv("datasets/training_data1.csv")

itemsToChange = df["Correct Items"]

print(f"Current number of data to change: {len(itemsToChange)}")

RANDOM_SEED = 42
random.seed(RANDOM_SEED)


Current number of data to change: 23695


In [3]:
# Define what words can be used to find errors:

internalInconsistency = [
    ["anterior", "posterior"],
    ["medial", "lateral"],
    ["superior", "inferior"],
    [
        "anterolateral",
        "posterolateral",
        "supralateral",
        "infralateral",
        "anterosuperior",
        "posterosuperior",
        "anteroposterior",
    ],
    [
        "anteromedial",
        "posteromedial",
        "supramedial",
        "inframedial",
        "anteroinferior",
        "posteroinferior",
        "posteroanterior",
    ],
    [
        "anterior-lateral",
        "posterior-lateral",
        "superior-lateral",
        "inferior-lateral",
        "anterior-superior",
        "posterior-superior",
    ],
    [
        "anterior-medial",
        "posterior-medial",
        "superior-medial",
        "inferior-medial",
        "anterior-inferior",
        "posterior-inferior",
    ],
    ["anterior-posterior", "medial-lateral", "superior-inferior"],
    ["dorsal-ventral", "transverse", "craniocaudal"],
    ["cranial", "caudal"],
    ["hepatopedal", "hepatofugal"],
    ["dorsal", "ventral"],
    ["proximal", "distal"],
    ["long axis", "short axis"],
    ["peripheral", "central"],
    ["superficial", "deep"],
    ["metaphysis", "diaphysis", "epiphysis"],
    ["ascending", "descending"],
    ["increase", "decrease"],
    ["increased", "decreased"],
    ["basal", "apical"],
    ["hyperdense", "hypodense"],
    ["solid", "cystic"],
    ["dependent", "non-dependent"],
    ["upper", "lower"],
]

transcription = [
    ["abscess", "access", "assess"],
    ["achalasia", "atelectasis", "epistaxis"],
    ["adrenal", "renal"],
    ["alveolar", "valcular", "lobular", "tubular"],
    ["aneurysm", "anaplasia", "anemia"],
    ["anterolisthesis", "retrolisthesis", "spondylolisthesis"],
    ["ascites", "cystitis", "bursitis", "colitic"],
    ["aspiration", "eventration"],
    ["atheroma", "myxoma", "osteoma", "lipoma"],
    ["borderline", "baseline"],
    ["bronchiectasis", "bronchitis", "bronchiolitis", "bronchi"],
    ["bronchogenic", "bronchiolitic", "bronchoscopic"],
    ["bullous", "mucous"],
    ["calcified", "ossified", "classified"],
    ["carcinomatosis,sarcomatosis", "carcinosis", "sarcoidosis"],
    ["cm", "mm", "m"],
    ["consolidation", "accumulation", "congestion", "compaction", "obstruction"],
    ["consolidative", "accumulative", "congestive", "obstructive"],
    ["coronary", "coronal", "coronoid", "coracoid", "corneal"],
    ["corpuscles", "corpus", "corvus", "corpse"],
    ["cortical", "corticoid", "corticate"],
    ["craniocaudal", "craniocervical", "craniobasal"],
    ["cyst", "gist", "list", "fist"],
    ["cystic", "systolic", "caustic", "cyclic", "plastic"],
    ["degenerative", "regenerative", "destructive"],
    ["diaphragm", "diagram", "diaphysis"],
    ["edematous", "erythematous", "emphysematous"],
    ["effusion", "confusion", "diffusion", "perfusion", "occlusion"],
    ["empyema", "emphysema", "haematoma", "endothelium"],
    ["endobronchial", "endotracheal"],
    ["esophagogastric", "esophagocolic"],
    ["esophagus", "esophagitis"],
    [
        "fibrosis",
        "stenosis",
        "sclerosis",
        "synostosis",
        "cyanosis",
        "thrombosis",
        "necrosis",
        "nephrosis",
        "silicosis",
        "cirrhosis",
        "asbestosis",
        "aspergillosis",
        "kyphosis",
        "lordosis",
        "mycosis",
    ],
    [
        "fibrotic",
        "stenotic",
        "sclerotic",
        "cyanotic",
        "thrombotic",
        "necrotic",
        "nephrotic",
        "cirrhotic",
    ],
    ["fissure", "fixture", "fisher", "fistula", "fossa"],
    ["fluid", "flutter", "fluctuant", "florid"],
    ["fracture", "friction", "contracture", "rapture"],
    ["fusiform", "reniform"],
    ["gastroesophageal", "gastroduodenal", "gastrojejunal", "gastroepiploic"],
    ["ground glass/ground-glass", "ground grass", "brown glass", "brown brass"],
    ["hemorrhagic", "hemostatic", "hemolytic"],
    ["hernia", "fistula", "myalgia"],
    ["herniation", "fistulation"],
    ["hilar", "hyoid", "hilum"],
    [
        "hypertension",
        "hypotension",
        "hyperextension",
        "hyperattenuation",
        "hypoattenuation",
    ],
    ["indeterminant", "intermittent"],
    ["inflammatory", "informatory", "inspiratory"],
    ["intrapulmonary", "intraperitoneal", "intramedullary", "intravascular"],
    ["lobular", "lobar"],
    ["lymphangitis", "pancreatitis", "adenitis"],
    ["lymphatic", "hepatic"],
    ["marrow", "narrow", "macro", "micro"],
    ["medullary", "modular"],
    ["metastasis", "metaphysis", "metanalysis", "metastases"],
    ["metastatic", "metaplastic", "myoclonic", "metabolic", "hyperplastic"],
    ["millimetric", "metric"],
    ["myocardial", "myocardium", "endocardial", "endocardium", "pericardium"],
    ["nodule", "module", "tuber"],
    ["non-specific", "non-systemic", "non-selective"],
    ["occlusive", "conclusive", "inclusive"],
    ["osteopenia", "sarcopenia"],
    ["osteopenic", "osteoporotic", "osteolytic", "osteopathic"],
    ["paratracheal", "paraoesophageal", "parabronchial", "pericardial"],
    ["parenchyma", "pneumonia"],
    ["pathological", "physiological", "psychological"],
    [
        "pericarditis",
        "endocarditis",
        "pleuritis",
        "perichondritis",
        "peritonitis",
        "pneumonitis",
    ],
    ["perivascular", "perihilar", "peribronchial"],
    ["plaque", "black", "plug"],
    ["pneumothorax", "hemothorax"],
    ["portal", "total", "pedal"],
    ["previous", "pervious", "pylorus"],
    ["pulmonary", "voluntary"],
    ["reticular", "auricular", "trabecular", "vesicular", "articular"],
    [
        "reticulation",
        "recirculation",
        "recalculation",
        "regulation",
        "strangulation",
        "ventilation",
        "speculation",
        "stipulation",
    ],
    ["retropulsion", "retroversion", "retroflexion", "reflexion", "expulsion"],
    ["sequela", "sclera", "stella"],
    ["sequelae", "sequestrae"],
    ["significant", "malignant", "magnificant", "consistent"],
    ["subphrenic", "subpleural", "subhepatic", "subdural"],
    ["suspicious", "surreptitious"],
    ["traction", "fraction", "action", "contraction", "reaction"],
    ["vascular", "valvular", "muscular", "vestibular", "molecular"],
    ["vocal", "focal", "vagal", "local"],
    ["lymphadenopathy", "adenopathy", "radiculopathy"],
    ["pleurodesis", "pleurocentesis"],
]

omission = [
    "no",
    "cannot",
    "clear",
    "clearly",
    "exclude",
    "excluded",
    "increase",
    "decrease",
    "significant",
    "more",
    "greater",
    "less",
]

extraneous = [
    "the total",
    "quina",
    "management",
    "office",
    "staircase",
    "hesitation",
    "umbrella",
    "keyboard",
    "carriage",
]

# Convert side confusion and near_homonym into dictionaries.
sideConfusionDict = {}

for mistakeWords in internalInconsistency:
    for word in mistakeWords:
        # Create a set from the mistake words
        mistakeSet = set(mistakeWords)
        wordSet = {word}
        # print(f"Current word = {wordSet} - {mistakeSet - wordSet}")
        sideConfusionDict[word] = mistakeSet - wordSet

nearHomonymDict = {}

for homonyms in transcription:
    #   print(homonyms)
    for currentHomonym in homonyms:
        # print(currentHomonym)
        closeHomonymSet = set(homonyms)
        # print(f"Current word = {currentHomonym} - {closeHomonymSet - {currentHomonym}}")
        nearHomonymDict[currentHomonym] = closeHomonymSet - {currentHomonym}

# pprint(sideConfusionDict)
# pprint(nearHomonymDict)

In [4]:
# Look through the data and check whether we can add a sentence based on a containing word


THRESHOLD = 250


# Split a sentence by whitespace.
def containingError(sent: str, itemSet: set) -> bool:
    """Boolean function that checks whether there is an intersection between key words of errors - used as a Boolean mask"""
    sentList = set(sent.split())
    if set(itemSet).intersection(sentList):
        return True
    return False
    # Append to a set.


los = [
    ("Omission", omission),
    ("Internal Inconsistency", set(sideConfusionDict.keys())),
    ("Transcription Error", set(nearHomonymDict.keys())),
]

for name, itemSet in los:
    filt = itemsToChange.apply(lambda row: containingError(row, itemSet))
    filteredDF = itemsToChange[filt].sample(THRESHOLD, random_state=RANDOM_SEED)
    print(f"Filtered by {name}:\n", filteredDF)
    # Remove items from itemsToChange
    itemsToChange.drop(filteredDF.index, inplace=True)
    filteredDF = filteredDF.to_frame("Original")
    filteredDF["Changed"] = np.nan
    filteredDF["ErrorType"] = np.nan
    filteredDF["ErrorExplanation"] = np.nan
    filteredDF["ErrorPhrases"] = np.nan
    filteredDF.to_csv(f"datasets/training_{name}.csv")


Filtered by Omission:
 14543    Clinical Information:\npneumonia ?\nTechnique:...
2447     Clinical Information:\npneumonia\nTechnique:\n...
19135    Clinical Information:\nSick pneumonia compatib...
2839     Clinical Information:\nbronchiectasis\nTechniq...
8296     Clinical Information:\nWeakness, chills, shive...
                               ...                        
21290    Clinical Information:\nNot given.\nTechnique:\...
3230     Clinical Information:\nNot given.\nTechnique:\...
10758    Clinical Information:\nNot given.\nTechnique:\...
3531     Clinical Information:\nNot given.\nTechnique:\...
5048     Clinical Information:\nNot given.\nTechnique:\...
Name: Correct Items, Length: 250, dtype: object
Filtered by Internal Inconsistency:
 6121     Clinical Information:\nFall\nTechnique:\nNon-c...
14699    Clinical Information:\nNot given.\nTechnique:\...
3329     Clinical Information:\nNot given.\nTechnique:\...
21598    Clinical Information:\nBack pain\nTechnique:\n...
6859   

In [5]:
toAnnotate = ["errorFree", "Extraneous Statement"]
# These need to be hand-annotated as they may be harder to computationally add.

for error in toAnnotate:
    filteredDF = itemsToChange.sample(THRESHOLD, random_state=RANDOM_SEED)
    print(f"Filtered by {error}:\n", filteredDF)
    filteredDF = filteredDF.to_frame("Original")
    # Remove items from itemsToChange
    itemsToChange.drop(filteredDF.index, inplace=True)
    filteredDF["Changed"] = np.nan
    filteredDF["ErrorType"] = np.nan
    filteredDF["ErrorExplanation"] = np.nan
    filteredDF["ErrorPhrases"] = np.nan
    filteredDF.to_csv(f"datasets/training_{error}.csv")
itemsToChange.shape

Filtered by errorFree:
 19363    Clinical Information:\nNon hodgkin lymphoma\nT...
12988    Clinical Information:\npneumonia?\nTechnique:\...
2441     Clinical Information:\nNot given.\nTechnique:\...
20182    Clinical Information:\npneumonia?\nTechnique:\...
16756    Clinical Information:\nNot given.\nTechnique:\...
                               ...                        
2045     Clinical Information:\nCovid parenchyma involv...
19255    Clinical Information:\ndyspnea\nTechnique:\nNo...
19821    Clinical Information:\nNot given.\nTechnique:\...
6371     Clinical Information:\nCovid-19 pneumonia?\nTe...
17875    Clinical Information:\nAML, evaluation before ...
Name: Correct Items, Length: 250, dtype: object
Filtered by Extraneous Statement:
 20731    Clinical Information:\nNot given.\nTechnique:\...
6666     Clinical Information:\nNot given.\nTechnique:\...
15172    Clinical Information:\nCovid-19 pneumonia\nTec...
21269    Clinical Information:\nNot given.\nTechnique:\...
2335    

(22445,)

In [8]:
class ErrorInjector:
    """A class to perform synthetic error injection."""

    def __init__(self, errorType: ErrorType):
        self.errorType = errorType
        if self.errorType == ErrorType.InternalInconsistency:
            self.wordSet = set(sideConfusionDict.keys())
            self.wordDict = sideConfusionDict
        elif self.errorType == ErrorType.TranscriptionError:
            self.wordSet = set(nearHomonymDict.keys())
            self.wordDict = nearHomonymDict
        elif self.errorType == ErrorType.Omission:
            self.wordSet = set(omission)
        else:
            self.wordSet = None
            self.wordDict = None

    def reportCorrection(
        self, report: str
    ) -> tuple[ErrorType, list[str], list[str], str]:
        """Go through each sentence and check if there is a keyword, by turning it into a set. Use the intersection between the set + the sentence. Check whether each sentence in the dataset is correct."""
        tokenised = nltk.sent_tokenize(report)
        for index, sentence in enumerate(tokenised):
            intersect = self.wordSet.intersection(set(sentence.split(" ")))
            if len(intersect) > 0:
                # Take the first element of the sentence and use the replace function to find the replacement word.
                word = random.choice(list(intersect))
                if self.errorType != ErrorType.Omission:
                    # If the error type has word lists then choose a random thing.
                    replacementList = list(self.wordDict[word])
                    replace = random.choice(replacementList)
                    newSentence = sentence.replace(word, replace, 1)
                else:
                    newSentence = sentence.replace(word, "", 1)
                # print(f"Old sentence = {sentence}\nNew sentence = {newSentence}")
                tokenised[index] = newSentence
                break
        if self.errorType == ErrorType.Omission:
            reason = f"The word {word} was omitted."
        elif self.errorType == ErrorType.InternalInconsistency:
            reason = (
                f"There is an internal inconsistency, as {replace} should be {word}."
            )
        elif self.errorType == ErrorType.TranscriptionError:
            reason = f"There is a transcription error; {replace} should be written as {word}."
        else:
            raise Exception("There was an error in the ErrorType inputted.")
        newError = RadiologyError(
            errorType=self.errorType,
            errorPhrases=[newSentence],
            errorExplanation=[reason],
        )
        return (
            newError.errorType,
            newError.errorPhrases,
            newError.errorExplanation,
            twd.detokenize(tokenised),
        )

In [None]:
def addError(path: str, errorType: ErrorType):
    """Add the errors into the csv."""
    errorDataset = pd.read_csv(path)

    dataToChange = errorDataset["Original"]

    iiError = ErrorInjector(errorType=errorType)
    temp = dataToChange.apply(lambda s: iiError.reportCorrection(s))
    errorDataset["ErrorType"] = temp.apply(lambda e: e[0])
    errorDataset["ErrorPhrases"] = temp.apply(lambda e: e[1])
    errorDataset["ErrorExplanation"] = temp.apply(lambda e: e[2])
    errorDataset["Changed"] = temp.apply(lambda e: e[3])
    print(errorDataset)
    errorDataset.to_csv(path)


addError(
    "datasets/training_Internal Inconsistency.csv", ErrorType.InternalInconsistency
)
addError("datasets/training_Omission.csv", ErrorType.Omission)
addError("datasets/training_Transcription Error.csv", ErrorType.TranscriptionError)


     Unnamed: 0.1  Unnamed: 0  \
0               0        6121   
1               1       14699   
2               2        3329   
3               3       21598   
4               4        6859   
..            ...         ...   
245           245        6490   
246           246       14928   
247           247        9826   
248           248       14301   
249           249       11060   

                                              Original  \
0    Clinical Information:\nFall\nTechnique:\nNon-c...   
1    Clinical Information:\nNot given.\nTechnique:\...   
2    Clinical Information:\nNot given.\nTechnique:\...   
3    Clinical Information:\nBack pain\nTechnique:\n...   
4    Clinical Information:\nNot given.\nTechnique:\...   
..                                                 ...   
245  Clinical Information:\nNot given.\nTechnique:\...   
246  Clinical Information:\nNot given.\nTechnique:\...   
247  Clinical Information:\nNot given.\nTechnique:\...   
248  Clinical Informati