In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import re

In [55]:
import pandas as pd

clinical_data = {
    "Patient ID: 001, Age: 45, Diagnosis: Diabetes",
    "Patient ID: 002, Age: 60, Diagnosis: Hypertension",
    "Patient ID: 003, Age: 29, Diagnosis: Diabetes",
    "Patient ID: 004, Age: 40, Diagnosis: BloodPressure",
    "Patient ID: 005, Age: 52, Diagnosis: Depression"
}
#df = pd.DataFrame(clinical_data)
#print(df)

In [56]:
#Rule-Based Approach
def rule_based_extraction(data):
    return [re.findall(r'Patient ID: (\d+), Age: (\d+), Diagnosis: (\w+)', record) for record in data]


In [57]:
#Patterns-based approach using text patters
def pattern_based_extraction(data):
    patterns = ["Patient ID", "Age", "Diagnosis"]
    extracted_data = []
    for record in data:
        record_data = []
        for pattern in patterns:
            match = re.search(f'{pattern}: (\w+)', record)
            record_data.append(match.group(1) if match else None)
        extracted_data.append(tuple(record_data))
    return extracted_data

In [58]:
#Machine Learning approach using Naive Bayes
def machine_learning_extraction(data):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    y = [record.split(", ")[2].split(": ")[1] for record in data]
    clf = MultinomialNB().fit(X, y)
    predictions = clf.predict(X)
    return list(zip([record.split(", ")[0].split(": ")[1] for record in data], y, predictions))

In [59]:
#Extracting structured data using different approaches
rule_based_result = rule_based_extraction(clinical_data)
pattern_based_result = pattern_based_extraction(clinical_data)
machinelearning_based_result = machine_learning_extraction(clinical_data)

In [60]:
#Displaying the result
print("Rule-based Extraction:", rule_based_result)
print("Pattern-based Extraction:", pattern_based_result)
print("MachineLearning-based Extraction:", machinelearning_based_result)

Rule-based Extraction: [[('004', '40', 'BloodPressure')], [('001', '45', 'Diabetes')], [('003', '29', 'Diabetes')], [('002', '60', 'Hypertension')], [('005', '52', 'Depression')]]
Pattern-based Extraction: [('004', '40', 'BloodPressure'), ('001', '45', 'Diabetes'), ('003', '29', 'Diabetes'), ('002', '60', 'Hypertension'), ('005', '52', 'Depression')]
MachineLearning-based Extraction: [('004', 'BloodPressure', 'BloodPressure'), ('001', 'Diabetes', 'Diabetes'), ('003', 'Diabetes', 'Diabetes'), ('002', 'Hypertension', 'Hypertension'), ('005', 'Depression', 'Depression')]


In [62]:
import pprint

print("Rule-based Extraction:")
pprint.pprint(rule_based_result, indent=4, width=80)

print("\nPattern-based Extraction:")
pprint.pprint(pattern_based_result, indent=4, width=80)

print("\nMachine Learning Extraction:")
pprint.pprint(machinelearning_based_result, indent=4, width=80)

Rule-based Extraction:
[   [('004', '40', 'BloodPressure')],
    [('001', '45', 'Diabetes')],
    [('003', '29', 'Diabetes')],
    [('002', '60', 'Hypertension')],
    [('005', '52', 'Depression')]]

Pattern-based Extraction:
[   ('004', '40', 'BloodPressure'),
    ('001', '45', 'Diabetes'),
    ('003', '29', 'Diabetes'),
    ('002', '60', 'Hypertension'),
    ('005', '52', 'Depression')]

Machine Learning Extraction:
[   ('004', 'BloodPressure', 'BloodPressure'),
    ('001', 'Diabetes', 'Diabetes'),
    ('003', 'Diabetes', 'Diabetes'),
    ('002', 'Hypertension', 'Hypertension'),
    ('005', 'Depression', 'Depression')]
