# Identifying Entities in Healthcare Data

In [None]:
import functools
import os
import pathlib

# Install necessary libraries
!pip install pycrf
!pip install sklearn-crfsuite

import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pandas as pd

# Load spaCy's small English pipeline once; it is reused below for all POS
# tagging. spacy.load raises if the "en_core_web_sm" package is not installed.
model = spacy.load("en_core_web_sm")


In [None]:

# Read train and test data
def _read_lines(path):
    """Return every line of the text file at *path*, line endings included."""
    with open(path, 'r') as handle:
        return handle.readlines()


# One entry per line of each file; blank lines are kept because they are
# used later as sentence separators.
train_words = _read_lines('train_sent')
train_labels_by_word = _read_lines('train_label')
test_words = _read_lines('test_sent')
test_labels_by_word = _read_lines('test_label')


In [None]:

# Print the raw line counts of the sentence and label files side by side so a
# mismatch is visible. Nothing is asserted here: this is a manual check, and
# the counts include the blank separator lines.
print(f"Train words: {len(train_words)}, Train labels: {len(train_labels_by_word)}")
print(f"Test words: {len(test_words)}, Test labels: {len(test_labels_by_word)}")

# Convert token lists to sentences
def convert_to_sentences(dataset):
    """Join one-token-per-line input into space-separated sentences.

    Args:
        dataset: Iterable of lines, one token (or label) per line, where a
            line equal to ``'\\n'`` marks the end of a sentence.

    Returns:
        A list of strings, one per sentence, with tokens separated by a
        single space. Consecutive separator lines yield empty strings, so
        a sentence file and its label file stay index-aligned.
    """
    sentences = []
    tokens = []
    for entity in dataset:
        if entity != '\n':
            tokens.append(entity.strip())
        else:
            sentences.append(" ".join(tokens).strip())
            tokens = []

    # Input that does not end with a blank separator line still carries a
    # final sentence in `tokens`; emit it instead of silently dropping it.
    trailing = " ".join(tokens).strip()
    if trailing:
        sentences.append(trailing)
    return sentences

# Rebuild whole sentences (and their label sequences) from the per-token lines.
train_sentences = convert_to_sentences(train_words)
train_labels = convert_to_sentences(train_labels_by_word)
test_sentences = convert_to_sentences(test_words)
test_labels = convert_to_sentences(test_labels_by_word)

# Preview at most five sentence/label pairs per split. Slicing (unlike
# indexing with range(5)) does not raise IndexError on a smaller dataset.
print("First 5 training sentences and labels:")
for preview_sentence, preview_labels in zip(train_sentences[:5], train_labels[:5]):
    print(preview_sentence, "\n", preview_labels, "\n")

print("First 5 test sentences and labels:")
for preview_sentence, preview_labels in zip(test_sentences[:5], test_labels[:5]):
    print(preview_sentence, "\n", preview_labels, "\n")

# Sentence and label counts
print(f"Train sentences: {len(train_sentences)}, Test sentences: {len(test_sentences)}")
print(f"Train labels: {len(train_labels)}, Test labels: {len(test_labels)}")

# Combine datasets for analysis
combined = train_sentences + test_sentences
print(f"Combined sentences: {len(combined)}")


In [None]:

# Collect every token that spaCy tags as a noun or proper noun, together with
# its tag, across the combined train + test sentences.
target_tags = ['NOUN', 'PROPN']
tagged_tokens = [
    (token.text, token.pos_)
    for sent in combined
    for token in model(sent)
    if token.pos_ in target_tags
]
noun_propn = [text for text, _ in tagged_tokens]
pos_tag = [tag for _, tag in tagged_tokens]

print(f"NOUN/PROPN tokens: {len(noun_propn)}")

# Frequency table of the collected tokens.
noun_pos = pd.DataFrame({"NOUN_PROPN": noun_propn, "POS_tag": pos_tag})
top_counts = noun_pos["NOUN_PROPN"].value_counts().head(25)
print("Top 25 NOUN/PROPN tokens:")
print(top_counts)

# POS tagging example: tag one word on its own, then tag the full sentence
# so the two results can be compared.
position = 2
sentence = train_sentences[1]
words = sentence.split()
word = words[position]

print(f"Sentence: {sentence}")
print(f"POS tag (isolated): {model(word)[0].pos_}")

print("POS tags (contextual):")
for token in model(sentence):
    print(f"{token.text} -- {token.pos_}")


In [None]:

# Function for contextual POS tagging
@functools.lru_cache(maxsize=256)
def _pos_tags_for_tokens(tokens):
    """Return one POS tag per word in *tokens* (a tuple of strings).

    The words are handed to spaCy as an already-tokenized ``Doc`` so the
    pipeline cannot re-split them (e.g. at punctuation or contractions);
    the i-th tag therefore always belongs to the i-th input word. Results
    are cached because the tagger is queried once per word of a sentence.
    """
    doc = spacy.tokens.Doc(model.vocab, words=list(tokens))
    # Run the loaded pipeline components on the pre-tokenized Doc.
    for _, component in model.pipeline:
        doc = component(doc)
    return tuple(token.pos_ for token in doc)


def contextual_pos_tagger(sent_list, position):
    """Return the in-context POS tag of the word at *position*.

    Args:
        sent_list: The sentence as a list of words.
        position: Index into ``sent_list`` of the word to tag.

    Returns:
        The spaCy coarse POS tag (``token.pos_``) of that word, or ``None``
        when ``position`` is not a valid non-negative index.
    """
    tags = _pos_tags_for_tokens(tuple(sent_list))
    if 0 <= position < len(tags):
        return tags[position]
    return None

# Feature extraction for one word
def _shape_features(prefix, token):
    """Return the upper-case / digit / capitalised flags of *token* as features."""
    return [
        f'{prefix}.isupper={token.isupper()}',
        f'{prefix}.isdigit={token.isdigit()}',
        f'{prefix}.startsWithCapital={token[0].isupper()}',
    ]


def get_features_for_one_word(sent_list, position):
    """Build the CRF feature strings for the word at *position*.

    The list always starts with the ``word.lower=`` feature, followed by the
    POS tag, the 3- and 2-character suffixes and the shape flags of the word.
    Words after the first also get the previous word's lower-case form, POS
    tag and shape flags; the first word gets ``'BEG'`` instead. ``'END'`` is
    appended for the last word of the sentence.
    """
    current = sent_list[position]
    features = [
        'word.lower=' + current.lower(),
        'word.postag=' + contextual_pos_tagger(sent_list, position),
        'word[-3:]=' + current[-3:],
        'word[-2:]=' + current[-2:],
    ]
    features += _shape_features('word', current)

    if position <= 0:
        features.append('BEG')
    else:
        previous = sent_list[position - 1]
        features.append('prev_word.lower=' + previous.lower())
        features.append('prev_word.postag=' + contextual_pos_tagger(sent_list, position - 1))
        features += _shape_features('prev_word', previous)

    if position == len(sent_list) - 1:
        features.append('END')
    return features

# Feature extraction for one sentence
def get_features_for_one_sentence(sentence):
    """Return the per-word feature lists for a whitespace-separated sentence."""
    tokens = sentence.split()
    sentence_features = []
    for index, _ in enumerate(tokens):
        sentence_features.append(get_features_for_one_word(tokens, index))
    return sentence_features

# Label extraction for one sentence
def get_labels_for_one_sentence(labels):
    """Split a whitespace-separated label string into a list of labels."""
    label_sequence = labels.split()
    return label_sequence

# Sanity check on one training sentence: show the sentence, the features of
# its first two words, and its label sequence.
example_sentence = train_sentences[5]
print(example_sentence)
example_features = get_features_for_one_sentence(example_sentence)
print(example_features[:2])

example_labels = get_labels_for_one_sentence(train_labels[5])
print(example_labels)


In [None]:

# Turn every sentence into per-word feature lists and every label string
# into a label list, for both splits.
X_train = list(map(get_features_for_one_sentence, train_sentences))
X_test = list(map(get_features_for_one_sentence, test_sentences))
Y_train = list(map(get_labels_for_one_sentence, train_labels))
Y_test = list(map(get_labels_for_one_sentence, test_labels))

# Fit the CRF on the training split.
crf = sklearn_crfsuite.CRF(max_iterations=300)
crf.fit(X_train, Y_train)

# Predict labels for the test split and report the weighted F1 over all tokens.
Y_pred = crf.predict(X_test)
weighted_f1 = metrics.flat_f1_score(Y_test, Y_pred, average='weighted')
print("F1 score:", weighted_f1)

# Show one test sentence with its gold labels, predicted labels and features.
print(f"Sentence: {test_sentences[13]}")
print(f"Actual: {Y_test[13]}")
print(f"Predicted: {Y_pred[13]}")
print(X_test[13])


In [None]:

# Extract diseases and treatments
def _group_entities(labels, sentence_features):
    """Merge runs of consecutive 'D' / 'T' tokens into entity phrases.

    Args:
        labels: Predicted label per word of one sentence.
        sentence_features: Feature lists of the same sentence; the first
            feature of each word is ``'word.lower=<token>'``.

    Returns:
        ``(diseases, treatments)``: two lists of space-joined, lower-cased
        phrases in order of appearance.
    """
    phrases = {'D': [], 'T': []}
    run_label = None
    run_words = []
    for label, word_features in zip(labels, sentence_features):
        if label != run_label:
            # The label changed: close the entity run collected so far.
            if run_words:
                phrases[run_label].append(" ".join(run_words))
            run_label, run_words = label, []
        if label in phrases:
            # maxsplit=1 keeps tokens that themselves contain '=' intact.
            run_words.append(word_features[0].split('=', 1)[1])
    if run_words:
        phrases[run_label].append(" ".join(run_words))
    return phrases['D'], phrases['T']


# Map every predicted disease phrase to the treatment phrases predicted in
# the same sentence(s).
disease_treatment = {}
for predicted_labels, sentence_features in zip(Y_pred, X_test):
    diseases, treatments = _group_entities(predicted_labels, sentence_features)
    # dict.fromkeys de-duplicates diseases repeated within one sentence so
    # that sentence's treatments are recorded only once per disease.
    for disease in dict.fromkeys(diseases):
        # Each disease owns its own list; extending one entry must never
        # leak treatments into another disease's entry.
        disease_treatment.setdefault(disease, []).extend(treatments)

# Clean dictionary: keep only diseases with at least one treatment.
cleaned_dict = {k: v for k, v in disease_treatment.items() if v}


In [None]:

# Tabulate the mapping: one row per disease, with its list of treatments.
disease_column = list(cleaned_dict.keys())
treatment_column = list(cleaned_dict.values())
cleaned_df = pd.DataFrame({"Disease": disease_column, "Treatments": treatment_column})
print(cleaned_df.head())


In [None]:

# Search treatments for a specific disease
# The keys of cleaned_dict are built from the lower-cased 'word.lower='
# feature, so the search term must be lower-case as well.
search_item = 'hereditary retinoblastoma'
# .get with a default avoids a KeyError for a disease that was not extracted;
# in that case nothing is printed after the colon.
treatments = cleaned_dict.get(search_item, [])
print(f"Treatments for '{search_item}': {', '.join(treatments)}")
