## CLIF Mapper

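An ML model (TF-IDF text features + logistic regression) that maps the free-text microbiology organism values in your EHR data to CLIF mCIDE organism names and categories.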

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [4]:
# Location of your CLIF-1.0 directory.
# Set the CLIF_ROOT environment variable to point at your own checkout;
# the literal path below is only the fallback used when CLIF_ROOT is unset or empty.
root_location = os.environ.get('CLIF_ROOT') or '/Users/kavenchhikara/Desktop/CLIF/CLIF-1.0'
# mCIDE vocabulary CSVs (organism and fluid) under the ucmc folder of that directory.
organism_data_path = f"{root_location}/mCIDE/ucmc/clif_vocab_microbiology_organism_ucmc.csv"
fluid_data_path = f"{root_location}/mCIDE/ucmc/clif_vocab_microbiology_fluid_ucmc.csv"

In [5]:
# Load Data
# Read the organism vocabulary CSV at organism_data_path into a DataFrame.
organism_data = pd.read_csv(organism_data_path)

In [6]:
# Preview the first rows of the loaded table.
organism_data.head()

Unnamed: 0,ord_value,organism_name,organism_category
0,^acinetobacter baumanii for susceptibility res...,acinetobacter_baumanii,"acinetobacter (baumanii, calcoaceticus, lwoffi..."
1,^acinetobacter baumanii for susceptibility res...,acinetobacter_baumanii,"acinetobacter (baumanii, calcoaceticus, lwoffi..."
2,^acinetobacter baumanii in both antibiotic res...,acinetobacter_baumanii,"acinetobacter (baumanii, calcoaceticus, lwoffi..."
3,"10,000 acinetobacter baumanii",acinetobacter_baumanii,"acinetobacter (baumanii, calcoaceticus, lwoffi..."
4,"10,000 cgt acinetobacter baumanii",acinetobacter_baumanii,"acinetobacter (baumanii, calcoaceticus, lwoffi..."


In [7]:
#Clean data
# ord_value is the text passed to the vectorizer in the next step, so rows without it cannot be used.
organism_data = organism_data.dropna(subset=['ord_value'])  # Remove rows with NaN in 'ord_value'

In [8]:
# Preprocess Data
# Turn each raw ord_value string into a TF-IDF feature vector (English stop words removed).
# NOTE(review): the vectorizer is fit on every row before the train/test split,
# so held-out rows also contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(organism_data['ord_value'])
# Two label columns per row: the organism name and the organism category.
y_organism = organism_data['organism_name']
y_category = organism_data['organism_category']

In [9]:
# Check Class Distribution
# Number of rows per organism_name label.
print(y_organism.value_counts())

organism_name
staphylococcus_aureus        311
pseudomonas_aeruginosa       238
no growth                    125
enterococcus_faecium         114
enterococcus_faecalis        102
                            ... 
chromobacterium_violaceum      1
chryseobacterium_gleum         1
bacteroides_massiliensis       1
chryseobacterium_sp            1
capnocytophaga_sputigena       1
Name: count, Length: 942, dtype: int64


In [10]:
# Number of rows per organism_category label.
print(y_category.value_counts())

organism_category
other bacteria                                              477
staphylococcus (coag +)                                     317
enterococcus (all species)                                  306
pseudomonas (all species except cepacia and maltophilia)    275
streptococcus (all species except enterococcus)             200
                                                           ... 
adenovirus                                                    1
herpes zoster (chicken pox, varicella)                        1
enterovirus (coxsackie, echo, polio)                          1
cytomegalovirus (cmv)                                         1
cryptosporidium                                               1
Name: count, Length: 74, dtype: int64


In [11]:
# Split Data
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
# Only the organism_name labels are split here; y_category is not used by the model.
X_train, X_test, y_train, y_test = train_test_split(X, y_organism, test_size=0.2, random_state=42)

In [12]:
# Train Model
# Logistic regression with scikit-learn's default settings, fit on the training split.
# NOTE(review): defaults include the iteration cap — confirm the fit converges on this many classes.
model = LogisticRegression()
model.fit(X_train, y_train)

In [13]:
# Predict
# Predicted organism_name for every held-out row.
y_pred = model.predict(X_test)

In [14]:
# Per-class precision/recall/F1 as a text report.
# zero_division=0 sets any metric with a zero denominator to 0.
results = classification_report(y_test, y_pred, zero_division=0)

In [15]:
# Evaluate
# Show the classification report text stored in `results`.
print(results)

                                                            precision    recall  f1-score   support

                                            abiotrophia_sp       0.00      0.00      0.00         1
                               achromobacter_denitrificans       0.00      0.00      0.00         1
                                          achromobacter_sp       0.00      0.00      0.00         1
                                achromobacter_xylosoxidans       0.60      1.00      0.75         3
                                        acidaminococcus_sp       0.00      0.00      0.00         1
                                    acinetobacter_baumanii       0.78      1.00      0.88        14
                                       acinetobacter_junii       0.00      0.00      0.00         1
                                      acinetobacter_parvus       0.00      0.00      0.00         1
                                          acinetobacter_sp       0.00      0.00      0.00         2

In [16]:
def predict_organism_name(ord_value):
    """Predict the organism name for one raw value and score how familiar the input is.

    Parameters
    ----------
    ord_value : str
        Raw organism text to map.

    Returns
    -------
    tuple
        ``(prediction, max_similarity_score)``: the label returned by the
        trained ``model`` and the highest cosine similarity between the
        input's TF-IDF vector and any row of ``X``. The score is a text
        similarity, not a model accuracy or a class probability.
    """
    # Preprocess the input
    input_vector = vectorizer.transform([ord_value])

    # Predict using the trained model
    prediction = model.predict(input_vector)[0]

    # Score against every known vocabulary entry (X), not only the random 20%
    # held out in X_test: the closest known entry may sit in the training split,
    # and predict_organism_category already compares against the full corpus.
    similarity_scores = cosine_similarity(input_vector, X).flatten()
    max_similarity_score = similarity_scores.max()

    return prediction, max_similarity_score

In [18]:
# Example 1
ord_value = "acid fast bacilli"
# predict_organism_name returns (predicted label, max cosine similarity); the value
# printed as "Accuracy" is that similarity score, not a measured model accuracy.
result, accuracy = predict_organism_name(ord_value)
print(f"Predicted organism name: {result}, Accuracy: {round(accuracy,2)}")

Predicted organism name: tuberculosis (nos, afb, acid fast bacillus, koch bacillus), Accuracy: 0.88


In [19]:
#Example 2
ord_value = "400,000 ACINETOBACTER BAUMANII STRAIN 2"
# As in the first example, the printed "Accuracy" is the max cosine similarity
# returned by predict_organism_name, not a measured model accuracy.
result, accuracy = predict_organism_name(ord_value)
print(f"Predicted organism name: {result}, Accuracy: {round(accuracy,2)}")

Predicted organism name: acinetobacter_baumanii, Accuracy: 1.0


In [20]:
def predict_organism_category(ord_value):
    """Predict organism name and category for one raw value, with a similarity score.

    Parameters
    ----------
    ord_value : str
        Raw organism text to map.

    Returns
    -------
    tuple
        ``(predicted_organism_name, organism_category, max_similarity_score)``.
        The category is the one recorded in ``organism_data`` on the first row
        whose ``organism_name`` equals the prediction. The score is the highest
        cosine similarity between the input's TF-IDF vector and any row of
        ``X``; it is a text similarity, not a model accuracy.
    """
    # Preprocess the input
    input_vector = vectorizer.transform([ord_value])

    # Predict using the trained model
    predicted_organism_name = model.predict(input_vector)[0]

    # Lookup the organism category
    name_matches = organism_data['organism_name'] == predicted_organism_name
    organism_category = organism_data.loc[name_matches, 'organism_category'].values[0]

    # Calculate similarity score.
    # X is the TF-IDF matrix of organism_data['ord_value'] built when the
    # vectorizer was fit, so reuse it instead of re-vectorizing the whole
    # corpus on every call (this function runs once per input row).
    similarity_scores = cosine_similarity(input_vector, X).flatten()
    max_similarity_score = np.max(similarity_scores)

    return predicted_organism_name, organism_category, max_similarity_score

def process_input_file(input_file, output_file):
    """Map every ``ord_value`` in a CSV file and write the predictions to a new CSV.

    Parameters
    ----------
    input_file : str
        Path of a CSV file that contains an ``ord_value`` column.
    output_file : str
        Path the augmented CSV is written to (without the index).

    Raises
    ------
    ValueError
        If the input file has no ``ord_value`` column.

    Notes
    -----
    Three columns are added: ``predicted_organism_name``,
    ``predicted_organism_category`` and ``accuracy``. The ``accuracy`` column
    holds the similarity score returned by ``predict_organism_category``; the
    column name is kept for compatibility with existing consumers of the file.
    Rows whose ``ord_value`` is missing are kept, with empty predictions.
    """
    # Read input file
    input_data = pd.read_csv(input_file)

    # Ensure the input column is named 'ord_value'
    if 'ord_value' not in input_data.columns:
        raise ValueError("Input file must contain 'ord_value' column")

    # Initialize lists to store results
    predictions = []
    categories = []
    accuracies = []

    # Iterate over each ord_value and predict
    for ord_value in input_data['ord_value']:
        if pd.isna(ord_value):
            # Nothing to vectorize for a missing cell: keep the row, leave results empty.
            predictions.append(None)
            categories.append(None)
            accuracies.append(np.nan)
            continue

        # read_csv may parse purely numeric cells as int/float, but the
        # vectorizer only accepts text.
        prediction, category, accuracy = predict_organism_category(str(ord_value))
        predictions.append(prediction)
        categories.append(category)
        accuracies.append(accuracy)

    # Add results to DataFrame
    input_data['predicted_organism_name'] = predictions
    input_data['predicted_organism_category'] = categories
    input_data['accuracy'] = accuracies

    # Write results to output file
    input_data.to_csv(output_file, index=False)

In [21]:
# Map every ord_value in test.csv and write the inputs plus predictions to test_predictions.csv.
# Both paths are relative to the notebook's working directory.
input_file = 'test.csv'
output_file = 'test_predictions.csv'
process_input_file(input_file, output_file)