Single mapping

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd

In [5]:
# Load dataset
df = pd.read_csv('icd_cpt_mapping.csv')
df.head()

Unnamed: 0,ICD Code,ICD Description,CPT Code,CPT Description
0,J44.9,"Heart failure, unspecified",84443,Immunization administration; 1 vaccine (single...
1,K21.9,Encounter for immunization,52000,Office or other outpatient visit for establish...
2,L03.90,"Cellulitis, unspecified",82947,Initial comprehensive preventive medicine eval...
3,M54.5,"Chronic obstructive pulmonary disease, unspeci...",95806,Immunization administration; 1 vaccine (single...
4,E66.9,"Coronary artery disease, unspecified",95806,Therapeutic exercises to develop strength and ...


In [7]:
# Combine ICD code and description for features
df['icd_combined'] = df['ICD Code'] + " " + df['ICD Description']
df['icd_combined']

0                       J44.9 Heart failure, unspecified
1                       K21.9 Encounter for immunization
2                         L03.90 Cellulitis, unspecified
3      M54.5 Chronic obstructive pulmonary disease, u...
4             E66.9 Coronary artery disease, unspecified
                             ...                        
995           M54.5 Coronary artery disease, unspecified
996            E11.9 Chronic kidney disease, unspecified
997          L03.90 Coronary artery disease, unspecified
998                    E78.5 Hyperlipidemia, unspecified
999                     N18.9 Heart failure, unspecified
Name: icd_combined, Length: 1000, dtype: object

In [8]:
# Vectorize the text data (TF-IDF)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['icd_combined'])
X

<1000x86 sparse matrix of type '<class 'numpy.float64'>'
	with 5447 stored elements in Compressed Sparse Row format>

In [11]:
# Target variable (CPT codes)
y = df['CPT Code']
y

0      84443
1      52000
2      82947
3      95806
4      95806
       ...  
995    99285
996    80050
997    93000
998    82947
999    94010
Name: CPT Code, Length: 1000, dtype: int64

In [12]:

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [23]:
# Sample input: combine the ICD code and its description
sample_input = "H66.9 Otitis Media, unspecified"

# Vectorize the input using the same vectorizer used during training
sample_input_vectorized = vectorizer.transform([sample_input])

# Predict the CPT code using the trained model
predicted_cpt = rf.predict(sample_input_vectorized)

predicted_cpt[0]

99285

In [21]:
# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       12001       0.00      0.00      0.00        10
       43239       0.00      0.00      0.00         8
       52000       0.17      0.06      0.08        18
       80050       0.00      0.00      0.00         4
       80053       0.14      0.20      0.17        10
       81001       0.11      0.10      0.11        10
       82947       0.17      0.11      0.13         9
       84443       0.20      0.08      0.12        12
       85025       0.10      0.09      0.10        11
       87880       0.09      0.07      0.08        15
       90471       0.00      0.00      0.00        13
       90832       0.00      0.00      0.00         6
       93000       0.11      0.11      0.11         9
       93306       0.00      0.00      0.00        11
       94010       0.00      0.00      0.00         5
       95806       0.00      0.00      0.00        10
       97110       0.05      0.10      0.06        10
       99213       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Multi label mapping

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

In [29]:
# Load the dataset
df = pd.read_csv('icd_cpt.csv')
df.head()

Unnamed: 0,ICD Code,ICD Description,CPT Codes
0,H66.9,"Otitis Media, unspecified","['93000', '99213', '52000', '12001']"
1,K21.9,Gastro-esophageal reflux disease without esoph...,"['80050', '52000']"
2,N39.0,"Urinary tract infection, site not specified","['94010', '52000', '99213']"
3,J06.9,"Acute upper respiratory infection, unspecified","['93010', '93000']"
4,K21.9,Gastro-esophageal reflux disease without esoph...,"['52000', '81003', '71020']"


In [30]:

# Combine ICD Code and Description
df['icd_combined'] = df['ICD Code'] + " " + df['ICD Description']
df['icd_combined']

0                        H66.9 Otitis Media, unspecified
1      K21.9 Gastro-esophageal reflux disease without...
2      N39.0 Urinary tract infection, site not specified
3      J06.9 Acute upper respiratory infection, unspe...
4      K21.9 Gastro-esophageal reflux disease without...
                             ...                        
995    J06.9 Acute upper respiratory infection, unspe...
996                           I10 Essential hypertension
997                              J44.9 COPD, unspecified
998                              J44.9 COPD, unspecified
999    N39.0 Urinary tract infection, site not specified
Name: icd_combined, Length: 1000, dtype: object

In [32]:
# Vectorize the text data (TF-IDF)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['icd_combined'])
X

<1000x48 sparse matrix of type '<class 'numpy.float64'>'
	with 5642 stored elements in Compressed Sparse Row format>

In [33]:
# Binarize the target labels (CPT Codes)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['CPT Codes'])
y

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [34]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# Train a Multi-Output Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_rf = MultiOutputClassifier(rf, n_jobs=-1)
multi_target_rf.fit(X_train, y_train)

In [36]:
# Predict on the test set
y_pred = multi_target_rf.predict(X_test)

# Inverse transform the predicted labels to CPT codes
predicted_cpt_codes = mlb.inverse_transform(y_pred)
actual_cpt_codes = mlb.inverse_transform(y_test)

# Print a few examples
for i in range(5):
    print(f"ICD Code + Description: {df['icd_combined'].iloc[i]}")
    print(f"Actual CPT Codes: {actual_cpt_codes[i]}")
    print(f"Predicted CPT Codes: {predicted_cpt_codes[i]}")
    print("-" * 40)

ICD Code + Description: H66.9 Otitis Media, unspecified
Actual CPT Codes: (' ', "'", ',', '0', '1', '3', '4', '5', '6', '8', '[', ']')
Predicted CPT Codes: (' ', "'", ',', '0', '1', '2', '3', '5', '9', '[', ']')
----------------------------------------
ICD Code + Description: K21.9 Gastro-esophageal reflux disease without esophagitis
Actual CPT Codes: (' ', "'", ',', '0', '1', '4', '5', '8', '9', '[', ']')
Predicted CPT Codes: (' ', "'", ',', '0', '1', '2', '3', '5', '9', '[', ']')
----------------------------------------
ICD Code + Description: N39.0 Urinary tract infection, site not specified
Actual CPT Codes: (' ', "'", ',', '0', '1', '2', '3', '5', '9', '[', ']')
Predicted CPT Codes: (' ', "'", ',', '0', '1', '2', '3', '5', '9', '[', ']')
----------------------------------------
ICD Code + Description: J06.9 Acute upper respiratory infection, unspecified
Actual CPT Codes: (' ', "'", ',', '0', '1', '2', '3', '5', '9', '[', ']')
Predicted CPT Codes: (' ', "'", ',', '0', '1', '2', '3'

In [37]:
# Sample input: combine the ICD code and description
sample_input = "H66.9 Otitis Media, unspecified"

# Vectorize the input using the same vectorizer used during training
sample_input_vectorized = vectorizer.transform([sample_input])

# Predict the CPT codes using the trained model
predicted_cpt = multi_target_rf.predict(sample_input_vectorized)

# Convert the prediction back to CPT codes
predicted_cpt_codes = mlb.inverse_transform(predicted_cpt)

print(f"Predicted CPT Codes for '{sample_input}': {predicted_cpt_codes[0]}")

Predicted CPT Codes for 'H66.9 Otitis Media, unspecified': (' ', "'", ',', '0', '1', '2', '3', '5', '9', '[', ']')


Final

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [44]:
# Load the dataset
df = pd.read_csv('icd_cpt_expanded.csv')

# Ensure all CPT Codes are in list format
def ensure_list(x):
    if isinstance(x, list):
        return x
    elif isinstance(x, str):
        return eval(x) if x.startswith('[') else [x]
    else:
        return [x]

df['CPT Codes'] = df['CPT Codes'].apply(ensure_list)

In [45]:
df['CPT Codes'] = df['CPT Codes'].apply(ensure_list)

# Combine ICD code and description into a single feature
df['ICD_combined'] = df['ICD Code'] + ' ' + df['ICD Description']

# Define input (X) and target (y)
X = df['ICD_combined']
y = df['CPT Codes']

In [46]:
# Use MultiLabelBinarizer to encode the target labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
# Create a pipeline with TF-IDF Vectorizer and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Convert text to TF-IDF features
    ('clf', OneVsRestClassifier(LogisticRegression()))  # Multi-label classification
])

# Train the model
pipeline.fit(X_train, y_train)


In [54]:
# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Convert CPT code labels to strings
target_names = list(map(str, mlb.classes_))

# Display the classification report
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

       12001       0.00      0.00      0.00        87
       36415       0.00      0.00      0.00       107
       52000       0.00      0.00      0.00        97
       71020       0.00      0.00      0.00       113
       80050       0.00      0.00      0.00        98
       81003       0.00      0.00      0.00        91
       93000       0.00      0.00      0.00       100
       93010       0.00      0.00      0.00       115
       94010       0.00      0.00      0.00       105
       99213       0.00      0.00      0.00        87

   micro avg       0.00      0.00      0.00      1000
   macro avg       0.00      0.00      0.00      1000
weighted avg       0.00      0.00      0.00      1000
 samples avg       0.00      0.00      0.00      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [57]:
# Example single input
single_input = "H66.9 - Otitis Media, unspecified"

# Get the predicted probabilities for the single input
single_pred_proba = pipeline.predict_proba([single_input])

# Adjust the threshold to 0.3
threshold = 0.3
single_pred_adjusted = (single_pred_proba >= threshold).astype(int)

# Convert the adjusted predictions back to CPT code labels
predicted_cpts_adjusted = mlb.inverse_transform(single_pred_adjusted)

# Display the adjusted result
print("Predicted CPT codes with adjusted threshold:", predicted_cpts_adjusted)

Predicted CPT codes with adjusted threshold: [()]


In [2]:
pip install docx

Note: you may need to restart the kernel to use updated packages.Collecting docx
  Downloading docx-0.2.4.tar.gz (54 kB)
     ---------------------------------------- 0.0/54.9 kB ? eta -:--:--
     ------- -------------------------------- 10.2/54.9 kB ? eta -:--:--
     -------------- ----------------------- 20.5/54.9 kB 217.9 kB/s eta 0:00:01
     ----------------------------------- -- 51.2/54.9 kB 435.7 kB/s eta 0:00:01
     -------------------------------------- 54.9/54.9 kB 410.0 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: docx
  Building wheel for docx (setup.py): started
  Building wheel for docx (setup.py): finished with status 'done'
  Created wheel for docx: filename=docx-0.2.4-py3-none-any.whl size=53903 sha256=1209f12b386649664a13e4c20aa71ac1e13e45d37d5771e775d7358d20cce639
  Stored in directory: c:\users\harish bhalaa\appdata\local\pip\cache\wheels\f3\ba\dd\43e


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
pip install exceptions

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement exceptions (from versions: none)
ERROR: No matching distribution found for exceptions

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
   ---------------------------------------- 0.0/244.3 kB ? eta -:--:--
   - -------------------------------------- 10.2/244.3 kB ? eta -:--:--
   --------- ----------------------------- 61.4/244.3 kB 825.8 kB/s eta 0:00:01
   -------------- ------------------------ 92.2/244.3 kB 871.5 kB/s eta 0:00:01
   ---------------------- --------------- 143.4/244.3 kB 853.3 kB/s eta 0:00:01
   ------------------------------ ------- 194.6/244.3 kB 985.7 kB/s eta 0:00:01
   -------------------------------------- 244.3/244.3 kB 936.3 kB/s eta 0:00:00
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import io
from docx import Document
from docx.shared import Inches

In [8]:
doc = Document()
doc.add_heading('Medical Reports', level=1)

<docx.text.paragraph.Paragraph at 0x1b4cc5e7e60>

In [13]:
report = [
('Heart Transplant', 'John Doe', 55, 'Male', 'Hypertension, Diabetes', 'Metformin, Lisinopril',
'End-stage heart failure', ['Echocardiogram: Ejection fraction of 25%', 'Cardiac MRI: Severe left ventricular dilation'],
['Heart transplant evaluation', 'Initiation of diuretics and beta-blockers'],
[5000, 2500, 200000, 15000]),
('Lung Cancer', 'Jane Smith', 62, 'Female', 'Smoker, COPD', 'Albuterol, Prednisone',
'Non-small cell lung cancer (NSCLC)', ['CT Scan: 3.5 cm mass', 'Biopsy: Adenocarcinoma'],
['Chemotherapy: carboplatin and paclitaxel', 'Palliative radiation therapy'],
[4000, 3000, 30000, 15000]),
('Eye Disease', 'Mark Johnson', 48, 'Male', 'Diabetes, Hypertension', 'Metformin, Lisinopril',
'Diabetic retinopathy', ['Fundoscopy: Microaneurysms', 'OCT: Retinal thickness of 400 µm'],
['Laser photocoagulation', 'Anti-VEGF therapy'],
[2000, 1500, 5000, 10000]),
('Kidney Failure', 'Lisa White', 60, 'Female', 'Hypertension, Diabetes', 'Lisinopril, Insulin',
'End-stage renal disease', ['Serum Creatinine: 6.5 mg/dL', 'Urinalysis: 4+ protein levels'],
['Dialysis', 'Kidney transplant evaluation'],
[3500, 2000, 30000, 5000]),
('Brain Tumor', 'Tom Brown', 45, 'Male', 'Headaches, Seizures', 'Anticonvulsants',
'Malignant brain tumor', ['MRI Scan: 4 cm mass', 'Biopsy: Glioblastoma multiforme'],
['Surgery: craniotomy', 'Radiation and chemotherapy'],
[6000, 4000, 50000, 15000])
]

In [18]:
for report in report:
    doc.add_heading(report, level=2)
    doc.add_paragraph(f'Patient Name: {report}')
    doc.add_paragraph(f'Age: {report}')
    doc.add_paragraph(f'Gender: {report}')
    doc.add_paragraph(f'Medical History: {report}')
    doc.add_paragraph(f'Current Medications: {report}')

TypeError: 'in <string>' requires string as left operand, not int

In [16]:
doc.add_heading('Diagnosis', level=3)
doc.add_paragraph(f'The patient was diagnosed with {report}.')
doc.add_paragraph('Diagnosis was confirmed through:')
for test in report:
    doc.add_paragraph(f'- {test}', style='ListBullet')

doc.add_heading('Treatment', level=3)
doc.add_paragraph('The patient is treated with:')
for treatment in report:
    doc.add_paragraph(f'- {treatment}', style='ListBullet')

doc.add_heading('Medical Costs', level=3)
cost_sum = sum(report)
doc.add_paragraph('Breakdown of Costs:')
for cost in report:
    doc.add_paragraph(f'$ {cost}', style='ListBullet')
doc.add_paragraph(f'Total Cost: $ {cost_sum}')
doc.add_paragraph()  # Add space between reports

TypeError: unsupported operand type(s) for +: 'int' and 'tuple'