In [13]:
import boto3
import pandas as pd

# Define the S3 bucket and key
bucket = "medical-data-merged"
key = "cleaned_encounter_data.csv"

# Create a Boto3 S3 client
s3 = boto3.client('s3')

# Load the CSV from S3 into a pandas DataFrame
s3_uri = f"s3://{bucket}/{key}"
df = pd.read_csv(s3_uri, low_memory=False)

print("test")
# Show the first few rows
df.head(100)


test


Unnamed: 0,ENCOUNTER_ID,PATIENT_ID,GENDER,DOB,ENCOUNTER_REASON,REASONDESCRIPTION,CONDITIONS,ALLERGIES,MEDICATIONS,PROCEDURES,...,Tumor marker Cancer,US Guidance for biopsy of Prostate,Urea Nitrogen,WBC Auto (Bld) [#/Vol],Walnut IgE Ab in Serum,Weight difference [Mass difference] --pre dialysis - post dialysis,Weight-for-length Per age and sex,Wheat IgE Ab in Serum,White oak IgE Ab in Serum,pH of Urine by Test strip
0,d0c40d10-8d87-447e-836e-99d26ad52ea5,034e9e3b-2def-4559-bb2a-7850888ae060,M,1983-11-14,Encounter for symptom,Acute bronchitis (disorder),,,,,...,,,,,,,,,,
1,e88bc3a9-007c-405e-aabc-792a38f4aa2b,034e9e3b-2def-4559-bb2a-7850888ae060,M,1983-11-14,General examination of patient (procedure),,,,,,...,,,,,,,,,,
2,8f104aa7-4ca9-4473-885a-bba2437df588,1d604da9-9a81-4ba9-80c2-de3375d59b40,M,1989-05-25,Encounter for symptom,Sinusitis (disorder),Chronic sinusitis (disorder),,,,...,,,,,,,,,,
3,b85c339a-6076-43ed-b9d0-9cf013dec49d,1d604da9-9a81-4ba9-80c2-de3375d59b40,M,1989-05-25,General examination of patient (procedure),,,,,,...,,,,,,,,,,
4,dae2b7cb-1316-4b78-954f-fa610a6c6d0e,10339b10-3cd1-4ac3-ac13-ec26728cb592,M,1992-06-02,General examination of patient (procedure),,,,,Medication Reconciliation (procedure),...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,eb616162-da15-44f7-a195-53ed4c4c1047,b58731cc-2d8b-4c2d-b327-4cab771af3ef,F,1970-05-16,Consultation for treatment,,,,168 HR Ethinyl Estradiol 0.00146 MG/HR / norel...,,...,,,,,,,,,,
96,49bded60-fccf-470b-867d-988137f27555,b58731cc-2d8b-4c2d-b327-4cab771af3ef,F,1970-05-16,Consultation for treatment,,,,Yaz 28 Day Pack,,...,,,,,,,,,,
97,b3782078-50b5-45d9-b54f-4834a849e516,b58731cc-2d8b-4c2d-b327-4cab771af3ef,F,1970-05-16,General examination of patient (procedure),,,,,,...,,,,,,,,,,
98,543d1f93-4329-414f-b3fb-076b886825f0,b58731cc-2d8b-4c2d-b327-4cab771af3ef,F,1970-05-16,Consultation for treatment,,,,Levora 0.15/30 28 Day Pack,,...,,,,,,,,,,


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# -------------------------------
# 1. Load CSV from S3
# -------------------------------
bucket = "medical-data-merged"
key = "cleaned_encounter_data.csv"
s3_uri = f"s3://{bucket}/{key}"

df = pd.read_csv(s3_uri, low_memory=False)

# -------------------------------
# 2. Clean and Prepare Data
# -------------------------------
# Drop missing labels
df = df.dropna(subset=["CONDITIONS"])

# Keep top 30 most common conditions
top_conditions = df["CONDITIONS"].value_counts().nlargest(30).index
df = df[df["CONDITIONS"].isin(top_conditions)]

# Merge symptom-related text into one column
df[["REASONDESCRIPTION", "ENCOUNTER_REASON"]] = df[["REASONDESCRIPTION", "ENCOUNTER_REASON"]].fillna("")
df["symptoms"] = df["REASONDESCRIPTION"] + " " + df["ENCOUNTER_REASON"]

# Features and label
X = df["symptoms"]
y = df["CONDITIONS"]

# -------------------------------
# 3. Train/Test Split and Vectorization
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# -------------------------------
# 4. Train the Model
# -------------------------------
model = LogisticRegression(max_iter=200)
model.fit(X_train_tfidf, y_train)

# -------------------------------
# 5. Evaluate the Model
# -------------------------------
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, zero_division=0))

# -------------------------------
# 6. Save for Deployment
# -------------------------------
joblib.dump(model, "model.joblib")
joblib.dump(vectorizer, "vectorizer.joblib")
print("✅ Model and vectorizer saved.")


                                             precision    recall  f1-score   support

       Acute bacterial sinusitis (disorder)       0.00      0.00      0.00        19
                Acute bronchitis (disorder)       1.00      1.00      1.00       104
         Acute viral pharyngitis (disorder)       1.00      1.00      1.00       136
                          Anemia (disorder)       0.00      0.00      0.00        31
    Body mass index 30+ - obesity (finding)       0.32      0.99      0.49        90
Chronic congestive heart failure (disorder)       1.00      1.00      1.00        13
  Chronic intractable migraine without aura       1.00      1.00      1.00        19
                               Chronic pain       1.00      1.00      1.00         8
               Chronic sinusitis (disorder)       0.57      1.00      0.73        41
   Concussion with no loss of consciousness       0.00      0.00      0.00        14
                     Coronary Heart Disease       0.00      0.00

In [14]:
# import boto3

# bucket = "medical-data-merged"  # your S3 bucket
# model_key = "models/model.joblib"

# s3 = boto3.client('s3')
# s3.upload_file("model.joblib", bucket, model_key)

# model_s3_uri = f"s3://{bucket}/{model_key}"
# print("✅ Model uploaded to:", model_s3_uri)


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sagemaker.sklearn import SKLearnModel
from sagemaker import get_execution_role
import joblib

bucket = "medical-data-merged"
key = "cleaned_encounter_data.csv"
s3_uri = f"s3://{bucket}/{key}"
df = pd.read_csv(s3_uri, low_memory=False)

df = df.dropna(subset=['REASONDESCRIPTION', "CONDITIONS"])
features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

df[["REASONDESCRIPTION", "ENCOUNTER_REASON"]] = df[["REASONDESCRIPTION", "ENCOUNTER_REASON"]].fillna("")
df["symptoms"] = df["REASONDESCRIPTION"] + " " + df["ENCOUNTER_REASON"]

X = df["symptoms"]
y = df["CONDITIONS"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=200)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, zero_division=0))

joblib.dump(model, 'model.joblib')  # Save the model

                                                                                   precision    recall  f1-score   support

                                             Acute bacterial sinusitis (disorder)       0.00      0.00      0.00        13
                                                      Acute bronchitis (disorder)       0.98      1.00      0.99       100
                                               Acute viral pharyngitis (disorder)       0.99      1.00      1.00       139
                                                                Anemia (disorder)       0.00      0.00      0.00         1
                                                             Antepartum eclampsia       0.62      1.00      0.77         5
                                                                     Appendicitis       0.80      1.00      0.89         4
                                                                Atopic dermatitis       1.00      1.00      1.00         3
               

['model.joblib']