In [None]:
%pip install scikit-learn==1.6.1 matplotlib seaborn pandas

Collecting scikit-learn==1.6.1
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
                                              0.0/11.1 MB ? eta -:--:--
     -                                        0.5/11.1 MB 14.2 MB/s eta 0:00:01
     ----                                     1.3/11.1 MB 16.9 MB/s eta 0:00:01
     -------                                  2.0/11.1 MB 16.1 MB/s eta 0:00:01
     ----------                               2.8/11.1 MB 16.3 MB/s eta 0:00:01
     -------------                            3.8/11.1 MB 17.3 MB/s eta 0:00:01
     -----------------                        4.8/11.1 MB 17.9 MB/s eta 0:00:01
     --------------------                     5.7/11.1 MB 18.4 MB/s eta 0:00:01
     ------------------------                 6.9/11.1 MB 19.2 MB/s eta 0:00:01
     -----------------------------            8.1/11.1 MB 20.0 MB/s eta 0:00:01
     ---------------------------------        9.4/11.1 MB 20.7 MB/s eta 0:00:01
     -----------------------


[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: C:\Users\Marco\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Exploratory data analysis

As usual, we start by taking at look at the dataset.

* IDS_mapping.csv file isn't structured as a normal CSV because it contains multiple tables within one file

In [None]:
# Load the data
ids_mapping = pd.read_csv("./data/IDS_mapping.csv")
pd.set_option('display.max_rows', None)  # Display all rows
ids_mapping

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available
5,6,
6,7,Trauma Center
7,8,Not Mapped
8,,
9,discharge_disposition_id,description


🔍 Solution: Read and Split the File Correctly
Since Pandas can’t read it directly, we need to:

1. Read the file as raw text.
2. Split it into separate dataframes based on blank lines.
3. Process and clean each dataframe.

In [None]:
# Read the raw file
with open("./data/IDS_mapping.csv", "r") as f:
    lines = f.read().split("\n")  # Split by line

📌 Identify the Different Tables in "IDS_mapping.csv"

Each section has:

* A header (column names) followed by data.
* A blank line separating different tables.

We need to extract:

1. admission_type_id mapping
2. discharge_disposition_id mapping
3. admission_source_id mapping

In [None]:
# Find table starting points
tables = []
current_table = []

for line in lines:
    if line != ',':  # If line is NOT a comma
        current_table.append(line)
    else:  # If empty line, save previous table and start new
        if current_table:
            tables.append(current_table)
            current_table = []

# Add last table if exists
if current_table:
    tables.append(current_table)

# Check how many tables were found
print(f"Found {len(tables)} tables")
admission_type_id = tables[0]
discharge_disposition_id = tables[1]
admission_source_id = tables[2]

print(admission_type_id)
print(discharge_disposition_id) 
print(admission_source_id)


Found 3 tables
['admission_type_id,description', '1,Emergency', '2,Urgent', '3,Elective', '4,Newborn', '5,Not Available', '6,NULL', '7,Trauma Center', '8,Not Mapped']
['discharge_disposition_id,description', '1,Discharged to home', '2,Discharged/transferred to another short term hospital', '3,Discharged/transferred to SNF', '4,Discharged/transferred to ICF', '5,Discharged/transferred to another type of inpatient care institution', '6,Discharged/transferred to home with home health service', '7,Left AMA', '8,Discharged/transferred to home under care of Home IV provider', '9,Admitted as an inpatient to this hospital', '10,Neonate discharged to another hospital for neonatal aftercare', '11,Expired', '12,Still patient or expected to return for outpatient services', '13,Hospice / home', '14,Hospice / medical facility', '15,Discharged/transferred within this institution to Medicare approved swing bed', '16,Discharged/transferred/referred another institution for outpatient services', '17,Disc

In [34]:
columns =  admission_type_id[0].split(",")
print(columns)

admission_type_id_data = admission_type_id[1:]
print(admission_type_id_data)

['admission_type_id', 'description']
['1,Emergency', '2,Urgent', '3,Elective', '4,Newborn', '5,Not Available', '6,NULL', '7,Trauma Center', '8,Not Mapped']


In [46]:
import csv

# Split the list into rows (each entry in the list represents a row in the CSV)
rows = [row.split(',') for row in admission_type_id]

# Define the output CSV file name
output_file_admission_type_id = './output_data/admission_type_id.csv'

# Write to CSV
with open(output_file_admission_type_id, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)

print(f"CSV file '{output_file_admission_type_id}' created successfully.")

CSV file './output_data/admission_type_id.csv' created successfully.


In [41]:
import numpy as np

cols_admission_type_id = np.array(columns)
data_admission_type_id = np.array(admission_type_id_data)

print(cols_admission_type_id.shape)
print(data_admission_type_id.T.shape)

df_admission_type_id = pd.DataFrame(data=data_admission_type_id, columns=cols_admission_type_id)

# Print the resulting DataFrame
print(df_admission_type_id)


(2,)
(8,)


ValueError: Shape of passed values is (8, 1), indices imply (8, 2)

In [None]:
# Load dataset
diabetic_data = pd.read_csv("./data/diabetic_data.csv")

# Display first few rows
display(diabetic_data.head(), ids_mapping.head())

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available


In [None]:
print("Diabetic Data Columns:", diabetic_data.columns.tolist())
print("IDS Mapping Columns:", ids_mapping.columns.tolist())

Diabetic Data Columns: ['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code', 'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted']
IDS Mapping Columns: ['admission_type_id', 'description']


In [None]:
# Find common columns
common_cols = set(diabetic_data.columns) & set(ids_mapping.columns)
print("Common Columns:", common_cols)

Common Columns: {'admission_type_id'}


In [None]:
# Ensure both 'admission_type_id' columns have the same type
diabetic_data["admission_type_id"] = diabetic_data["admission_type_id"].astype(str)
ids_mapping["admission_type_id"] = ids_mapping["admission_type_id"].astype(str)

# Merge datasets for EDA
merged_df = diabetic_data.merge(ids_mapping, on="admission_type_id", how="left")

# Show updated dataset with descriptions
display(merged_df.head())

ValueError: You are trying to merge on int64 and object columns for key 'admission_type_id'. If you wish to proceed you should use pd.concat

In [None]:
# IDS_mapping.csv - contains the labels i am guessing.