In [None]:
from faker import Faker
import pandas as pd
import numpy as np
import random
import numpy as np
# Initialize Faker for generating fake categorical data.
# NOTE(review): `fake` is not referenced anywhere else in the visible notebook — confirm it is still needed.
fake = Faker()

In [None]:
!pip install faker

Collecting faker
  Downloading Faker-30.8.2-py3-none-any.whl.metadata (15 kB)
Downloading Faker-30.8.2-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.8 MB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m28.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-30.8.2


In [None]:
# Read the reference CSV that the rest of the notebook cleans and samples from
file_path = 'clean_balance_data.csv'
original_data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Inspect the distinct gender labels present in the raw data
original_data['gender'].unique()

array(['female', 'unknown', 'male'], dtype=object)

In [None]:
# Relabel the 'unknown' gender category as 'others'
original_data['gender'] = original_data['gender'].replace({'unknown': 'others'})

# List the gender labels again to confirm the relabel
original_data['gender'].unique()

array(['female', 'others', 'male'], dtype=object)

In [None]:
# Shorten the 'emergency room admission' label to 'emergency room'
original_data['admission_location'] = original_data['admission_location'].replace(
    {'emergency room admission': 'emergency room'}
)

# List the distinct admission locations to confirm the relabel
original_data['admission_location'].unique()

array(['emergency room',
       'transfer from another hospital or external facility',
       'physician referral / normal delivery',
       'clinic referral / premature issue',
       'transfer from skilled nursing facility'], dtype=object)

In [None]:
# Define a function to clean the 'prior_admissions' column
def clean_prior_admissions(value):
    """Convert a raw ``prior_admissions`` entry into a whole number of admissions.

    Parameters
    ----------
    value : str, number or missing
        Raw cell value: ``'no'`` (any case, surrounding whitespace ignored), text whose
        first whitespace-separated token is a number, a plain number, or a missing value.

    Returns
    -------
    int or float
        ``0`` for ``'no'``; otherwise the leading number rounded to the nearest integer;
        ``np.nan`` when the value is missing or has no parsable leading number.
    """
    if pd.isna(value):
        return np.nan
    # Coerce to text first so numeric cells are parsed instead of raising
    # AttributeError on .strip() (which previously happened outside the try block).
    text = str(value).strip()
    if text.lower() == 'no':
        return 0
    try:
        # Extract the leading numeric token and round it to the nearest integer
        return int(round(float(text.split()[0])))
    except (IndexError, ValueError, OverflowError):
        # IndexError: empty text; ValueError: non-numeric token or NaN; OverflowError: infinity
        return np.nan

# Normalise every 'prior_admissions' entry with clean_prior_admissions
original_data['prior_admissions'] = original_data['prior_admissions'].map(clean_prior_admissions)

# Tally entries that could not be parsed, and collect the distinct cleaned values
nan_count = original_data['prior_admissions'].isna().sum()
cleaned_values = original_data['prior_admissions'].unique()

# Show both so the cleaning can be verified at a glance
(nan_count, cleaned_values)

(0, array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 13]))

In [None]:
# Display the cleaned 'prior_admissions' column
original_data['prior_admissions']

Unnamed: 0,prior_admissions
0,0
1,0
2,0
3,0
4,0
...,...
231,1
232,1
233,0
234,0


In [None]:
# Lab summary columns that should be stored with two-decimal precision
columns_to_round = ['lab_mean_50902', 'lab_mean_50882', 'lab_mean_50868', 'lab_mean_50912',
                    'lab_last_50902', 'lab_last_50882', 'lab_last_50868', 'lab_last_50912']

# Round all listed columns in a single vectorized assignment
original_data[columns_to_round] = original_data[columns_to_round].round(2)

# Print the first rows of the rounded columns to check the result
print(original_data[columns_to_round].head())

   lab_mean_50902  lab_mean_50882  lab_mean_50868  lab_mean_50912  \
0          100.38           26.25           16.75            4.70   
1          112.37           23.95            9.68            0.56   
2           99.00           29.00           13.00            1.70   
3          101.50           28.20           12.90            0.42   
4           94.00           14.00           34.00            5.80   

   lab_last_50902  lab_last_50882  lab_last_50868  lab_last_50912  
0           104.0            24.0            15.0             3.7  
1           117.0            25.0             7.0             0.9  
2           100.0            29.0            13.0             1.7  
3           102.0            28.0            10.0             0.4  
4            83.0            12.0            46.0             4.0  


In [None]:
# Tally the rows whose discharge location is 'deceased'
number_of_deceased = int(original_data['discharge_location'].eq('deceased').sum())

print(f"Number of deceased patients: {number_of_deceased}")

Number of deceased patients: 40


In [None]:
# Keep only the rows for deceased patients
deceased_data = original_data.loc[original_data['discharge_location'].eq('deceased')]

# Frequency of each prior-admissions count among deceased patients
prior_admissions_count = deceased_data['prior_admissions'].value_counts()

# Order by the number of prior admissions rather than by frequency
prior_admissions_count_sorted = prior_admissions_count.sort_index()

# Report the breakdown
print("Number of deceased patients by prior admissions:")
print(prior_admissions_count_sorted)

Number of deceased patients by prior admissions:
prior_admissions
0    33
1     7
Name: count, dtype: int64


In [None]:
# Drop rows where discharge location indicates the patient has died.
# .copy() makes the filtered result an independent DataFrame, so later column
# assignments on it (e.g. rounding 'age_at_admission') do not raise SettingWithCopyWarning.
original_data = original_data[original_data['discharge_location'] != 'deceased'].copy()

# Confirm the operation by checking the unique values in 'discharge_location'
remaining_discharge_locations = original_data['discharge_location'].unique()

# Display the remaining unique discharge locations and the shape of the new dataset
remaining_discharge_locations, original_data.shape

(array(['home health care', 'skilled nursing facility',
        'rehabilitation or distinct part hospital', 'home',
        'hospice care at home', 'transferred to psychiatric hospital',
        'home with IV services', 'long-term care hospital',
        'intermediate care facility'], dtype=object),
 (196, 32))

In [None]:
# Round the 'age_at_admission' column to the nearest whole integer.
# assign() builds a new DataFrame instead of writing into a filtered slice, which
# avoids the SettingWithCopyWarning that a direct column assignment produces here.
original_data = original_data.assign(
    age_at_admission=original_data['age_at_admission'].round().astype(int)
)

# Display the unique values to confirm the rounding
original_data['age_at_admission'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_data['age_at_admission'] = original_data['age_at_admission'].round().astype(int)


array([71, 74, 77, 78, 88, 82, 76, 85, 83, 28, 70, 64, 73, 66, 81, 65, 67,
       87, 86, 49, 55, 44, 80, 54, 45, 53])

In [None]:
# Summarize column dtypes and non-null counts of the cleaned frame
original_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 196 entries, 0 to 235
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   subject_id                  196 non-null    float64
 1   hadm_id                     196 non-null    float64
 2   admission_type              196 non-null    object 
 3   admission_location          196 non-null    object 
 4   discharge_location          196 non-null    object 
 5   insurance                   196 non-null    object 
 6   marital_status              196 non-null    object 
 7   diagnosis                   196 non-null    object 
 8   has_chartevents_data        196 non-null    object 
 9   readmitted_within_30_days   196 non-null    object 
 10  gender                      196 non-null    object 
 11  age_at_admission            196 non-null    int64  
 12  lab_mean_50868              196 non-null    float64
 13  lab_mean_50882              196 non-null

In [None]:
# Persist the cleaned frame to 'clean.csv', leaving out the index column
original_data.to_csv(path_or_buf='clean.csv', index=False)

In [None]:
# Reload the cleaned data from disk; `df` is the reference frame used by the
# synthetic-data generation cells below
df = pd.read_csv('clean.csv')

In [None]:
# Seed NumPy's global generator so every random draw in this cell (and in later cells
# that use np.random) is reproducible under Restart & Run All
np.random.seed(42)

# Define the size of synthetic data to generate
num_samples = 1000

# Pools of categorical values, taken from the cleaned reference data
# (gender is listed explicitly rather than read from `df`)
demographics = {
    "gender": ["male", "female", "others"],
    "marital_status": df["marital_status"].dropna().unique(),
    "insurance": df["insurance"].dropna().unique(),
    "admission_type": df["admission_type"].dropna().unique(),
    "admission_location": df["admission_location"].dropna().unique(),
    "discharge_location": df["discharge_location"].dropna().unique()
}


def _sample_like(column):
    """Draw ``num_samples`` normal values using ``df[column]``'s mean/std, clipped to its observed range.

    Clipping keeps draws inside [min, max] of the reference column, so the generator
    cannot emit values such as negative lab results that never occur in the reference data.
    """
    observed = df[column]
    draws = np.random.normal(observed.mean(), observed.std(), num_samples)
    return np.clip(draws, observed.min(), observed.max())


# Initialize synthetic data dictionary: sequential identifiers, uniformly sampled
# categoricals, normally distributed age, and a 30/70 yes/no readmission label
synthetic_data = {
    "subject_id": np.arange(1, num_samples + 1),
    "hadm_id": np.arange(100000, 100000 + num_samples),
    "gender": np.random.choice(demographics["gender"], num_samples),
    "marital_status": np.random.choice(demographics["marital_status"], num_samples),
    "insurance": np.random.choice(demographics["insurance"], num_samples),
    "admission_type": np.random.choice(demographics["admission_type"], num_samples),
    "admission_location": np.random.choice(demographics["admission_location"], num_samples),
    "discharge_location": np.random.choice(demographics["discharge_location"], num_samples),
    # Age is left unclipped here; a later cell removes rows with age above 98
    "age_at_admission": np.random.normal(df["age_at_admission"].mean(), df["age_at_admission"].std(), num_samples),
    "readmitted_within_30_days": np.random.choice(["yes", "no"], num_samples, p=[0.3, 0.7])
}

# Lab results and vital signs, generated around the reference data's mean/std
lab_columns = [col for col in df.columns if col.startswith("lab_")]
vital_columns = [col for col in df.columns if col.startswith("vital_")]

for measurement_col in lab_columns + vital_columns:
    synthetic_data[measurement_col] = _sample_like(measurement_col)

# Additional columns with specific labels
synthetic_data["diagnosis"] = np.random.choice(df["diagnosis"].dropna().unique(), num_samples)
# Assume up to 5 prior admissions; randint's upper bound is exclusive, so 6 is needed to include 5
synthetic_data["prior_admissions"] = np.random.randint(0, 6, num_samples)
synthetic_data["length_of_stay_label"] = np.random.choice(df["length_of_stay_label"].dropna().unique(), num_samples)
synthetic_data["medication_diversity_label"] = np.random.choice(df["medication_diversity_label"].dropna().unique(), num_samples)
synthetic_data["emergency_duration_label"] = np.random.choice(df["emergency_duration_label"].dropna().unique(), num_samples)

# Convert to DataFrame and preview the first rows
synthetic_df = pd.DataFrame(synthetic_data)
synthetic_df.head()

Unnamed: 0,subject_id,hadm_id,gender,marital_status,insurance,admission_type,admission_location,discharge_location,age_at_admission,readmitted_within_30_days,...,vital_mean_224690,vital_last_220045,vital_last_220179,vital_last_223762,vital_last_224690,diagnosis,prior_admissions,length_of_stay_label,medication_diversity_label,emergency_duration_label
0,1,100000,male,married,medicare,emergency,physician referral / normal delivery,rehabilitation or distinct part hospital,68.510717,yes,...,18.307382,78.577336,109.833173,36.293157,20.187447,brain metastasis; metastatic melanoma,1,long stay of 13 days,low medication diversity of 16 unique medications,short duration of 3.05 hours
1,2,100001,male,unknown (default),medicaid,urgent,physician referral / normal delivery,home,76.99913,yes,...,18.100136,76.146639,108.328157,36.293266,19.114767,pyelonephritis; urinary tract infection,1,medium stay of 6 days,moderate medication diversity of 36 unique med...,short duration of 3.92 hours
2,3,100002,female,divorced,medicare,emergency,transfer from skilled nursing facility,long-term care hospital,101.958656,no,...,17.442871,84.676146,108.918043,36.296912,19.348394,elevated liver functions; p liver transplant,4,short stay of 1 days,low medication diversity of 19 unique medications,short duration of 3.47 hours
3,4,100003,others,married,medicare,elective,emergency room,hospice care at home,75.37535,yes,...,23.087079,96.884534,115.726145,36.29329,18.290065,abscess,3,very long stay of 124 days,moderate medication diversity of 42 unique med...,short duration of 1.40 hours
4,5,100004,male,unknown,medicaid,emergency,physician referral / normal delivery,transferred to psychiatric hospital,78.379346,yes,...,16.645199,83.024581,129.859433,36.306749,17.723892,pneumonia; sepsis; telemetry,2,very long stay of 40 days,moderate medication diversity of 45 unique med...,short duration of 3.92 hours


In [None]:
# Show (rows, columns) of the freshly generated synthetic frame
synthetic_df.shape

(1000, 31)

In [None]:
# Inspect the distinct discharge locations in the synthetic data
synthetic_df['discharge_location'].unique()

array(['rehabilitation or distinct part hospital', 'home',
       'long-term care hospital', 'hospice care at home',
       'transferred to psychiatric hospital', 'home with IV services',
       'intermediate care facility', 'home health care',
       'skilled nursing facility'], dtype=object)

In [None]:
# Relabel 'hospice care at home' as 'hospital care at home'.
# NOTE(review): "hospice" and "hospital" read as different discharge categories —
# confirm this relabel is intentional rather than a typo.
synthetic_df['discharge_location'] = synthetic_df['discharge_location'].replace(
    {'hospice care at home': 'hospital care at home'}
)

# List the discharge locations again to confirm the relabel
synthetic_df['discharge_location'].unique()

array(['rehabilitation or distinct part hospital', 'home',
       'long-term care hospital', 'hospital care at home',
       'transferred to psychiatric hospital', 'home with IV services',
       'intermediate care facility', 'home health care',
       'skilled nursing facility'], dtype=object)

In [None]:
# Count rows that are exact repeats of an earlier row in the synthetic data
duplicate_rows = synthetic_df.duplicated(keep='first').sum()
print("Number of duplicate rows:", duplicate_rows)

Number of duplicate rows: 0


In [None]:
# Preview up to the first 50 synthetic rows
synthetic_df.head(50)

Unnamed: 0,subject_id,hadm_id,gender,marital_status,insurance,admission_type,admission_location,discharge_location,age_at_admission,readmitted_within_30_days,...,vital_mean_224690,vital_last_220045,vital_last_220179,vital_last_223762,vital_last_224690,diagnosis,prior_admissions,length_of_stay_label,medication_diversity_label,emergency_duration_label
0,1,100000,male,married,medicare,emergency,physician referral / normal delivery,rehabilitation or distinct part hospital,68.510717,yes,...,18.307382,78.577336,109.833173,36.293157,20.187447,brain metastasis; metastatic melanoma,1,long stay of 13 days,low medication diversity of 16 unique medications,short duration of 3.05 hours
1,2,100001,male,unknown (default),medicaid,urgent,physician referral / normal delivery,home,76.99913,yes,...,18.100136,76.146639,108.328157,36.293266,19.114767,pyelonephritis; urinary tract infection,1,medium stay of 6 days,moderate medication diversity of 36 unique med...,short duration of 3.92 hours
2,3,100002,female,divorced,medicare,emergency,transfer from skilled nursing facility,long-term care hospital,101.958656,no,...,17.442871,84.676146,108.918043,36.296912,19.348394,elevated liver functions; p liver transplant,4,short stay of 1 days,low medication diversity of 19 unique medications,short duration of 3.47 hours
3,4,100003,others,married,medicare,elective,emergency room,hospital care at home,75.37535,yes,...,23.087079,96.884534,115.726145,36.29329,18.290065,abscess,3,very long stay of 124 days,moderate medication diversity of 42 unique med...,short duration of 1.40 hours
4,5,100004,male,unknown,medicaid,emergency,physician referral / normal delivery,transferred to psychiatric hospital,78.379346,yes,...,16.645199,83.024581,129.859433,36.306749,17.723892,pneumonia; sepsis; telemetry,2,very long stay of 40 days,moderate medication diversity of 45 unique med...,short duration of 3.92 hours
5,6,100005,others,married,medicaid,elective,transfer from skilled nursing facility,long-term care hospital,88.491184,no,...,21.905515,78.038833,135.774963,36.300512,18.956613,hypoglycemia,4,long stay of 17 days,high medication diversity of 54 unique medicat...,moderate duration of 4.22 hours
6,7,100006,male,widowed,private,elective,transfer from skilled nursing facility,home with IV services,80.477914,no,...,20.244522,76.020637,129.769372,36.286592,18.910242,heart failure,1,long stay of 10 days,moderate medication diversity of 40 unique med...,moderate duration of 4.18 hours
7,8,100007,male,unknown,medicaid,elective,emergency room,home,68.213579,no,...,19.435224,74.955767,130.0489,36.303883,17.050203,pneumonia,0,medium stay of 6 days,low medication diversity of 20 unique medications,moderate duration of 5.40 hours
8,9,100008,female,divorced,private,urgent,emergency room,long-term care hospital,76.964308,no,...,17.082218,85.032821,127.329837,36.305144,19.068067,hypoglycemia; pneumonia; syncope,2,short stay of 1 days,low medication diversity of 19 unique medications,short duration of 3.28 hours
9,10,100009,others,single,medicare,elective,transfer from skilled nursing facility,home,55.123972,no,...,28.160194,90.368501,119.661641,36.297401,17.296469,asthma; copd flare,0,short stay of 2 days,low medication diversity of 18 unique medications,short duration of 1.43 hours


In [None]:
# Inspect the distinct marital-status labels in the synthetic data
synthetic_df['marital_status'].unique()

array(['married', 'unknown (default)', 'divorced', 'unknown', 'widowed',
       'single', 'separated'], dtype=object)

In [None]:
# Collapse the "unknown (default)" label into "unknown" in the reference frame
df["marital_status"] = df["marital_status"].replace({"unknown (default)": "unknown"})

# Distinct marital-status labels that remain after the merge
marital_status_options = df["marital_status"].dropna().unique()

# Redraw every synthetic row's marital status uniformly from the standardized labels
synthetic_df["marital_status"] = np.random.choice(marital_status_options, size=len(synthetic_df))


In [None]:
# Confirm that only the standardized marital-status labels remain
synthetic_df['marital_status'].unique()

array(['unknown', 'widowed', 'separated', 'divorced', 'single', 'married'],
      dtype=object)

In [None]:
# Lab summary columns that should be stored with two-decimal precision
columns_to_round = ['lab_mean_50902', 'lab_mean_50882', 'lab_mean_50868', 'lab_mean_50912',
                    'lab_last_50902', 'lab_last_50882', 'lab_last_50868', 'lab_last_50912']

# Round all listed columns of the synthetic frame in one vectorized assignment
synthetic_df[columns_to_round] = synthetic_df[columns_to_round].round(2)

# Print the first rows of the rounded columns to check the result
print(synthetic_df[columns_to_round].head())

   lab_mean_50902  lab_mean_50882  lab_mean_50868  lab_mean_50912  \
0          109.01           25.36           12.68            0.17   
1          106.95           34.80           11.96            1.33   
2          104.15           25.05           16.73            0.61   
3          103.76           33.53           12.31            0.98   
4          101.16           29.00           11.46           -0.56   

   lab_last_50902  lab_last_50882  lab_last_50868  lab_last_50912  
0          114.08           34.97           16.74            1.35  
1          103.26           30.30           12.79            0.26  
2           99.10           26.43            8.46            0.09  
3           93.10           29.46           18.60            0.26  
4           94.54           18.79           17.02            1.40  


In [None]:
# Convert synthetic ages to whole years: round to zero decimals, then cast to int
synthetic_df['age_at_admission'] = synthetic_df['age_at_admission'].round(0).astype(int)

# List the distinct ages to confirm they are now integers
synthetic_df['age_at_admission'].unique()

array([ 69,  77, 102,  75,  78,  88,  80,  68,  55,  53,  83,  61,  73,
        64,  56,  74,  99,  51,  72,  86,  71,  65,  62,  79,  42,  81,
        70,  76,  52,  60,  90,  63,  49,  67,  85,  82,  87,  54,  84,
        97,  48,  66,  58,  92,  98,  59,  89,  93,  96,  43,  91,  95,
        45,  94,  50,  57,  46, 101,  47, 100])

In [None]:
# Vital-sign summary columns that should be stored with two-decimal precision
columns_to_round = ['vital_mean_220045', 'vital_mean_220179', 'vital_mean_223762', 'vital_mean_224690',
                    'vital_last_220045', 'vital_last_220179', 'vital_last_223762', 'vital_last_224690']

# Round all listed columns of the synthetic frame in one vectorized assignment
synthetic_df[columns_to_round] = synthetic_df[columns_to_round].round(2)

# Print the first rows of the rounded columns to check the result
print(synthetic_df[columns_to_round].head())

   vital_mean_220045  vital_mean_220179  vital_mean_223762  vital_mean_224690  \
0              97.42             116.25              37.44              18.31   
1              74.19             138.76              37.45              18.10   
2              84.18             125.30              37.45              17.44   
3              66.12             119.59              37.44              23.09   
4              89.52             130.69              37.44              16.65   

   vital_last_220045  vital_last_220179  vital_last_223762  vital_last_224690  
0              78.58             109.83              36.29              20.19  
1              76.15             108.33              36.29              19.11  
2              84.68             108.92              36.30              19.35  
3              96.88             115.73              36.29              18.29  
4              83.02             129.86              36.31              17.72  


In [None]:
# Select the synthetic rows whose age exceeds 98
age_above_98 = synthetic_df.loc[synthetic_df['age_at_admission'].gt(98)]

# Show just the age column of those rows
age_above_98[['age_at_admission']]

Unnamed: 0,age_at_admission
2,102
22,99
526,101
644,101
751,101
798,99
815,100
905,99
921,101
977,102


In [None]:
# Keep only the rows whose age is 98 or below
synthetic_df = synthetic_df.loc[synthetic_df['age_at_admission'].le(98)]

# The maximum remaining age should now be at most 98
synthetic_df['age_at_admission'].max()

98

In [None]:
# Show (rows, columns) after removing ages above 98
synthetic_df.shape

(990, 31)

In [None]:
# Inspect the distinct length-of-stay labels in the synthetic data
synthetic_df['length_of_stay_label'].unique()

array(['long stay of 13 days', 'medium stay of 6 days',
       'very long stay of 124 days', 'very long stay of 40 days',
       'long stay of 17 days', 'long stay of 10 days',
       'short stay of 1 days', 'short stay of 2 days',
       'long stay of 14 days', 'long stay of 9 days',
       'medium stay of 4 days', 'long stay of 25 days',
       'long stay of 20 days', 'long stay of 12 days',
       'long stay of 11 days', 'medium stay of 5 days',
       'long stay of 22 days', 'very long stay of 36 days',
       'short stay of 3 days', 'long stay of 8 days',
       'medium stay of 7 days'], dtype=object)

In [None]:
# Select rows whose length-of-stay label contains either of the two extreme stays
# (case-insensitive regex alternation; missing labels count as non-matches)
stay_40_or_124 = synthetic_df[
    synthetic_df['length_of_stay_label'].str.contains(
        'very long stay of 40 days|very long stay of 124 days',
        case=False,
        na=False,
    )
]

# Number of matching rows
row_count = len(stay_40_or_124)

# Display the count
row_count


119

In [None]:
# Discard rows whose length-of-stay label contains "very long stay of 40 days" or
# "very long stay of 124 days" (case-insensitive; missing labels are kept)
synthetic_df = synthetic_df.loc[
    ~synthetic_df['length_of_stay_label'].str.contains(
        'very long stay of 40 days|very long stay of 124 days',
        case=False,
        na=False,
    )
]

In [None]:
# Preview up to the first 50 rows remaining after the filters above
synthetic_df.head(50)

Unnamed: 0,subject_id,hadm_id,gender,marital_status,insurance,admission_type,admission_location,discharge_location,age_at_admission,readmitted_within_30_days,...,vital_mean_224690,vital_last_220045,vital_last_220179,vital_last_223762,vital_last_224690,diagnosis,prior_admissions,length_of_stay_label,medication_diversity_label,emergency_duration_label
0,1,100000,male,unknown,medicare,emergency,physician referral / normal delivery,rehabilitation or distinct part hospital,69,yes,...,18.31,78.58,109.83,36.29,20.19,brain metastasis; metastatic melanoma,1,long stay of 13 days,low medication diversity of 16 unique medications,short duration of 3.05 hours
1,2,100001,male,widowed,medicaid,urgent,physician referral / normal delivery,home,77,yes,...,18.1,76.15,108.33,36.29,19.11,pyelonephritis; urinary tract infection,1,medium stay of 6 days,moderate medication diversity of 36 unique med...,short duration of 3.92 hours
5,6,100005,others,single,medicaid,elective,transfer from skilled nursing facility,long-term care hospital,88,no,...,21.91,78.04,135.77,36.3,18.96,hypoglycemia,4,long stay of 17 days,high medication diversity of 54 unique medicat...,moderate duration of 4.22 hours
6,7,100006,male,divorced,private,elective,transfer from skilled nursing facility,home with IV services,80,no,...,20.24,76.02,129.77,36.29,18.91,heart failure,1,long stay of 10 days,moderate medication diversity of 40 unique med...,moderate duration of 4.18 hours
7,8,100007,male,married,medicaid,elective,emergency room,home,68,no,...,19.44,74.96,130.05,36.3,17.05,pneumonia,0,medium stay of 6 days,low medication diversity of 20 unique medications,moderate duration of 5.40 hours
8,9,100008,female,widowed,private,urgent,emergency room,long-term care hospital,77,no,...,17.08,85.03,127.33,36.31,19.07,hypoglycemia; pneumonia; syncope,2,short stay of 1 days,low medication diversity of 19 unique medications,short duration of 3.28 hours
9,10,100009,others,separated,medicare,elective,transfer from skilled nursing facility,home,55,no,...,28.16,90.37,119.66,36.3,17.3,asthma; copd flare,0,short stay of 2 days,low medication diversity of 18 unique medications,short duration of 1.43 hours
11,12,100011,female,widowed,private,urgent,physician referral / normal delivery,transferred to psychiatric hospital,88,yes,...,20.34,83.61,108.9,36.31,16.85,unstable angina,3,long stay of 14 days,low medication diversity of 29 unique medications,moderate duration of 4.72 hours
12,13,100012,male,divorced,private,urgent,emergency room,hospital care at home,53,no,...,21.83,80.85,140.6,36.28,17.31,failure to thrive,1,long stay of 9 days,low medication diversity of 28 unique medications,moderate duration of 5.73 hours
13,14,100013,male,widowed,medicare,emergency,transfer from another hospital or external fac...,long-term care hospital,53,yes,...,18.4,77.67,111.51,36.29,16.11,pericardial effusion,4,medium stay of 4 days,moderate medication diversity of 32 unique med...,moderate duration of 4.23 hours


In [None]:
# Count rows that are exact repeats of an earlier row in the filtered synthetic data
duplicate_rows = synthetic_df.duplicated(keep='first').sum()
print("Number of duplicate rows:", duplicate_rows)

Number of duplicate rows: 0


In [None]:
# Tally how many synthetic rows carry each 'readmitted_within_30_days' label
readmission_counts = synthetic_df.readmitted_within_30_days.value_counts()
print(readmission_counts)


readmitted_within_30_days
no     612
yes    259
Name: count, dtype: int64


In [None]:
# Separate majority ('no') and minority ('yes') classes
majority_class = synthetic_df[synthetic_df['readmitted_within_30_days'] == 'no']
minority_class = synthetic_df[synthetic_df['readmitted_within_30_days'] == 'yes']

# Oversample the minority class (with replacement) up to the size of the majority class
oversampled_minority = minority_class.sample(len(majority_class), replace=True, random_state=42).copy()

# Jitter only the floating-point measurement columns so resampled rows are not exact copies.
# Integer columns (subject_id, hadm_id, age_at_admission, prior_admissions) are left
# untouched: adding noise to them would turn identifiers and counts into fractional values.
noise_rng = np.random.default_rng(42)  # seeded so the jitter is reproducible
numeric_cols = oversampled_minority.select_dtypes(include=np.floating).columns
for col in numeric_cols:
    oversampled_minority[col] += noise_rng.normal(0, 0.01, len(oversampled_minority))  # Adjust noise level as needed

# Combine the oversampled minority with the majority class
balance = pd.concat([majority_class, oversampled_minority], axis=0).reset_index(drop=True)

# Verify the new class distribution
print("New class distribution:")
print(balance['readmitted_within_30_days'].value_counts())


New class distribution:
readmitted_within_30_days
no     612
yes    612
Name: count, dtype: int64


In [None]:
# Count rows in the balanced frame that are exact repeats of an earlier row
duplicate_rows = balance.duplicated(keep='first').sum()
print("Number of duplicate rows:", duplicate_rows)

Number of duplicate rows: 0


In [None]:
# Show (rows, columns) of the balanced frame
balance.shape

(1224, 31)

In [None]:
  # Save the balanced synthetic dataset (the `balance` frame) to CSV without the index column
balance.to_csv('clean_synthetic_balance_data.csv', index=False)