In [5]:
import os
import pandas as pd
import numpy as np
import seaborn as sns                       #visualisation
import matplotlib.pyplot as plt             #visualisation
%matplotlib inline     
sns.set(color_codes=True)

In [6]:
# TODO: change this to yours ...
DATA_DIR = "/Users/kwongtszkong/Desktop/STAT3612 Statistical ML/Group Project/Stat3612_Project_datasource"

# read 3 csv files and 1 pkl file
train_csv_file = os.path.join(DATA_DIR, "train.csv")
val_csv_file = os.path.join(DATA_DIR, "valid.csv")
test_csv_file = os.path.join(DATA_DIR, "test.csv")
ehr_pkl_file = os.path.join(DATA_DIR, "ehr_preprocessed_seq_by_day_cat_embedding.pkl")

train_df = pd.read_csv(train_csv_file)
val_df = pd.read_csv(val_csv_file)
test_df = pd.read_csv(test_csv_file)

with open(ehr_pkl_file, 'rb') as f:
    ehr_data = pd.read_pickle(f)

Initialize the X_train with mapping features

In [None]:

# Extract EHR features and IDs
ehr_features = ehr_data["feat_dict"]
ehr_ids = list(ehr_features.keys())

# Convert to DataFrame (each row is an admission)
ehr_df = pd.DataFrame({
    "id": ehr_ids,
    "ehr_matrix": [ehr_features[id] for id in ehr_ids]
})

# Merge with labels from train/val/test DataFrames
def merge_labels(df, ehr_df):
    return df[["id", "readmitted_within_30days"]].merge(ehr_df, on="id", how="inner")

train_ehr = merge_labels(train_df, ehr_df)
val_ehr = merge_labels(val_df, ehr_df)

def aggregate_ehr(df):
    # Aggregate features (example: mean over days)
    df["ehr_mean"] = df["ehr_matrix"].apply(lambda x: np.mean(x, axis=0))
    
    # Convert to numpy arrays
    X = np.stack(df["ehr_mean"].values)
    y = df["readmitted_within_30days"].astype(int).values
    
    return X, y

X_train, y_train = aggregate_ehr(train_ehr)
X_val, y_val = aggregate_ehr(val_ehr)
print(X_train.shape)
print(X_train)



(49451, 171)
[[52.  1.  6. ...  0.  0.  0.]
 [52.  1.  6. ...  0.  0.  0.]
 [52.  1.  6. ...  0.  0.  0.]
 ...
 [91.  1.  6. ...  0.  0.  1.]
 [69.  0.  6. ...  0.  0.  0.]
 [69.  0.  6. ...  0.  0.  0.]]
[0 0 0 ... 0 0 0]


Remove the non-informative features

In [8]:
from scipy.stats import pointbiserialr 

# Assuming X_train is your feature matrix (n_samples × n_features)
std_devs = np.std(X_train, axis=0)

# Get feature names from ehr_data (adjust if your structure differs)
feature_names = ehr_data["feature_cols"]  # Or ehr_data["feature_cols"] if available

# Create a DataFrame for analysis
std_df = pd.DataFrame({
    "Feature": feature_names,
    "Std_Dev": std_devs
})




# Initialize storage
correlations = []
p_values = []

# Calculate correlation for each feature
for i in range(X_train.shape[1]):
    if np.std(X_train[:, i]) == 0:  # Skip constant features
        correlations.append(0)
        p_values.append(1)
    else:
        corr, pval = pointbiserialr(X_train[:, i], y_train)
        correlations.append(corr)
        p_values.append(pval)

# Create results DataFrame
corr_df = pd.DataFrame({
    "Feature": feature_names,
    "Correlation": correlations,
    "P-value": p_values
})

# Sort by absolute correlation strength
corr_df["Abs_Correlation"] = np.abs(corr_df["Correlation"])
corr_df = corr_df.sort_values("Abs_Correlation", ascending=True)
print(corr_df[["Feature", "Correlation", "P-value"]].head(20))  # Top 20 features


                                               Feature  Correlation   P-value
44                                             M50-M54     0.000000  1.000000
80                                             K40-K46     0.000000  1.000000
39                                             A20-A28     0.000000  1.000000
17                                             N00-N08     0.000000  1.000000
34                                             E70-E88     0.000000  1.000000
72                                             J00-J06     0.000000  1.000000
111                              Basophils Joint Fluid     0.000000  1.000000
50                                             N25-N29     0.000000  1.000000
49                                             J30-J39     0.000000  1.000000
93                                             R90-R94     0.000000  1.000000
51                                             Q65-Q79     0.000000  1.000000
5                                              O85-O92     0.000

In [9]:

# Function to analyze zero-value ratio in features
def analyze_zeros(ehr_data, feature_cols, threshold=0.5):
    """Identify features where >50% zeros might indicate missingness."""
    all_zeros = []
    for feature_name, feature_idx in zip(feature_cols, range(len(feature_cols))):
        zeros_count = np.sum([np.sum(matrix[:, feature_idx] == 0) for matrix in ehr_data["feat_dict"].values()])
        total_values = sum([matrix.shape[0] for matrix in ehr_data["feat_dict"].values()])
        zero_ratio = zeros_count / total_values
        if zero_ratio > threshold:
            all_zeros.append((feature_name, zero_ratio))
    return pd.DataFrame(all_zeros, columns=["Feature", "Zero_Ratio"])

# Analyze zero-value ratio for each feature category
demo_zero_df = analyze_zeros(ehr_data, ehr_data["demo_cols"])
icd_zero_df = analyze_zeros(ehr_data, ehr_data["icd_cols"])
lab_zero_df = analyze_zeros(ehr_data, ehr_data["lab_cols"])
med_zero_df = analyze_zeros(ehr_data, ehr_data["med_cols"])

# Combine all zero-value data
zero_df = pd.concat([demo_zero_df, icd_zero_df, lab_zero_df, med_zero_df])

# Merge zero-value data with std and correlation data
combined_df = pd.merge(pd.merge(std_df, corr_df, on='Feature'), zero_df, on='Feature', how='left')



clinically_relevant = [
    'E70-E88', 'N00-N08', 'N17-N19', 'I30-I52', 'J40-J47',
    'A00-A09', 'J20-J22', 'Basophils Blood', 'Eosinophils Blood',
    'pH Urine', 'ANTIINFLAM.TUMOR NECROSIS FACTOR INHIBITING AGENTS',
    'ANTIPARASITICS'
]


  zero_df = pd.concat([demo_zero_df, icd_zero_df, lab_zero_df, med_zero_df])


In [None]:

thresholds = {
    'zero_ratio': {
        'icd': 0.95,    # ICD codes often sparse
        'med': 0.95,    # Medications often sparse
        'lab': 0.95,    # Labs should rarely be zero
        'demo': 0.9     # Demographics rarely zero
    },
    'std_dev': 0.05,    # Only for continuous features
    'correlation': {
        'min_abs_corr': 0.05,
        'max_pvalue': 0.05
    }
}

# Categorize features
feature_types = {
    'icd': ehr_data["icd_cols"],
    'med': ehr_data["med_cols"],
    'lab': ehr_data["lab_cols"],
    'demo': ehr_data["demo_cols"]
}

# Initialize storage
features_to_remove = []

# Check each feature type separately
for ftype, cols in feature_types.items():
    for feature in cols:
        row = combined_df[combined_df['Feature'] == feature].iloc[0]
        
        # Skip binary features for variance check
        check_variance = ftype not in ['icd']
        
        # Apply type-specific rules
        if (row['Zero_Ratio'] > thresholds['zero_ratio'][ftype]) and \
           (not check_variance or row['Std_Dev'] < thresholds['std_dev']) and \
           (np.abs(row['Correlation']) < thresholds['correlation']['min_abs_corr']) and \
           (row['P-value'] > thresholds['correlation']['max_pvalue']):
            features_to_remove.append(feature)

    
filtered_df = combined_df[~combined_df['Feature'].isin(clinically_relevant)]
# Generate report
removal_df = filtered_df[filtered_df['Feature'].isin(features_to_remove)].sort_values(
    by=['Zero_Ratio', 'Std_Dev', 'Abs_Correlation'],
    ascending=[False, True, True]
)

print(f"\nRecommended features to remove ({len(removal_df)} total):")
print(removal_df[['Feature', 'Zero_Ratio', 'Std_Dev', 'Correlation', 'P-value']])


Recommended features to remove (40 total):
                   Feature  Zero_Ratio   Std_Dev  Correlation   P-value
54                 K65-K68    0.999950  0.015576    -0.008536  0.057676
24                 R40-R46    0.999950  0.102142    -0.003603  0.423045
8                  M80-M85    0.999909  0.007789    -0.004268  0.342633
29                 B25-B34    0.999877  0.029816     0.004573  0.309156
26                 N20-N23    0.999863  0.008993    -0.004928  0.273169
73                 N60-N65    0.999831  0.025824     0.006283  0.162356
47                 G89-G99    0.999813  0.182042     0.007699  0.086893
44                 M50-M54    0.999786  0.000000     0.000000  1.000000
111  Basophils Joint Fluid    0.999758  0.000000     0.000000  1.000000
147         CONTRACEPTIVES    0.999758  0.010055    -0.005509  0.220519
57                 B65-B83    0.999735  0.006359    -0.003484  0.438443
80                 K40-K46    0.999667  0.000000     0.000000  1.000000
93                 R

In [14]:
# 1. Get indices of features to remove
features_to_remove = removal_df['Feature'].tolist()
all_features = ehr_data["feature_cols"]
remove_indices = [i for i, feature in enumerate(all_features) 
                 if feature in features_to_remove]

# 2. Function to remove features
def remove_features(X, remove_indices):
    return np.delete(X, remove_indices, axis=1)

# 3. Apply to both training and validation sets
X_train_filtered = remove_features(X_train, remove_indices)
X_val_filtered = remove_features(X_val, remove_indices)

# 4. Get remaining feature names
remaining_features = [f for i, f in enumerate(all_features) 
                     if i not in remove_indices]

# 5. Verification
print(f"Removed {len(remove_indices)} features.")
print(f"Original shape: {X_train.shape} -> New shape: {X_train_filtered.shape}")
print("\nSample removed features:", features_to_remove[:5])
print("\nSample remaining features:", remaining_features[:5])

Removed 40 features.
Original shape: (49451, 171) -> New shape: (49451, 131)

Sample removed features: ['K65-K68', 'R40-R46', 'M80-M85', 'B25-B34', 'N20-N23']

Sample remaining features: ['age', 'gender', 'ethnicity', 'Y90-Y99', 'G30-G32']
