In [2]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Read the modelling dataset from CSV into a DataFrame
# (file name suggests a randomly oversampled day-28 outcome table — confirm with the data owner).
df = pd.read_csv(filepath_or_buffer='day_28_flg_random_oversampled_data.csv')

class ReduceVIF(BaseEstimator, TransformerMixin):
    """Drop features one at a time until every remaining VIF is <= ``thresh``.

    The set of surviving columns is learned in :meth:`fit` and then applied
    unchanged in :meth:`transform`, so train and test data always end up with
    the same columns. ``fit_transform`` on a single frame gives the same
    columns and printed log as before.

    Parameters
    ----------
    thresh : float, default=10.0
        A feature is dropped while the largest VIF is strictly greater than
        this value.
    impute : bool, default=True
        If True, missing values are filled with a ``SimpleImputer`` before the
        VIFs are computed (and again in ``transform``).
    impute_strategy : str, default='median'
        Strategy passed to ``SimpleImputer``.

    Attributes
    ----------
    imputer_ : SimpleImputer
        Fitted imputer; only set when ``impute`` is True.
    kept_columns_ : list
        Column labels that survived the VIF elimination during ``fit``.
    """

    def __init__(self, thresh=10.0, impute=True, impute_strategy='median'):
        # Store every constructor argument verbatim under its own name:
        # BaseEstimator.get_params / clone / repr read them back via getattr.
        self.thresh = thresh
        self.impute = impute
        self.impute_strategy = impute_strategy

    def fit(self, X, y=None):
        """Fit the optional imputer and learn which columns of ``X`` to keep.

        ``X`` must be a DataFrame (its ``columns`` and ``index`` are used).
        ``y`` is ignored. Returns ``self``.
        """
        if self.impute:
            # Built here rather than in __init__ so set_params() changes to
            # impute_strategy are honoured on the next fit.
            self.imputer_ = SimpleImputer(strategy=self.impute_strategy).fit(X)
        reduced = ReduceVIF.calculate_vif(self._impute(X), self.thresh)
        self.kept_columns_ = reduced.columns.tolist()
        return self

    def transform(self, X, y=None):
        """Impute ``X`` (if enabled) and return only the columns kept in ``fit``.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called.
        KeyError
            If ``X`` lacks one of the columns selected during ``fit``.
        """
        if not hasattr(self, 'kept_columns_'):
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This ReduceVIF instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        return self._impute(X)[self.kept_columns_]

    def _impute(self, X):
        """Return ``X`` with missing values filled, keeping labels and row index."""
        if not self.impute:
            return X
        # Keep the caller's index so the result stays aligned with y / other frames.
        return pd.DataFrame(self.imputer_.transform(X), columns=X.columns, index=X.index)

    @staticmethod
    def calculate_vif(X, thresh=10.0):
        """Iteratively drop the column with the largest VIF while it exceeds ``thresh``.

        Prints one line per dropped column and returns the reduced DataFrame.
        Frames with fewer than two columns are returned as-is: with no other
        regressor there is nothing to be collinear with, and ``max`` over an
        empty VIF list would raise.

        NOTE(review): the VIFs are computed on ``X`` exactly as passed, with no
        intercept column appended. statsmodels users commonly add a constant to
        the design matrix before calling ``variance_inflation_factor``;
        confirm whether these uncentered VIFs are what the analysis wants.
        """
        while X.shape[1] > 1:
            design = X.values  # extracted once per pass instead of once per column
            vif = [variance_inflation_factor(design, col_idx)
                   for col_idx in range(design.shape[1])]

            max_vif = max(vif)
            if not max_vif > thresh:
                break

            maxloc = vif.index(max_vif)
            print(f'Dropping {X.columns[maxloc]} with vif={max_vif}')
            X = X.drop(columns=[X.columns[maxloc]])
        return X

# `df` is already loaded at the top of this cell, so the CSV is not read a second time.

# Separate the predictors from the target variable
X = df.drop(columns=['day_28_flg'])

# Every column must be numeric for the median imputer and the VIF regression.
# If X contains categorical columns, one-hot encode them first, e.g.
# pd.get_dummies(X, drop_first=True).

# Initialize and apply ReduceVIF (features are dropped while the largest VIF exceeds 10)
vif_reducer = ReduceVIF(thresh=10.0)
X_reduced = vif_reducer.fit_transform(X)

# Display the remaining features
print("Remaining features after reducing multicollinearity:")
print(X_reduced.columns)

Dropping sodium_first with vif=1827.557449030976
Dropping temp_1st with vif=378.67512896249275
Dropping spo2_1st with vif=202.63868135913725
Dropping chloride_first with vif=93.37663128486709
Dropping weight_first with vif=52.527956219363055
Dropping hgb_first with vif=36.69208053372291
Dropping tco2_first with vif=30.711784627528647
Dropping potassium_first with vif=26.162793552248917
Dropping map_1st with vif=24.651538222956862
Dropping sapsi_first with vif=21.35633830011284
Dropping hr_1st with vif=16.79645613380842
Dropping age with vif=13.205050902168868
Dropping bmi with vif=12.310023100345578
Remaining features after reducing multicollinearity:
Index(['aline_flg', 'icu_los_day', 'hospital_los_day', 'gender_num',
       'sofa_first', 'service_num', 'renal_flg', 'liver_flg', 'copd_flg',
       'cad_flg', 'stroke_flg', 'mal_flg', 'resp_flg', 'abg_count',
       'wbc_first', 'platelet_first', 'bun_first', 'creatinine_first',
       'po2_first', 'pco2_first', 'iv_day_1'],
      dtype='object')

In [3]:

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer

def compute_vif_table(features):
    """Return a two-column DataFrame ('Feature', 'VIF') for the columns of ``features``.

    ``features`` must be an all-numeric DataFrame without missing values.
    The VIFs are computed on the matrix exactly as given (no intercept is added).
    """
    design = features.values
    return pd.DataFrame({
        'Feature': features.columns,
        'VIF': [variance_inflation_factor(design, col_idx)
                for col_idx in range(design.shape[1])],
    })


# Load your dataset
df = pd.read_csv('day_28_flg_random_oversampled_data.csv')

# Candidate predictors: the columns retained by the automatic VIF reduction, plus 'age'
X = df[['sofa_first', 'age', 'stroke_flg', 'aline_flg', 'icu_los_day', 'hospital_los_day', 'gender_num',
        'service_num', 'renal_flg', 'liver_flg', 'copd_flg', 'cad_flg', 'mal_flg', 'resp_flg', 'abg_count',
        'wbc_first', 'platelet_first', 'bun_first', 'creatinine_first', 'po2_first', 'pco2_first', 'iv_day_1']]

# Impute missing values with the column median; keep the row index aligned with `df`
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# Calculate VIF for each feature
vif_data = compute_vif_table(X_imputed)

# Identify columns with 'inf' VIF (indicating perfect multicollinearity)
columns_to_drop = vif_data.loc[vif_data['VIF'] == float('inf'), 'Feature'].tolist()

# Drop the perfectly collinear columns (inf VIFs)
X_reduced = X_imputed.drop(columns=columns_to_drop)

# Recalculate the VIF for the remaining features
vif_data_reduced = compute_vif_table(X_reduced)

# Display the new VIF results after removing collinear variables
print("\nVIF Results:")
print(vif_data_reduced)


VIF Results:
             Feature        VIF
0         sofa_first  10.632154
1                age  13.008582
2         stroke_flg   1.895595
3          aline_flg   3.551983
4        icu_los_day   6.140994
5   hospital_los_day   3.022562
6         gender_num   2.352393
7        service_num   3.564287
8          renal_flg   1.356933
9          liver_flg   1.277833
10          copd_flg   1.303004
11           cad_flg   1.319335
12           mal_flg   1.328079
13          resp_flg   2.242273
14         abg_count   4.362313
15         wbc_first   4.595808
16    platelet_first   6.944487
17         bun_first   5.437033
18  creatinine_first   4.015746
19         po2_first   3.625915
20        pco2_first   8.333608
21          iv_day_1   2.067725


In [None]:
# Isolate the two predictors under suspicion: SOFA score and age
Y = df.loc[:, ['sofa_first', 'age']]

# Build the VIF table in one step: one row per column of Y.
# NOTE(review): no intercept column is included in Y, so these are uncentered VIFs — confirm that is intended.
vif_data = pd.DataFrame({
    'Feature': Y.columns,
    'VIF': [variance_inflation_factor(Y.values, col_idx) for col_idx in range(len(Y.columns))],
})

print(vif_data)

      Feature       VIF
0  sofa_first  5.059464
1         age  5.059464


In [None]:
# Remove 'age' from the imputed predictor matrix
X_reduced = X_imputed.drop('age', axis=1)

# Recompute the VIF of every remaining predictor now that 'age' is gone
vif_data_reduced = pd.DataFrame({
    'Feature': X_reduced.columns,
    'VIF': [variance_inflation_factor(X_reduced.values, col_idx)
            for col_idx in range(len(X_reduced.columns))],
})

print("\nVIF Results After Dropping 'Age':")
print(vif_data_reduced)



VIF Results After Dropping 'Age':
             Feature       VIF
0         sofa_first  9.829621
1         stroke_flg  1.680399
2          aline_flg  3.543908
3        icu_los_day  6.072475
4   hospital_los_day  3.019758
5         gender_num  2.313016
6        service_num  3.427984
7          renal_flg  1.356477
8          liver_flg  1.273617
9           copd_flg  1.281705
10           cad_flg  1.274513
11           mal_flg  1.315228
12          resp_flg  2.215412
13         abg_count  4.289382
14         wbc_first  4.594502
15    platelet_first  6.710068
16         bun_first  4.814494
17  creatinine_first  3.951118
18         po2_first  3.518991
19        pco2_first  7.591286
20          iv_day_1  2.059523
