In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
filename = 'cleaned_df.csv'

df = pd.read_csv(filename)

df = df.drop(columns='Unnamed: 0', axis=1)

df = df.rename(columns={'charges': 'label'})

#move label to the first position
first_column = df.pop('label') 
df.insert(0, 'label', first_column) 

print('Shape', df.shape)
df.head()

Shape (8932, 50)


Unnamed: 0,label,age,death,sex,hospdead,slos,d.time,num.co,edu,scoma,...,dzgroup_Lung Cancer,dzgroup_MOSF w/Malig,ca_metastatic,ca_no,ca_yes,income_$11-$25k,income_$25-$50k,income_>$50k,income_under $11k,income_unknown
0,9715.0,62.84998,0,0,0,5,2029,0,11.0,0.0,...,1,0,1,0,0,1,0,0,0,0
1,34496.0,60.33899,1,1,1,4,4,2,12.0,44.0,...,0,0,0,1,0,1,0,0,0,0
2,41094.0,52.74698,1,1,0,17,47,2,12.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,3075.0,42.38498,1,1,0,3,133,2,11.0,0.0,...,1,0,1,0,0,0,0,0,1,0
4,50127.0,79.88495,0,1,0,16,2029,1,11.744707,26.0,...,0,0,0,1,0,0,0,0,0,1


Figuring out what features to use for predicting in two steps:
1. Using correlation
2. Using Random Forest Classifier

In [20]:

# Calculate the correlation with 'label'
correlation_with_label = df.corrwith(df['label']).to_frame()
correlation_with_label.columns = ['Correlation with label']

# Drop the 'label' column from the correlation results
correlation_with_label = correlation_with_label.drop(labels=['label'])

# Sort the DataFrame by the 'Correlation with label' column in descending order
top_corr_features = correlation_with_label.sort_values(by='Correlation with label', ascending=False).head(15)

print(top_corr_features)


                           Correlation with label
slos                                     0.641425
dnrday                                   0.620256
hday                                     0.476984
avtisst                                  0.449089
dzgroup_ARF/MOSF w/Sepsis                0.333557
aps                                      0.311925
sps                                      0.264400
hospdead                                 0.187240
bili                                     0.185234
scoma                                    0.137049
hrt                                      0.129735
temp                                     0.127782
ca_no                                    0.114627
edu                                      0.100148
crea                                     0.090835


In [10]:
"""Using Random forest regressor"""

from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# Assuming df is your DataFrame and 'label' is your target variable
X = df.drop('label', axis=1)  # Features
y = df['label']               # Target

# Initialize the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to your data
rf_regressor.fit(X, y)

# Get feature importances
feature_importances = rf_regressor.feature_importances_

# Create a Series for feature importances
rf_features = pd.Series(feature_importances, index=X.columns).sort_values(ascending=False)

# Select the top 15 features
top_rf_features = rf_features.head(15)

# Print the feature importances
print("Top 15 features according to Random Forest Regressor:")
print(top_rf_features)


Top 15 features according to Random Forest Regressor:
slos       0.379858
hday       0.157060
avtisst    0.066628
dnrday     0.047744
bili       0.037784
bun        0.024863
sps        0.024334
scoma      0.023022
aps        0.020364
age        0.019601
ph         0.016606
urine      0.016107
sod        0.015497
wblc       0.014276
d.time     0.012362
dtype: float64


In [25]:
#features mixed from two methods correlation check and random forest

rf_feature_list_regression = top_rf_features.index.tolist()
corr_feature_list_regression = top_corr_features.index.tolist()

print(rf_feature_list_regression)
print(corr_feature_list_regression)

combined_features_regression = list(set(rf_feature_list_regression) | set(corr_feature_list_regression))  # Union

final_features_regression = combined_features_regression[:15]

print("Set of features for predicting: ", final_features_regression)
print("Set with more features for improving model if needed: ", combined_features_regression)

['slos', 'hday', 'avtisst', 'dnrday', 'bili', 'bun', 'sps', 'scoma', 'aps', 'age', 'ph', 'urine', 'sod', 'wblc', 'd.time']
['slos', 'dnrday', 'hday', 'avtisst', 'dzgroup_ARF/MOSF w/Sepsis', 'aps', 'sps', 'hospdead', 'bili', 'scoma', 'hrt', 'temp', 'ca_no', 'edu', 'crea']
Set of features for predicting:  ['hday', 'dnrday', 'scoma', 'dzgroup_ARF/MOSF w/Sepsis', 'edu', 'bili', 'slos', 'ca_no', 'wblc', 'age', 'crea', 'temp', 'ph', 'hrt', 'hospdead']
Set with more features for improving model if needed:  ['hday', 'dnrday', 'scoma', 'dzgroup_ARF/MOSF w/Sepsis', 'edu', 'bili', 'slos', 'ca_no', 'wblc', 'age', 'crea', 'temp', 'ph', 'hrt', 'hospdead', 'sps', 'avtisst', 'urine', 'aps', 'd.time', 'bun', 'sod']
