In [1]:
import pandas as pd
import numpy as np
from io import StringIO
Train = pd.read_csv("train.csv")
Test = pd.read_csv("test.csv")

In [2]:
# fill null values with median for temp_apache
Train['temp_apache'].fillna(Train['temp_apache'].median(), inplace=True)

# fill null values with median for d1_potassium_max
Train['d1_potassium_max'].fillna(Train['d1_potassium_max'].median(), inplace=True)

# fill null values with median for apache_4a_hospital_death_prob
Train['apache_4a_hospital_death_prob'].fillna(Train['apache_4a_hospital_death_prob'].median(), inplace=True)

# fill null values with median for apache_4a_icu_death_prob
Train['apache_4a_icu_death_prob'].fillna(Train['apache_4a_icu_death_prob'].median(), inplace=True)
#since all these r heavily skewed andaffected by the outliers we will fill them using median imputation

Test['temp_apache'].fillna(Test['temp_apache'].median(), inplace=True)

# fill null values with median for d1_potassium_max
Test['d1_potassium_max'].fillna(Test['d1_potassium_max'].median(), inplace=True)

# fill null values with median for apache_4a_hospital_death_prob
Test['apache_4a_hospital_death_prob'].fillna(Test['apache_4a_hospital_death_prob'].median(), inplace=True)

# fill null values with median for apache_4a_icu_death_prob
Test['apache_4a_icu_death_prob'].fillna(Test['apache_4a_icu_death_prob'].median(), inplace=True)

# group the dataframe by apache_2_bodysystem and calculate the mean age for each group
mean_age_by_bodysystem = Train.groupby('apache_2_bodysystem')['age'].mean()

def fill_age(row):
    if pd.isnull(row['age']):
        if pd.isnull(row['apache_2_bodysystem']):
            return np.nan
        else:
            return mean_age_by_bodysystem[row['apache_2_bodysystem']]
    else:
        return row['age']

# apply the function to each row of the dataframe and fill the missing age values with the corresponding mean age
Train['age'] = Train.apply(fill_age, axis=1)

# group the dataframe by apache_2_bodysystem and calculate the mean age for each group
mean_age_by_bodysystem = Test.groupby('apache_2_bodysystem')['age'].mean()

# define a function that takes a row of the dataframe as input and returns the mean age of the corresponding apache_2_bodysystem
def fill_age(row):
    if pd.isnull(row['age']):
        if pd.isnull(row['apache_2_bodysystem']):
            return np.nan
        else:
            return mean_age_by_bodysystem[row['apache_2_bodysystem']]
    else:
        return row['age']

# apply the function to each row of the dataframe and fill the missing age values with the corresponding mean age
Test['age'] = Test.apply(fill_age, axis=1)

#for all binary columns we will apply mode imputation for missing values
#first we will create a list of all binary columns
binary_colsTest = ['elective_surgery', 'apache_post_operative', 'gcs_unable_apache', 'intubated_apache', 'ventilated_apache','immunosuppression', 'solid_tumor_with_metastasis']

binary_colsTrain = ['elective_surgery', 'apache_post_operative', 'gcs_unable_apache', 'intubated_apache', 'ventilated_apache','immunosuppression', 'solid_tumor_with_metastasis','hospital_death']
#now we will apply mode imputation on these columns
from sklearn.impute import SimpleImputer
binary_colsTest = [col for col in Train.columns if Train[col].dtype == 'object' or col in binary_colsTest]
binary_colsTrain = [col for col in Test.columns if Test[col].dtype == 'object' or col in binary_colsTrain]

imputer = SimpleImputer(strategy='most_frequent')
Train[binary_colsTrain] = imputer.fit_transform(Train[binary_colsTrain])
Test[binary_colsTest] = imputer.fit_transform(Test[binary_colsTest])

numeric_cols = [col for col in Train.select_dtypes(include=[np.number]).columns if col not in binary_colsTrain]
numeric_colsTest = [col for col in Test.select_dtypes(include=[np.number]).columns if col not in binary_colsTrain]

from sklearn.impute import KNNImputer

# create an instance of KNNImputer with k=5
imputer = KNNImputer(n_neighbors=5)

# fill missing values in Train dataframe
Train[numeric_cols] = imputer.fit_transform(Train[numeric_cols])

# fill missing values in Test dataframe
Test[numeric_colsTest] = imputer.fit_transform(Test[numeric_colsTest])

In [8]:
import pandas as pd
# Calculate correlations with the target variable
correlations = Train.corrwith(Train['hospital_death'])

# Calculate weights by normalizing the correlations
weights = correlations.abs() / correlations.abs().sum()

# Store the weights in a dictionary
column_weights = dict(zip(correlations.index, weights))


  correlations = Train.corrwith(Train['hospital_death'])


In [11]:
# ... (previous code to calculate weights and load test_data)

# Create an empty list to store the predicted target values
predicted_targets = []

# Ensure that the values in the weights dictionary are numeric
column_weights = {column: float(weight) for column, weight in column_weights.items()}

# Calculate the maximum possible match score
max_match_score = sum(column_weights.values())  # Sum the numeric weights

for index, test_record in Test.iterrows():
    # Initialize scores for target values 0 and 1
    score_0 = 0
    score_1 = 0

    for column, weight in column_weights.items():
        if column != 'hospital_death':  # Skip the target variable column
            # Compare the test record's value with the training dataset value
            train_value = Train[column].values[0]  # Assuming it's the same for all train records

            if test_record[column] == train_value:
                # Calculate the score based on the correlation weight
                if test_record[column] == 0:
                    score_0 += weight
                elif test_record[column] == 1:
                    score_1 += weight

    # Check if both scores are zero to avoid ZeroDivisionError
    if score_0 + score_1 == 0:
        probability_1 = 0.5  # You can set a default value if needed
    else:
        # Calculate the match percent by dividing by the maximum possible match score
        match_percent = (score_0 + score_1) / max_match_score

        # Calculate the probability of hospital_death being 1
        probability_1 = score_1 / (score_0 + score_1)

    # Append the predicted probability to the list
    predicted_targets.append(probability_1 * match_percent)

# Add the predicted probabilities as a new column to the test dataset
Test['predicted_hospital_death_probability'] = predicted_targets

# Save the updated test dataset with predicted probabilities
Test.to_csv('test_with_probabilities.csv', index=False)


In [12]:
import pandas as pd

# Load the first CSV file with exact values
exact_values_df = pd.read_csv('test_with_probabilities.csv')

# Load the second CSV file with values between 0 and 1
values_between_0_and_1_df = pd.read_csv('ex.csv')

# Iterate through the rows of the first DataFrame (exact values)
for index, row in exact_values_df.iterrows():
    row_id = row['RecordID']
    target_value = row['hospital_death']

    # Check if the target value is 0 or 1
    if target_value in [0, 1]:
        # Update the corresponding row in the second DataFrame
        values_between_0_and_1_df.loc[values_between_0_and_1_df['RecordID'] == row_id, 'hospital_death'] = target_value

# Save the updated DataFrame back to 'values_between_0_and_1.csv' or a new file
values_between_0_and_1_df.to_csv('updated_values_between_0_and_1.csv', index=False)


In [None]:
# create correlation matrix
corr_matrix = Train.corr()

# Get the upper triangle of the correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find the highest correlation value and its corresponding columns
highest_corr = upper.stack().sort_values(ascending=False)

# Find all correlations with a value greater than 0.6
high_corr = highest_corr[highest_corr > 0.9]

# Print the highest correlation value and its corresponding columns
print("Highest correlation value and its corresponding columns:")
print(high_corr)




  corr_matrix = Train.corr()


Highest correlation value and its corresponding columns:
d1_diasbp_min  d1_diasbp_noninvasive_min    0.998628
d1_sysbp_min   d1_sysbp_noninvasive_min     0.998478
d1_mbp_min     d1_mbp_noninvasive_min       0.996245
h1_sysbp_max   h1_sysbp_noninvasive_max     0.995956
h1_mbp_min     h1_mbp_noninvasive_min       0.994217
h1_sysbp_min   h1_sysbp_noninvasive_min     0.986981
h1_mbp_max     h1_mbp_noninvasive_max       0.982090
h1_diasbp_min  h1_diasbp_noninvasive_min    0.979373
dtype: float64


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [None]:
# select the 'hospital_death' column and sort the correlations in descending order
corr_with_hospital_death = corr_matrix['hospital_death'].sort_values(ascending=False)

# print the correlations
print(corr_with_hospital_death)



hospital_death                   1.000000
apache_4a_hospital_death_prob    0.336631
apache_4a_icu_death_prob         0.311571
d1_heartrate_max                 0.168749
h1_resprate_max                  0.123298
heart_rate_apache                0.116378
h1_heartrate_max                 0.114885
age                              0.111167
d1_resprate_max                  0.109255
d1_potassium_max                 0.105128
h1_resprate_min                  0.104628
resprate_apache                  0.090444
h1_heartrate_min                 0.087088
d1_glucose_max                   0.080182
pre_icu_los_days                 0.076840
RecordID                         0.001134
hospital_id                     -0.001265
icu_id                          -0.001807
h1_spo2_max                     -0.044436
h1_mbp_max                      -0.055958
h1_mbp_noninvasive_max          -0.056115
h1_sysbp_noninvasive_max        -0.061840
h1_sysbp_max                    -0.062315
apache_2_diagnosis              -0

In [None]:
#next we drop simialr record columns
Train = Train.drop('apache_3j_bodysystem', axis=1)
Test = Test.drop('apache_3j_bodysystem', axis=1)

In [None]:
dropcolumns=['d1_diasbp_noninvasive_min','h1_sysbp_max','h1_mbp_max', 'h1_mbp_noninvasive_max', 'h1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min', 'h1_diasbp_noninvasive_min']
Train= Train.drop(dropcolumns, axis=1)

In [None]:
Test= Test.drop(dropcolumns, axis=1)

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve, train_test_split, validation_curve
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
%pip install catboost
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier



In [None]:
binary = ['elective_surgery', 'apache_post_operative', 'gcs_unable_apache', 'intubated_apache', 'ventilated_apache','immunosuppression', 'solid_tumor_with_metastasis']
for cols in binary:
  Train[binary]=Train[binary].astype(int)
  Test[binary]=Test[binary].astype(int)

In [None]:
Train.dtypes

RecordID                         float64
hospital_id                      float64
icu_id                           float64
ethnicity                         object
gender                            object
icu_admit_source                  object
icu_stay_type                     object
icu_type                          object
apache_2_bodysystem               object
age                              float64
elective_surgery                   int64
pre_icu_los_days                 float64
apache_2_diagnosis               float64
apache_3j_diagnosis              float64
apache_post_operative              int64
gcs_eyes_apache                  float64
gcs_motor_apache                 float64
gcs_unable_apache                  int64
gcs_verbal_apache                float64
heart_rate_apache                float64
intubated_apache                   int64
resprate_apache                  float64
temp_apache                      float64
ventilated_apache                  int64
d1_diasbp_min   

In [None]:
onehot= pd.get_dummies(Train)

In [None]:
onehotTest= pd.get_dummies(Test)

In [None]:
# df_onehot.dtypes
X = onehot.loc[:, onehot.columns != 'hospital_death']
y = onehot['hospital_death']
# trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
#record the start time
nb_c = CatBoostClassifier(iterations=650, depth=3, learning_rate=0.1, loss_function='Logloss', verbose=False)
bg_c = BaggingClassifier(base_estimator=nb_c, n_estimators=650)
bg_c.fit(X,y)



In [None]:
y_new_predBAG = bg_c.predict_proba(onehotTest)
hospital_death = y_new_predBAG[:, 1]



In [None]:
predictions_df = pd.DataFrame(hospital_death, columns=['hospital_death'])

# Add the record ID from the test data to the predictions DataFrame
predictions_df.insert(0, 'RecordID', Test['RecordID'])

# Save the predictions to a CSV file
# predictions_df.to_csv('predictionsCAT.csv', index=False)
predictions_df.to_csv('predictions.csv', index=False)

In [None]:
y_new_predBAG = bg_c.predict_proba(onehotTest)
hospital_death = y_new_predBAG[:, 1]



KeyboardInterrupt: ignored

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier


In [None]:
from sklearn.ensemble import ExtraTreesRegressor


In [None]:
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.3) #, random_state=2)

In [None]:
et_clf = ExtraTreesClassifier(n_estimators=850, max_depth=3) #boostrap by default is False
et_clf.fit(trainX,trainy)
md_probs = et_clf.predict_proba(testX)
md_auc = roc_auc_score(testy, md_probs[:,1])
print("Cat Boost" , " : ", md_auc)


Cat Boost  :  0.8459636125945642


In [None]:
md_auc = roc_auc_score(testy, md_probs)
print("Cat Boost" , " : ", md_auc)


Cat Boost  :  0.871386840455091


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score, make_scorer

# Define the hyperparameter space
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [None, 1, 2, 3, 4, 5],
    'bootstrap': [True, False]
}

# Initialize the classifier
et_clf = ExtraTreesClassifier()

# Make a scorer for ROC AUC
roc_scorer = make_scorer(roc_auc_score, needs_proba=True)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(et_clf, param_distributions=param_dist,
                                   n_iter=100, scoring=roc_scorer,
                                   cv=5, n_jobs=-1)

# Fit the model
random_search.fit(trainX, trainy)

# Get the best parameters
best_params = random_search.best_params_

# Fit the model with the best parameters
et_clf_best = ExtraTreesClassifier(**best_params)
et_clf_best.fit(trainX, trainy)

# Predict probabilities and compute ROC AUC
md_probs = et_clf_best.predict_proba(testX)
md_auc = roc_auc_score(testy, md_probs[:,1])

print("Extra Trees Classifier with randomized search : ", md_auc)




Extra Trees Classifier with randomized search :  0.8751080632419637


In [None]:
y_new_predBAG = et_clf_best.predict_proba(onehotTest)
hospital_death = y_new_predBAG[:,1]
