In [1]:
pip install catboost


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np

In [2]:

from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score


In [6]:

# Read the data
df_train = pd.read_csv("training.csv")
df_test = pd.read_csv("test.csv")

# Drop patient_id column (identifier, not used as a feature)
df_train = df_train.drop("patient_id", axis=1)
df_test = df_test.drop("patient_id", axis=1)

# Split the columns by dtype; both lists are derived from the training frame
numerical_cols = df_train.select_dtypes(exclude=['object']).columns
categorical_columns = df_train.select_dtypes(include=['object']).columns

# Impute categorical columns using the training-set mode.
# The filled column is assigned back rather than calling
# `fillna(..., inplace=True)` on a column selection: that form is chained
# assignment, which pandas deprecates and which leaves the frame unchanged
# when Copy-on-Write is enabled.
for col in categorical_columns:
    if col != 'DiagPeriodL90D':
        mode = df_train[col].mode()[0]
        df_train[col] = df_train[col].fillna(mode)
        df_test[col] = df_test[col].fillna(mode)

# Impute numerical columns using the training-set median.
# The target column is skipped: it is never imputed and is not looked up in
# the test frame.
for col in numerical_cols:
    if col != 'DiagPeriodL90D':
        median = df_train[col].median()
        df_train[col] = df_train[col].fillna(median)
        df_test[col] = df_test[col].fillna(median)

# Encoding categorical columns
from sklearn.preprocessing import OrdinalEncoder

# Categories that appear only in the test data are encoded as -1.
# The same encoder object is re-fitted for every column, so after the loop it
# only holds the categories of the last column ('patient_zip3').
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
for col in categorical_columns.to_list() + ['patient_zip3']:
    encoder.fit(df_train[[col]])
    df_train[col] = encoder.transform(df_train[[col]])
    df_test[col] = encoder.transform(df_test[[col]])

# Features used by the model
cols = ['breast_cancer_diagnosis_code', 'metastatic_cancer_diagnosis_code', 'patient_zip3', 'patient_age', 'payer_type',
        'patient_state', 'Region', 'Division', 'health_uninsured', 'Ozone']

X_train = df_train[cols]
y_train = df_train['DiagPeriodL90D']

# CatBoost hyper-parameters
params = dict(
    depth=2,  # dont change this
    random_state=42,
    eval_metric='AUC',
    verbose=False,
    loss_function='Logloss',
    learning_rate=0.1,  # 0.815 was obtained at 0.3, to be experimented for further improvement
    iterations=1000,  # dont change this
    l2_leaf_reg=3,
)

# Stratified 5-fold split, shuffled with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

auc_scores = []  # validation AUC of each fold
test_preds = []  # test-set probabilities predicted by each fold's model

# The test features are the same for every fold, so select them once
test_features = df_test[cols]

for fit_idx, val_idx in cv.split(X_train, y_train):
    X_fit, y_fit = X_train.iloc[fit_idx], y_train.iloc[fit_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # A fresh classifier per fold.
    # NOTE(review): the validation fold is passed as eval_set with
    # use_best_model=True and is then also used for scoring below, so the
    # reported AUCs are presumably optimistic - confirm against CatBoost docs.
    model = CatBoostClassifier(**params)
    model.fit(X_fit, y_fit, eval_set=(X_val, y_val), use_best_model=True)

    # Positive-class probabilities for the validation fold and the test set
    val_proba = model.predict_proba(X_val)[:, 1]
    test_preds.append(model.predict_proba(test_features)[:, 1])

    auc_score = roc_auc_score(y_val, val_proba)
    auc_scores.append(auc_score)
    print(f"AUC Score: {auc_score}")

print(f"Average AUC Score: {np.mean(auc_scores)}")

# Final test prediction = mean of the per-fold probabilities.
# (Hard labels, if ever needed, can be obtained by thresholding
# test_predictions, e.g. at 0.5.)
test_predictions = np.mean(test_preds, axis=0)


AUC Score: 0.812538372003786
AUC Score: 0.8015835823685619
AUC Score: 0.8023428517286502
AUC Score: 0.8102834904367912
AUC Score: 0.805649450586033
Average AUC Score: 0.8064795494247644


In [7]:
# Re-read only the identifier column: `patient_id` was dropped from `df_test`
# during preprocessing. A separate name is used so the preprocessed `df_test`
# is not replaced by the raw file.
test_ids = pd.read_csv("test.csv", usecols=["patient_id"])

# Fail with a clear message if the predictions do not line up with the file
assert len(test_ids) == len(test_predictions), \
    "number of rows in test.csv does not match the number of predictions"

# One row per test patient: identifier plus averaged predicted probability
df_predictions = pd.DataFrame({
    'patient_id': test_ids['patient_id'],
    'predictions': test_predictions
})

# NOTE: another cell of this notebook writes to the same 'predictions.csv';
# whichever of the two runs last determines the content of the file.
df_predictions.to_csv('predictions.csv', index=False)

# Preview the first rows as the cell output
df_predictions.head()


In [10]:
# Print a summary of the feature matrix (columns, non-null counts, dtypes)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12906 entries, 0 to 12905
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   breast_cancer_diagnosis_code      12906 non-null  float64
 1   metastatic_cancer_diagnosis_code  12906 non-null  float64
 2   patient_zip3                      12906 non-null  float64
 3   patient_age                       12906 non-null  int64  
 4   payer_type                        12906 non-null  float64
 5   patient_state                     12906 non-null  float64
 6   Region                            12906 non-null  float64
 7   Division                          12906 non-null  float64
 8   health_uninsured                  12906 non-null  float64
 9   Ozone                             12906 non-null  float64
dtypes: float64(9), int64(1)
memory usage: 1008.4 KB


In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [30]:

# Read the data
df_train = pd.read_csv("training.csv")
df_test = pd.read_csv("test.csv")

# Drop patient_id column (identifier, not used as a feature)
df_train = df_train.drop("patient_id", axis=1)
df_test = df_test.drop("patient_id", axis=1)

# Split the columns by dtype; both lists are derived from the training frame
numerical_cols = df_train.select_dtypes(exclude=['object']).columns
categorical_columns = df_train.select_dtypes(include=['object']).columns

# Impute categorical columns using the training-set mode.
# The filled column is assigned back rather than calling
# `fillna(..., inplace=True)` on a column selection: that form is chained
# assignment, which pandas deprecates and which leaves the frame unchanged
# when Copy-on-Write is enabled.
for col in categorical_columns:
    if col != 'DiagPeriodL90D':
        mode = df_train[col].mode()[0]
        df_train[col] = df_train[col].fillna(mode)
        df_test[col] = df_test[col].fillna(mode)

# Impute numerical columns using the training-set median.
# The target column is skipped: it is never imputed and is not looked up in
# the test frame.
for col in numerical_cols:
    if col != 'DiagPeriodL90D':
        median = df_train[col].median()
        df_train[col] = df_train[col].fillna(median)
        df_test[col] = df_test[col].fillna(median)

# Encoding categorical columns
from sklearn.preprocessing import OrdinalEncoder

# Categories that appear only in the test data are encoded as -1.
# The same encoder object is re-fitted for every column, so after the loop it
# only holds the categories of the last column ('patient_zip3').
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
for col in categorical_columns.to_list() + ['patient_zip3']:
    encoder.fit(df_train[[col]])
    df_train[col] = encoder.transform(df_train[[col]])
    df_test[col] = encoder.transform(df_test[[col]])

# Features used by the model
cols = ['breast_cancer_diagnosis_code', 'metastatic_cancer_diagnosis_code', 'patient_zip3', 'patient_age', 'payer_type',
        'patient_state', 'Region', 'Division', 'health_uninsured', 'Ozone']

X_train = df_train[cols]
y_train = df_train['DiagPeriodL90D']
X_test = df_test[cols]

# Standardize the data.
# The scaler is fitted on the full training matrix (before the CV split) and
# the same statistics are then applied to the test matrix.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Imported here so that this cell brings the metric class into scope itself
from tensorflow.keras.metrics import AUC

# Model parameters
n_features = X_train_scaled.shape[1]


def build_model(n_features):
    """Build and compile a fresh binary classifier.

    Architecture: Dense(32, relu) -> Dropout(0.3) -> Dense(18, relu) ->
    Dropout(0.3) -> Dense(1, sigmoid), trained with Adam on binary
    cross-entropy.

    Parameters
    ----------
    n_features : int
        Number of input columns.

    Returns
    -------
    Sequential
        A compiled, untrained model.
    """
    network = Sequential([
        Dense(32, activation='relu', input_shape=(n_features,)),
        Dropout(0.3),
        Dense(18, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    # The metric is named explicitly. With the string shortcut 'AUC' Keras
    # generates the name itself and de-duplicates it (auc, auc_1, ...) as more
    # models are created, so the 'val_auc' key monitored by early stopping
    # would not exist for later models.
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='auc')])
    return network


# Cross-validation settings
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store the per-fold AUC scores and per-fold test predictions
auc_scores = []
test_preds = []

# NOTE: no random seed is set for the network, so the scores vary between runs.
for train_idx, test_idx in cv.split(X_train_scaled, y_train):
    X_train_fold, X_test_fold = X_train_scaled[train_idx], X_train_scaled[test_idx]
    y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # A new model for every fold so that no weights carry over between folds
    model = build_model(n_features)

    # Stop when the validation AUC has not improved for 5 epochs and restore
    # the best weights. mode='max' states explicitly that higher is better.
    # NOTE(review): the fold used for early stopping is also the fold scored
    # below, so the reported AUCs are presumably somewhat optimistic.
    early_stopping = EarlyStopping(monitor='val_auc', mode='max', patience=5, restore_best_weights=True)

    # Train the model
    model.fit(X_train_fold, y_train_fold, epochs=100, batch_size=32,
              validation_data=(X_test_fold, y_test_fold), callbacks=[early_stopping], verbose=0)

    # Predicted probabilities, flattened from shape (n, 1) to (n,)
    preds = model.predict(X_test_fold, verbose=0).ravel()
    preds_test = model.predict(X_test_scaled, verbose=0).ravel()
    test_preds.append(preds_test)

    # Calculate AUC score
    auc_score = roc_auc_score(y_test_fold, preds)
    auc_scores.append(auc_score)
    print(f"AUC Score: {auc_score}")

# Print average AUC score
print(f"Average AUC Score: {np.mean(auc_scores)}")

# Take the average of predictions for the test data
test_predictions = np.mean(test_preds, axis=0)

AUC Score: 0.8070623672968202
AUC Score: 0.7881443226369822
AUC Score: 0.7838399823818778
AUC Score: 0.7876007984491955
AUC Score: 0.7866437093317149
Average AUC Score: 0.7906582360193182


In [21]:
# Re-read only the identifier column: `patient_id` was dropped from `df_test`
# during preprocessing. A separate name is used so the preprocessed `df_test`
# is not replaced by the raw file.
test_ids = pd.read_csv("test.csv", usecols=["patient_id"])

# Fail with a clear message if the predictions do not line up with the file
assert len(test_ids) == len(test_predictions), \
    "number of rows in test.csv does not match the number of predictions"

# One row per test patient: identifier plus averaged predicted probability
df_predictions = pd.DataFrame({
    'patient_id': test_ids['patient_id'],
    'predictions': test_predictions
})

# NOTE: another cell of this notebook writes to the same 'predictions.csv';
# whichever of the two runs last determines the content of the file.
df_predictions.to_csv('predictions.csv', index=False)

# Preview the first rows as the cell output
df_predictions.head()
