**Purpose**

The purpose of this notebook is to give beginners an accessible starting point for learning how to build a model that someone new to the field can understand. To that end, the notebook is deliberately over-commented for the sake of accessibility. 

In [None]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder, RobustScaler
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
        
# Load the competition training set.
train = pd.read_csv('/kaggle/input/playground-series-s3e26/train.csv') #import training dataset
# The 'id' column is removed from the training frame before any modelling.
train = train.drop('id', axis = 1)
test = pd.read_csv('/kaggle/input/playground-series-s3e26/test.csv')
# Read the original cirrhosis dataset and keep only the columns that `train`
# has (in the same order) so the two frames can be stacked row-wise.
original_data = pd.read_csv('/kaggle/input/cirrhosis-patient-survival-prediction/cirrhosis.csv')[train.columns] 
train = pd.concat(objs=[train, original_data]).reset_index(drop=True)
# Replace Age with Age minus N_Days.
# NOTE(review): presumably both columns are in days (see the commented-out
# division by 365 below) - confirm against the dataset description.
train['Age'] = train['Age'] - train['N_Days']
# train['Age'] = train['Age'] / 365
# train = train.drop(['Drug', 'Sex'], axis = 1)
# Keep the test ids; later cells use them as the index of the submission frames.
index = test['id']
test = test.drop('id', axis =1)
# Same Age adjustment as applied to the training frame.
test['Age'] = test['Age'] - test['N_Days']
# test = test.drop(['Drug', 'Sex'], axis = 1)
# Drop every training row that contains at least one missing value.
train = train.dropna()
# feature_names = list(train.columns[:-1])
# feature_names = list(train.columns[:-1])

In [None]:
feature_names

In [None]:
train.head()

In [None]:
train = train.dropna()
train_df = train
test_df = test

In [None]:
train_df.head()

In [None]:
def engineer_features(data):
    """Add rule-based liver features and drop the columns the models do not use.

    The function works on a copy, so the frame passed in is left untouched
    (the previous version wrote all intermediate columns into the caller's frame).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Bilirubin', 'Alk_Phos', 'SGOT', 'Albumin', 'Cholesterol',
        'Tryglicerides', 'Prothrombin', 'Platelets', 'Ascites', 'Spiders' and 'Drug'.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Bilirubin_score' and 'Cirrhosis_Composite_Score'
        added, and 'Drug', 'Bilirubin' and every intermediate column removed.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    data = data.copy()

    normal_ranges = {
        'Bilirubin': (0.1, 1.2),  # Normal range of bilirubin in mg/dL
        'Alk_Phos': (44, 147),    # Normal range of alkaline phosphatase in IU/L
        'SGOT': (10, 40),         # Normal range of SGOT in U/L
        'Albumin': (3.5, 5.0),    # Normal range of albumin in g/dL
    }

    # Express each lab value relative to its normal range (0 = lower bound, 1 = upper bound).
    normalized_cols = []
    for lab, (low, high) in normal_ranges.items():
        normalized = f'{lab}_normalized'
        data[normalized] = (data[lab] - low) / (high - low)
        normalized_cols.append(normalized)

    # These three features are computed but currently listed in col_drop below;
    # take them out of col_drop to experiment with them.
    data['Liver_Function_Composite'] = data[normalized_cols].mean(axis=1)
    data['Cholesterol_Tryglicerides_Ratio'] = data['Cholesterol'] / data['Tryglicerides']
    data['Prothrombin_Platelets_Ratio'] = data['Prothrombin'] / data['Platelets']

    # One 0/1 flag per abnormal finding.
    yes_no = {'Y': 1, 'N': 0}  # any other value maps to NaN
    data['Ascites_score'] = data['Ascites'].map(yes_no)
    data['Bilirubin_score'] = (data['Bilirubin'] > 1.2).astype(int)  # Elevated bilirubin
    data['Albumin_score'] = (data['Albumin'] < 3.5).astype(int)  # Low albumin
    data['Platelets_score'] = (data['Platelets'] < 150).astype(int)  # Low platelets
    data['Alk_Phos_score'] = (data['Alk_Phos'] > 147).astype(int)  # Elevated Alk_Phos
    data['SGOT_score'] = (data['SGOT'] > 40).astype(int)  # Elevated SGOT
    data['Prothrombin_score'] = (data['Prothrombin'] > 12).astype(int)  # Prolonged prothrombin time
    data['Spiders_score'] = data['Spiders'].map(yes_no)

    score_cols = [
        'Ascites_score', 'Bilirubin_score', 'Albumin_score', 'Platelets_score',
        'Alk_Phos_score', 'SGOT_score', 'Prothrombin_score', 'Spiders_score',
    ]
    # Number of abnormal findings per row.
    data['Cirrhosis_Composite_Score'] = data[score_cols].sum(axis=1)

    # Everything that is not kept: the intermediates, two raw columns, and all
    # individual flags except 'Bilirubin_score'.
    col_drop = normalized_cols + [
        'Liver_Function_Composite', 'Cholesterol_Tryglicerides_Ratio',
        'Prothrombin_Platelets_Ratio', 'Drug', 'Bilirubin', 'Ascites_score',
        'Albumin_score', 'Platelets_score', 'Alk_Phos_score', 'SGOT_score',
        'Prothrombin_score', 'Spiders_score',
    ]
    return data.drop(col_drop, axis=1)

# columns_wanted = ['Edema','Albumin','Cirrhosis_Composite_Score', 'Prothrombin_Platelets_Ratio', 'Age', 'Sex', 'N_Days', 'Drug', 'Hepatomegaly', 'Copper', 'Stage', 'Status']
# columns_wanted_test = ['Edema','Albumin','Cirrhosis_Composite_Score', 'Prothrombin_Platelets_Ratio', 'Age', 'Sex', 'N_Days', 'Drug', 'Hepatomegaly', 'Copper', 'Stage']
# Build the engineered training and test frames used by every later cell.
train_df = engineer_features(train)
test_df = engineer_features(test)
# Every engineered column except the target 'Status' counts as a feature.
# NOTE(review): later cells add more columns to train_df and drop 'Edema', and
# this list is not refreshed afterwards - confirm wherever it is used.
feature_names = [col for col in train_df.columns if col != 'Status']
# train_df = train_df[columns_wanted]
# test_df = test_df[columns_wanted_test]

In [None]:
feature_names

In [None]:
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [None]:
def get_gmm_class_features(feat, n):
    """Cluster one column with a Gaussian mixture and store the cluster id as `<feat>_class`.

    The mixture is fitted on the module-level `train_df` and then used to label
    both `train_df` and `test_df` in place. Before fitting / predicting, missing
    values are replaced by the median of the frame they belong to.

    feat: name of the column to cluster.
    n: number of mixture components.
    """
    def as_column_vector(frame):
        # Median-fill the column and reshape it to the (rows, 1) layout the mixture expects.
        series = frame[feat]
        return series.fillna(series.median()).values.reshape(-1, 1)

    class_col = f'{feat}_class'
    mixture = GaussianMixture(n_components=n, random_state=42)
    mixture.fit(as_column_vector(train_df))
    train_df[class_col] = mixture.predict(as_column_vector(train_df))
    test_df[class_col] = mixture.predict(as_column_vector(test_df))
    
# Column -> number of mixture components used for its `<column>_class` feature.
gmm_components = {
    'Bilirubin': 5,
    'Albumin': 5,
    'SGOT': 5,
    'Platelets': 4,
    'Prothrombin': 4,
    'Stage': 4,
    'Cholesterol': 4,
    'Alk_Phos': 5,
    'Age': 5,
    'Tryglicerides': 4,
    'Copper': 4,
}

for gmm_feature, n_components in gmm_components.items():
    # engineer_features() drops some raw columns (currently 'Bilirubin'); asking
    # for a dropped column raises a KeyError, so only cluster what is present.
    if gmm_feature in train_df.columns and gmm_feature in test_df.columns:
        get_gmm_class_features(gmm_feature, n_components)
    else:
        print(f"Skipping GMM classes for '{gmm_feature}': column not present after feature engineering")

In [None]:
train_df.dtypes

In [None]:
# Sort the columns by dtype: numeric ones are scaled later, categorical ones are encoded.
column_dtypes = {col: train_df[col].dtype for col in train_df.columns}
numerical_columns = [col for col, kind in column_dtypes.items() if kind in ['int64', 'float64']]
categorical_columns = [
    col
    for col, kind in column_dtypes.items()
    if col != 'Status' and kind in ['object', 'bool', 'category']
]
# 'Edema' is one-hot encoded separately, so it is taken out of this list.
categorical_columns.remove('Edema')

In [None]:
# 'Edema' is one-hot encoded; the remaining categorical columns are replaced
# by integer codes.
ordinal_transformer = Pipeline(steps=[
    ('ordinal_encoder', OrdinalEncoder())
])

onehot_transformer = Pipeline(steps=[
    ('onehot_encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer(
    transformers = [
        ('ohe', onehot_transformer, ['Edema']),
        ('ord', ordinal_transformer, categorical_columns),
    ],
    # OneHotEncoder produces a sparse matrix by default. Always return a dense
    # array so the result can be assigned straight into DataFrame columns,
    # instead of relying on the combined output happening to be dense.
    sparse_threshold=0,
)

encoded_train = preprocessor.fit_transform(train_df)

# Name the one-hot columns after the categories the encoder actually learned
# (previously hard-coded as Edema_N / Edema_S / Edema_Y), so names and column
# order always match the encoder output.
edema_levels = preprocessor.named_transformers_['ohe'].named_steps['onehot_encoder'].categories_[0]
cat_cols = [f'Edema_{level}' for level in edema_levels] + categorical_columns

train_df[cat_cols] = encoded_train
test_df[cat_cols] = preprocessor.transform(test_df)
# The raw 'Edema' column is replaced by its one-hot columns.
train_df.drop(columns = ['Edema'], inplace = True)
test_df.drop(columns = ['Edema'], inplace = True)

# Encode the target labels as integers; `lbl` keeps the label <-> integer mapping.
lbl = LabelEncoder()
train_df['Status'] = lbl.fit_transform(train_df['Status'])

In [None]:
# Scale the numeric columns with a RobustScaler wrapped in a one-step pipeline.
numerical_transformer = Pipeline(steps=[
    ('scaling', RobustScaler())
])

# Rebinds `preprocessor`: from here on it refers to the numeric scaler, not the
# categorical encoder built in the previous cell.
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numerical_transformer, numerical_columns),
        
    ]
)

# Fit on the training frame only, then apply the same scaling to the test frame.
train_df[numerical_columns] = preprocessor.fit_transform(train_df)
test_df[numerical_columns] = preprocessor.transform(test_df)

In [None]:
train_df.dtypes

**Exploratory Data Analysis**

In this section we will explore aspects of the data to learn how we should attack making the model. We are looking for features that are correlated to our labels as well as any features that will need to be cleaned for missing values. 

In [None]:
# Here we look at our data to see what is in our dataframe 
train_df.head()

In [None]:
# get the basic statistics of the numerical columns of the training dataset
train.describe()

Results of preliminary analysis: 
1. Some features will need to be scaled because they have different ranges.
2. Some of the features have a large standard deviation, which suggests they may not make good features. I might consider dropping those features later. 
3. The histograms in the data section of this competition give a much better breakdown of the data than I did here. I would encourage you to go and check out the data here: https://www.kaggle.com/competitions/playground-series-s3e26/data?select=train.csv

In [None]:
# Prepare the data for a correlation-matrix analysis by encoding any remaining
# categorical features as numbers.

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Change categorical values to numerical values
for column in train_df.columns:
    if train_df[column].dtype in ['object', 'bool', 'category']:  # Check if the column contains categorical data
        train_df[column] = label_encoder.fit_transform(train_df[column])
        # Check the frame that is actually transformed (test_df), not the raw `test` frame.
        if column in test_df.columns:
            test_df[column] = label_encoder.transform(test_df[column])


# Show the dataframe
train_df.head()

In [None]:
test.head()

In [None]:
# Create a correlation matrix using pandas
corr_matrix = train_df.corr()
# use seaborn to make a heatmap of the corr_matrix
sn.heatmap(corr_matrix)
# show the plot using matplotlib
plt.show()

In [None]:
# I had a hard time differentiating between the colors, so I printed out
# the correlations so I could better understand the data. The 'Status' row is
# selected by name: the `_class` and one-hot columns were appended to train_df
# after 'Status', so it is no longer the last row of the correlation matrix.
print(corr_matrix.loc['Status', :])

Based on this analysis, the following features are candidates for use in our model: Ascites, Hepatomegaly, Spiders, Edema, Bilirubin, Copper, Albumin, SGOT, Prothrombin, and Stage. 

Next I need to investigate what type of relationship these variables have with the target. 

In [None]:
# Scatter N_Days against the encoded Status and overlay a least-squares line.
x = train_df['Status']
y = train_df['N_Days']
slope, intercept = np.polyfit(x, y, 1)  # degree-1 fit: y ~ slope * x + intercept
plt.scatter(x, y)
plt.plot(x, slope * x + intercept, color='red', label='Line of Best Fit')
plt.xlabel('Status')
plt.ylabel('N_Days')
plt.legend()  # without a legend the label on the fitted line is never displayed
plt.show()

Types of relationships: 

Status and Hepatomegaly: non linear

In [None]:
# Separate the target from the features.
y = train_df['Status']
X = train_df.drop(['Status'], axis=1)
# Scale every feature column; `scaler` is reused later to transform test_df.
# NOTE(review): the columns in numerical_columns were already passed through a
# RobustScaler in an earlier cell, and this scaler is fitted on all rows before
# the train/validation split below - confirm both are intended.
scaler = RobustScaler()
X = scaler.fit_transform(X)
# pca = PCA(n_components=2)  # Adjust n_components as needed
# X = pca.fit_transform(X)
# Hold out 20% of the rows; X_test / y_test are the validation data used to
# score every model in the cells below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape

A simple linear model, built and tested with our training set

In [None]:
# test_df = test_df.drop('id', axis=1)
X_test_real = scaler.transform(test_df)
# X_test_real = test_df.drop('id', axis=1)

In [None]:
train_df.isna().sum()

In [None]:
# NOTE(review): when the notebook is run top to bottom, `df` has not been
# assigned yet at this point (it is first created in the random-forest
# prediction cell further down) - confirm this cell is still needed.
file_path = 'submission.csv'
df.to_csv(file_path) 

Here we create a simple random forest ensemble

In [None]:
# Imported here because this cell runs before the later cells that import optuna.
import optuna


def objective(trial):
    """Optuna objective: validation log loss of a random forest.

    trial: Optuna trial used to sample `n_estimators` (50-200) and
        `min_samples_leaf` (1-50).

    Returns the log loss of the predicted class probabilities on the held-out
    split (X_test, y_test); lower is better.
    """
    # Define hyperparameters to optimize
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
#     max_depth = trial.suggest_int("max_depth", 2, 32, log=True)
#     min_samples_split = trial.suggest_float("min_samples_split", 0.1, 1.0)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 50)

    # Initialize the Random Forest classifier with hyperparameters
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
#         max_depth=max_depth,
#         min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )

    # Train the classifier
    clf.fit(X_train, y_train)

    # Predict probabilities on the validation set
    y_prob = clf.predict_proba(X_test)

    # Log loss is the metric to optimize
    return log_loss(y_test, y_prob)


study = optuna.create_study(direction="minimize")  # log loss: lower is better
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Print the best hyperparameters and score
best_params = study.best_params
best_score = study.best_value
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

In [None]:
# Fit the random forest with hand-set hyperparameters (presumably taken from an
# Optuna run of the search above) and score it on the validation split.
random_forest_model = RandomForestClassifier(n_estimators=154, min_samples_leaf=3, random_state=42)
random_forest_model.fit(X_train, y_train)
y_pred_forest_val = random_forest_model.predict_proba(X_test)
# The metric is log loss (lower is better), not accuracy; the variable keeps
# its original name so nothing else in the notebook changes.
accuracy = log_loss(y_test, y_pred_forest_val)
print(f'Log Loss: {accuracy}')

In [None]:
y_pred_forest = random_forest_model.predict_proba(X_test_real)
df = pd.DataFrame(y_pred_forest, columns=['Status_C', 'Status_CL', 'Status_D'], index=index)
df.head()

In [None]:
file_path = 'submission_forest.csv'
df.to_csv(file_path) 

To do: 
1. Create loss function to test models before submission.... CHECK
2. Try out a neural network to see if it will outperform a tree classifier
    a. My guess is that it should
3. I should also scale the data and rerun the log regression. 
4. Develop a cross validation strategy
5. Develop a function that will optimize the learning parameters

In [None]:
num_classes = 3 

# Number of boosting rounds. The native xgb.train API takes this through its
# `num_boost_round` argument; 'n_estimators' belongs to the scikit-learn
# wrapper and was an unused entry in `params`, so it has been removed.
num_boost_round = 100

params = {
    'objective': 'multi:softprob',  # Multi-class classification
    'eval_metric': 'mlogloss',     # Evaluation metric (log loss)
    'num_class': num_classes,      # Number of classes
    'max_depth': 4,                # Maximum depth of trees
    'learning_rate': 0.1,
}

# Create a DMatrix from your training data
dtrain = xgb.DMatrix(X_train, label=y_train)

# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=num_boost_round)

dtest = xgb.DMatrix(X_test)

# Make predictions: one row of class probabilities per validation sample
predictions_tree = model.predict(dtest)


In [None]:
print(predictions_tree)

In [None]:
accuracy = log_loss(y_test, predictions_tree)
print(f'Log Loss: {accuracy}')

In [None]:
# Predict class probabilities for the real test set with the native XGBoost model.
xtest = xgb.DMatrix(X_test_real)
predictions_test = model.predict(xtest)
# 'id' was dropped from `test` right after loading, so test['id'] raises a
# KeyError; the ids were saved in `index` for exactly this purpose.
df_xboost = pd.DataFrame(predictions_test, columns=['Status_C', 'Status_CL', 'Status_D'], index=index)
df_xboost.head()

In [None]:
import optuna

def objective(trial):
    """Optuna objective: validation log loss of an XGBoost classifier.

    trial: Optuna trial used to sample n_estimators, learning_rate, max_depth,
        subsample, colsample_bytree and min_child_weight.

    Returns the log loss of the predicted class probabilities on the held-out
    split (X_test, y_test); lower is better.
    """
    params = {
        # "multi_logloss" is a LightGBM metric name, not an XGBoost objective;
        # the XGBoost objective for multi-class probabilities is "multi:softprob".
        "objective": "multi:softprob",
        "n_estimators": trial.suggest_int('n_estimators', 500, 750),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    # Build the xgboost model
    optuna_xgbmodel = xgb.XGBClassifier(**params,
                                    random_state=42)

    optuna_xgbmodel.fit(X_train, y_train)
    y_pred_probs = optuna_xgbmodel.predict_proba(X_test)
    logloss = log_loss(y_test, y_pred_probs)
    return logloss

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print(study.best_params)

{'n_estimators': 666, 'learning_rate': 0.09792447281018261, 'max_depth': 3, 'subsample': 0.8802526465250708, 'colsample_bytree': 0.12328759472404174, 'min_child_weight': 9} with feature engineering

old params 
'objective': 'multi:softprob',  # Multi-class classification
    'eval_metric': 'mlogloss',     # Evaluation metric (log loss)
    'num_class': num_classes,      # Number of classes
    'max_depth': 10,                # Maximum depth of trees
    'learning_rate': 0.03970804555039867,
    'n_estimators': 572,
    'subsample': 0.5048914610683402,
    'colsample_bytree': 0.1299678713013594,
    'min_child_weight': 14,
    'enable_categorical': True

In [None]:
num_classes = 3 

# Hyperparameters used for model_2.
params = {
    'objective': 'multi:softprob',  # Multi-class classification
    'eval_metric': 'mlogloss',     # Evaluation metric (log loss)
    'num_class': num_classes,      # Number of classes
    'max_depth': 8,                # Maximum depth of trees
    'learning_rate': 0.03733937019630027,
    'n_estimators': 707,
    'subsample': 0.7713822447289903,
    'colsample_bytree': 0.11897302935991551,
    'min_child_weight': 14,
}


# Train the XGBoost model
model_2 = xgb.XGBClassifier(**params, random_state=42)
model_2.fit(X_train, y_train)

# Make predictions on the validation split
predictions_xgb_test = model_2.predict_proba(X_test)

# Alternative parameter set kept for reference; it is not used by model_2.
# The dict was previously left unclosed (a syntax error) and assigned to
# `params`, which would have overwritten the settings used above.
alt_params = {
    'objective': 'multi:softprob',  # Multi-class classification
    'eval_metric': 'mlogloss',     # Evaluation metric (log loss)
    'num_class': num_classes,      # Number of classes
    'max_depth': 3,                # Maximum depth of trees
    'learning_rate': 0.09792447281018261,
    'n_estimators': 572,
    'subsample': 0.8802526465250708,
    'colsample_bytree': 0.12328759472404174,
    'min_child_weight': 9,
}

In [None]:
importances = model_2.feature_importances_

# Label the bars with the columns the model was actually trained on: X was
# built from train_df without 'Status'. The earlier `feature_names` list was
# created before the `_class` and one-hot columns were added and before
# 'Edema' was dropped, so it no longer matches the model's features.
importance_labels = train_df.drop(['Status'], axis=1).columns

# Plotting feature importances
plt.figure(figsize=(10, 6))
plt.bar(range(len(importances)), importances)
plt.xticks(range(len(importances)), importance_labels, rotation=90)
plt.title('Feature Importances')
plt.show()

In [None]:
predictions_test = model_2.predict_proba(X_test_real)
df_xboost = pd.DataFrame(predictions_test, columns=['Status_C', 'Status_CL', 'Status_D'], index=index)
df_xboost.head()

In [None]:
# Validation log loss of model_2 (lower is better); the value is a log loss,
# so it is printed under that name. The variable keeps its original name.
accuracy = log_loss(y_test, predictions_xgb_test)
print(f'Log Loss: {accuracy}')

In [None]:
file_path = 'submission_xboost.csv'
df_xboost.to_csv(file_path) 

{'n_estimators': 572, 'learning_rate': 0.03970804555039867, 'max_depth': 10, 'subsample': 0.5048914610683402, 'colsample_bytree': 0.1299678713013594, 'min_child_weight': 14}

In [None]:
print(X_train.shape, y_train.shape)

To do: 
1. create neural network and tune it
2. stack the neural network with xgboost with logistic regression as the meta-learner
    a. try it with original features 
    b. try it without original features
3. scale the data

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout


# Define the neural network model: one hidden layer with dropout, followed by a
# softmax output. The layer width, dropout rate and learning rate are hard-coded
# (they match one of the "Best Hyperparameters" results recorded further down).
nn_model = keras.Sequential([
    keras.layers.Dense(204, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.367790476008353),
    keras.layers.Dense(3, activation='softmax')  # Three classes, softmax activation for multi-class
])

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Compile the model with log loss (cross-entropy)
nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0011553897364248928), loss='categorical_crossentropy', metrics=['accuracy'])

# categorical_crossentropy expects one-hot targets, so convert the integer labels.
y_train_onehot = to_categorical(y_train, num_classes=3)
y_test_onehot = to_categorical(y_test, num_classes=3)
# Train the model; 20% of X_train is set aside by Keras as the validation data
# that early stopping monitors.
nn_model.fit(X_train, y_train_onehot, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping])  # Adjust epochs and batch size


# Evaluate the model on the held-out split; the loss returned is the cross-entropy (log loss)
test_loss, test_accuracy = nn_model.evaluate(X_test, y_test_onehot)
print(f"Test Log Loss: {test_loss:.4f}")

# Class probabilities for the held-out split (used when stacking models).
predictions_nn_test = nn_model.predict(X_test)

# Class probabilities for the real test set (used for the submission file).
predictions_nn = nn_model.predict(X_test_real)
print(predictions_nn)

In [None]:
import optuna
import tensorflow as tf
from tensorflow import keras
from keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

# Define the objective function to optimize
def objective(trial):
    """Optuna objective: best validation loss of a one-hidden-layer network.

    trial: Optuna trial used to sample `learning_rate`, `num_units` and
        `dropout_rate`.

    Returns the lowest validation cross-entropy reached during training; lower
    is better.
    """
    # Define hyperparameters to search
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    num_units = trial.suggest_int('num_units', 16, 256, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
#     num_hidden_layers = trial.suggest_int('num_hidden_layers', 1, 5)

    model = keras.Sequential()
    # Build and compile the neural network with the sampled hyperparameters
    model.add(Dense(num_units, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dropout(dropout_rate))

#     for _ in range(num_hidden_layers):
#         model.add(Dense(num_units, activation='relu'))
#         model.add(Dropout(dropout_rate))  # Dropout layer for regularization

    model.add(Dense(3, activation='softmax'))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    y_train_onehot = to_categorical(y_train, num_classes=3)

    # Train the model
    history = model.fit(X_train, y_train_onehot, epochs=50, batch_size=32, validation_split=0.2, verbose=0, callbacks=[early_stopping])

    # Early stopping restores the weights of the best epoch, so report that
    # epoch's validation loss. The last entry of the history belongs to an
    # epoch after the best one whenever training was stopped early.
    return min(history.history['val_loss'])

# Create an Optuna study for hyperparameter optimization
study = optuna.create_study(direction='minimize')

# Run the optimization for a specified number of trials
study.optimize(objective, n_trials=30)

# Get the best hyperparameters and results
best_params = study.best_params
best_log_loss = study.best_value

print(f"Best Hyperparameters: {best_params}")
print(f"Best Log Loss: {best_log_loss:.4f}")



1 layer
Best Hyperparameters: {'learning_rate': 0.013170397942564903, 'num_units': 185}
Best Negative Log Loss: 0.6375

Best Hyperparameters: {'learning_rate': 0.0011553897364248928, 'num_units': 204, 'dropout_rate': 0.367790476008353}
Best Log Loss: 0.6235

Best Hyperparameters: {'learning_rate': 0.0009783562667604346, 'num_units': 31, 'dropout_rate': 0.06061700034277989, 'num_hidden_layers': 4}
Best Log Loss: 0.6521

Scaled data, multiple layers: Best Hyperparameters: {'learning_rate': 0.0005053505112715387, 'num_units': 42, 'dropout_rate': 0.36510169582829427, 'num_hidden_layers': 1}
Best Log Loss: 0.5100

no hidden layers
Best Hyperparameters: {'learning_rate': 0.0029784197548427148, 'num_units': 31, 'dropout_rate': 0.17430692789044092}
Best Log Loss: 0.4999

In [None]:
df_nn = pd.DataFrame(predictions_nn, columns=['Status_C', 'Status_CL', 'Status_D'], index=index)
df_nn.head()

In [None]:
file_path = 'submission_nn.csv'
df_nn.to_csv(file_path) 

In [None]:
import optuna
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import log_loss

# Define the objective function for KNeighborsClassifier
def knn_objective(trial):
    """Optuna objective for k-nearest neighbours.

    Samples `n_neighbors` between 1 and 20, fits on (X_train, y_train) and
    returns the log loss of the predicted probabilities on (X_test, y_test).
    """
    k = trial.suggest_int('n_neighbors', 1, 20)
    fitted = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    return log_loss(y_test, fitted.predict_proba(X_test))

# Define the objective function for SVC
def svc_objective(trial):
    """Optuna objective for a support vector classifier.

    Samples `C` (0.1-10.0) and the kernel, fits on (X_train, y_train) with
    probability estimates enabled and returns the log loss on (X_test, y_test).
    """
    hyperparams = {
        'C': trial.suggest_float('C', 0.1, 10.0),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    }
    fitted = SVC(probability=True, **hyperparams)
    fitted.fit(X_train, y_train)
    return log_loss(y_test, fitted.predict_proba(X_test))


# Create an Optuna study for each model's hyperparameter optimization
knn_study = optuna.create_study(direction='minimize')
svc_study = optuna.create_study(direction='minimize')

# Optimize hyperparameters for KNeighborsClassifier
knn_study.optimize(knn_objective, n_trials=50)

# Optimize hyperparameters for SVC
svc_study.optimize(svc_objective, n_trials=50)

# Get the best hyperparameters and results for each model
best_knn_params = knn_study.best_params
best_svc_params = svc_study.best_params
knn_logloss = knn_study.best_value
svc_logloss = svc_study.best_value


print("Best Hyperparameters for KNeighborsClassifier:", best_knn_params)
print("Best Hyperparameters for SVC:", best_svc_params)
print("KNeighborsClassifier Log Loss:", knn_logloss)
print("SVC Log Loss:", svc_logloss)



In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier

# Assuming you have already split your data into X_train and y_train for training
# and X_test for testing

# Create and train a bagged ensemble of KNeighborsClassifier models
knn_classifier = KNeighborsClassifier(n_neighbors=20) # You can adjust the number of neighbors
bagged_knn = BaggingClassifier(knn_classifier, n_estimators=100, random_state=42)
bagged_knn.fit(X_train, y_train)

# Predict class probabilities on the validation split
knn_preds = bagged_knn.predict_proba(X_test)
# knn_preds_real = bagged_knn.predict_proba(X_test_real)

# Calculate the log loss (lower is better); the value is not an accuracy, so
# it is printed under the right name. The variable keeps its original name.
knn_accuracy = log_loss(y_test, knn_preds)
print("KNeighborsClassifier Log Loss:", knn_accuracy)

# Create and train the Support Vector Classifier (SVC)
# svc_classifier = SVC(kernel='rbf', C=2.367159073790816)# You can choose a different kernel and C value
# bagged_svc = BaggingClassifier(svc_classifier, n_estimators=100, random_state=42)
# bagged_svc.fit(X_train, y_train)

# # Make predictions using SVC
# svc_preds = bagged_svc.decision_function(X_test)
# svc_preds_real = bagged_svc.decision_function(X_test_real)

# # Calculate accuracy for SVC
# svc_accuracy = log_loss(y_test, svc_preds)
# print("SVC Accuracy:", svc_accuracy)

# # Create and train the Gaussian Naive Bayes (GaussianNB) classifier
# gnb_classifier = GaussianNB()
# bagged_gnb = BaggingClassifier(gnb_classifier, n_estimators=100, random_state=42)
# bagged_gnb.fit(X_train, y_train)

# # Make predictions using GaussianNB
# gnb_preds = bagged_gnb.predict_proba(X_test)
# gnb_preds_real = bagged_gnb.predict_proba(X_test_real)

# # Calculate accuracy for GaussianNB
# gnb_accuracy = log_loss(y_test, gnb_preds)
# print("GaussianNB Accuracy:", gnb_accuracy)


In [None]:
import optuna
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

# Define the LightGBM objective function for Optuna
def objective(trial):
    """Optuna objective: validation log loss of a LightGBM classifier.

    trial: Optuna trial used to sample the tuned hyperparameters below.

    Returns the log loss of the predicted class probabilities on the held-out
    split (X_test, y_test).
    """
    # Sampled hyperparameters (the suggest calls run in the order written here)
    tuned = {
        "num_iterations": trial.suggest_int("num_iterations", 50, 200),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 456),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 2, 100),
        "learning_rate": trial.suggest_float('learning_rate', 9e-2, .12, log=True),
    }
    # Settings that stay the same for every trial
    fixed = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_class": 3,
    }

    booster = lgb.LGBMClassifier(**fixed, **tuned, random_state=42)
    booster.fit(X_train, y_train)

    # Score the predicted probabilities on the validation set
    return log_loss(y_test, booster.predict_proba(X_test))

# Create an Optuna study
study = optuna.create_study(direction='minimize')  # We want to minimize log loss

# Run optimization (you can change the number of trials as needed)
study.optimize(objective, n_trials=50)

# Print the best hyperparameters and score
best_params = study.best_params
best_score = study.best_value
print("Best Hyperparameters:", best_params)
print("Best Log Loss:", best_score)

# Complete the tuned values with the fixed settings used during the search, so
# best_params describes the same three-class problem. These were previously
# set to 'binary' / 'binary_logloss', which does not match a three-class target.
best_params['objective'] = 'multiclass'
best_params['metric'] = 'multi_logloss'
best_params['num_class'] = 3



In [None]:
final_params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_class": 3,
        'num_iterations': 90, 
        'lambda_l1': 5.277563111764755e-08, 
        'lambda_l2': 4.782766023009482e-06, 
        'num_leaves': 346, 
        'feature_fraction': 0.26790241031301165, 
        'bagging_fraction': 0.8183489454210847, 
        'bagging_freq': 2, 
        'min_child_samples': 67, 
        'learning_rate': 0.0922843047704934
    }




final_model = lgb.LGBMClassifier(**final_params, random_state=42)
final_model.fit(X_train, y_train)

In [None]:
# stacked_preds = np.hstack((X_test, y_pred_forest_val, predictions_xgb_test, predictions_nn_test))
# stacked_preds_test = np.hstack((X_test_real, y_pred_forest, predictions_test, predictions_nn))

# stacked_preds = np.hstack((X_test, y_pred_forest_val, predictions_xgb_test, predictions_nn_test, gnb_preds, svc_preds, knn_preds))
# stacked_preds_test = np.hstack((X_test_real, y_pred_forest, predictions_test, predictions_nn, gnb_preds_real, svc_preds_real, knn_preds_real))
# print(stacked_preds)

In [None]:
# Meta-learner inputs: the scaled features plus each base model's class
# probabilities for the same rows. These arrays were only defined in
# commented-out code, so this cell raised a NameError.
stacked_preds = np.hstack((X_test, y_pred_forest_val, predictions_xgb_test, predictions_nn_test))
stacked_preds_test = np.hstack((X_test_real, y_pred_forest, predictions_test, predictions_nn))

# Rebinds `model` (previously the native XGBoost booster) to the meta-learner.
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=10000, random_state=42)
model.fit(stacked_preds, y_test)
y_pred = model.predict_proba(stacked_preds)
# NOTE: this log loss is computed on the same rows the meta-learner was fitted
# on, so it is an optimistic estimate.
accuracy = log_loss(y_test, y_pred)
print(f'Log Loss: {accuracy}')
# Here we need probabilities not classifications.
y_pred_real = model.predict_proba(stacked_preds_test)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold

# Initializing Stratified K-Fold with 5 folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initializing an MLPClassifier
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    max_iter=1000,
    random_state=42,
    activation='relu',
    learning_rate_init=0.001,
    solver='adam',
    validation_fraction=0.1,
    momentum=0.9,
    nesterovs_momentum=True,
    batch_size=32,
    beta_1=0.9,
    beta_2=0.999
)

# Creating a StackingClassifier
stacking_model = StackingClassifier(
    estimators=[
        ('XGB', model_2),
#         ('RFM', random_forest_model),
        ('LGBM', final_model)
    ],
    final_estimator=mlp,
    cv=skf
)

In [None]:
stacking_model.fit(X_train, y_train)

In [None]:
y_pred = stacking_model.predict_proba(X_test)
lloss = log_loss(y_test, y_pred)
print(f"Log loss on test data: {lloss}")

In [None]:
df = pd.DataFrame(y_pred_real, columns=['Status_C', 'Status_CL', 'Status_D'], index=index)
df.head()
file_path = 'submission_stacked.csv'
df.to_csv(file_path) 