In [2]:
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")


# Libraries

In [9]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
import optuna
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Loading Data

In [10]:
# Load the data
X_train = pd.read_csv("/kaggle/input/ensembledata/X_train.csv")
y_train = pd.read_csv("/kaggle/input/ensembledata/y_train.csv")
X_test = pd.read_csv("/kaggle/input/ensembledata/X_test.csv")

# Define preprocessing for numeric columns (scale them)
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define preprocessing for categorical columns (encode them)
categorical_features = ['COUNTRY']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Data Processing

In [11]:
# Preprocessing: Fit and transform the training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Splitting the preprocessed training data
X_train_pp, X_val_pp, y_train_pp, y_val_pp = train_test_split(X_train_preprocessed, y_train['TARGET'], test_size=0.2, random_state=42)

# Define the Neural Network model
def build_model(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(), loss='mean_squared_error')
    return model


# Stacking

In [12]:
# Create the model
model_nn = build_model(X_train_pp.shape[1])

# Train the model
history = model_nn.fit(X_train_pp, y_train_pp, validation_data=(X_val_pp, y_val_pp), epochs=100, batch_size=32, verbose=0)

# Define more diverse base models
base_models = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('svr', SVR(C=1.0, kernel='rbf'))
]

# Use Ridge as a meta-model in the stacking ensemble
meta_model = Ridge(alpha=1.0)

# Create and train the Stacking Regressor with the enhanced dataset
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model, cv=5)
stacking_model.fit(X_train_pp, y_train_pp)

# Making predictions with the optimized neural network
y_pred_nn_optimized = model_nn.predict(X_val_pp)

# Making predictions with the enhanced stacking model
y_pred_stacking_enhanced = stacking_model.predict(X_val_pp)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


# Evaluation

In [15]:
# Evaluate Spearman's correlation for both models
spearman_corr_nn_optimized = spearmanr(y_val_pp, y_pred_nn_optimized.flatten()).correlation
spearman_corr_stacking_enhanced = spearmanr(y_val_pp, y_pred_stacking_enhanced).correlation

print(f"Spearman's Correlation (Optimized NN): {spearman_corr_nn_optimized}")
print(f"Spearman's Correlation (Enhanced Stacking Model): {spearman_corr_stacking_enhanced}")


Spearman's Correlation (Optimized NN): 0.09997530919620212
Spearman's Correlation (Enhanced Stacking Model): 0.18587057529572848


In [16]:
# Splitting the preprocessed training data into training and validation sets
X_train_pp, X_val_pp, y_train_pp, y_val_pp = train_test_split(X_train_preprocessed, y_train['TARGET'], test_size=0.2, random_state=42)

# Making predictions with the enhanced stacking model for the validation set
y_pred_stacking_enhanced_val = stacking_model.predict(X_val_pp)

# Get the IDs for the validation data
ids_val = X_val_pp[:, 0].astype(int).tolist()  

# Create a DataFrame with IDs and predictions for the validation set
submission_df_val = pd.DataFrame({'ID': ids_val, 'TARGET': y_pred_stacking_enhanced_val})

# Convert DataFrame to the desired format for the validation set
submission_str_val = submission_df_val.to_string(index=False, header=False)

# Print the submission string for the validation set
print("Validation Set Submission:")
print(submission_str_val)



Validation Set Submission:
 1  0.385730
 0 -0.142669
-1 -0.360967
 0  0.143502
 0  0.335660
 0  0.224267
 1 -0.506056
 0 -0.199526
 0  0.098745
-1  0.030663
 0  0.234798
 0 -0.148689
 0 -0.021435
 0  0.205603
 0  0.130668
-1  0.164373
 0  0.272987
 0  0.174561
 0  0.007449
 0 -0.148609
 0  0.113314
-1  0.008144
 0  0.174239
-1  0.436106
 0  0.174285
-1 -0.112014
-1 -0.336527
 1  0.116358
 0  0.155931
-1  0.310359
-1 -0.722785
 1  0.027468
 0  0.195352
 0  0.223131
-1  0.163331
-1 -0.331053
 1  0.024022
 1  0.126847
 0  0.156483
 0 -0.431718
 0  0.032921
 0  0.186703
 0  0.206833
 0  0.196431
-1 -0.224969
 0  0.277308
 1  0.264442
 0  0.119537
 0  0.033989
 1  0.111260
-1  0.106485
-1 -0.588625
 1  0.021688
 1  0.130151
 1  0.391666
 1  0.016949
 0 -0.153407
-1  0.593954
 0 -0.714058
 0  0.493959
 0  0.279444
 1 -0.019407
 0 -0.341923
 0 -0.335535
 1  0.122484
 0  0.099114
 0  0.226428
-1  0.194059
-1  0.353395
 1  0.081679
-1 -0.094773
 0  0.079914
 0 -0.178814
 1 -0.066044
 0  0.27801

In [17]:
# Get the IDs for the test data
ids_test = X_test['ID'].tolist()  

# Making predictions with the enhanced stacking model for the test set
y_pred_stacking_enhanced_test = stacking_model.predict(X_test_preprocessed)

# Create a DataFrame with IDs and predictions for the test set
submission_df_test = pd.DataFrame({'ID': ids_test, 'TARGET': y_pred_stacking_enhanced_test})

# Convert DataFrame to the desired format for the test set
submission_str_test = submission_df_test.to_string(index=False, header=False)

# Print the submission string for the test set
print("\nTest Set Submission:")
print(submission_str_test)



Test Set Submission:
1115  0.146479
1202  0.240982
1194 -0.086668
1084  0.211653
1135 -0.018415
 960  0.144865
1131  0.028825
 996  0.154973
1085  0.289090
1063  0.082859
1035 -0.008688
 997  0.102919
1042  0.089500
 954  0.179150
1126 -0.030990
1122 -0.024785
1167 -0.007304
1125 -0.065312
1019  0.053080
 982 -0.069090
1175  0.244235
1083  0.323779
1161 -0.059239
1061  0.096787
1044  0.096783
 948  0.070966
 949  0.038321
 941 -0.033111
1001 -0.073932
1190 -0.218211
 943  0.171854
1011  0.127655
 975  0.237501
1213  0.286711
1127  0.316305
1039  0.048255
1196 -0.182632
1171  0.009847
2028  0.458484
1746  0.271633
1608  0.254469
1356  0.024825
2008  0.071806
1390  0.093924
1337 -0.079768
1830  0.180385
1217  0.032571
2037  0.047751
1989 -0.001136
1984 -0.049833
2147  0.101658
1496  0.153134
1521  0.006032
1935 -0.009252
1994 -0.024508
1389  0.216511
1433 -0.119492
1371  0.082831
1469  0.180896
1956  0.057778
1889  0.015718
1708  0.078252
2071  0.186923
1407  0.220202
1339 -0.141910
158

In [19]:
# Create a DataFrame with IDs and predictions for the validation set
submission_df_val = pd.DataFrame({'ID': ids_val, 'TARGET': y_pred_stacking_enhanced_val})

# Save the validation set submission to CSV
submission_df_val.to_csv('submission_validation.csv', index=False)

# Create a DataFrame with IDs and predictions for the test set
submission_df_test = pd.DataFrame({'ID': ids_test, 'TARGET': y_pred_stacking_enhanced_test})

# Save the test set submission to CSV
submission_df_test.to_csv('submission_test.csv', index=False)

print("Submission files saved successfully.")


Submission files saved successfully.
