In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, LeakyReLU
from tensorflow.keras.optimizers import Adam

def dfchange(train_df):
    train_df['Group'] = train_df['PassengerId'].apply(lambda x: x.split('_')[0])
    train_df['Number'] = train_df['PassengerId'].apply(lambda x: x.split('_')[1])
    train_df['GroupSize'] = train_df['Group'].apply(lambda x: int(x.lstrip('0')))
    train_df['Side'] = train_df['Cabin'].str[-1]
    train_df['CryoSleep'] = train_df['CryoSleep'].map({True: 1, False: 0})
    train_df['CryoSleep'] = train_df['CryoSleep'].fillna(0.36)
    train_df['Side'] = train_df['Side'].map({'P': 1, 'S': 0})
    train_df['Side'] = train_df['Side'].fillna(0.5)
    train_df['VIP'] = train_df['VIP'].map({True: 1, False: 0})
    train_df['VIP'] = train_df['VIP'].fillna(0.03)
    return train_df

2024-07-07 17:35:30.873436: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-07 17:35:30.873589: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-07 17:35:31.035075: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load the training data
train_df = pd.read_csv('/kaggle/input/summer-school-24-25-hackathon/train.csv')
train_df = dfchange(train_df)

In [3]:
# Separate features and target variable
X = train_df.drop(columns=['Transported', 'Name', 'PassengerId', 'Group', 'Cabin'])
y = train_df['Transported'].astype(int)

X.to_csv('/kaggle/working/updatedtrain.csv', index=False)

# Define preprocessing for numerical and categorical features
numerical_features = ['Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'GroupSize', 'Number', 'CryoSleep', 'Side']
categorical_features = ['HomePlanet', 'Destination']

In [4]:
# Preprocessing pipelines for both numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess the features
X_preprocessed = preprocessor.fit_transform(X)

# Split the data into training and test sets with stratified split
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
# Build the neural network model
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='tanh'),
    Dense(32, activation='tanh'),
    Dense(1, activation='sigmoid')  # Output layer with sigmoid activation
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

In [6]:
# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Epoch 1/20
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7338 - loss: 0.1794 - val_accuracy: 0.7922 - val_loss: 0.1461
Epoch 2/20
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7955 - loss: 0.1451 - val_accuracy: 0.7994 - val_loss: 0.1429
Epoch 3/20
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7882 - loss: 0.1432 - val_accuracy: 0.7944 - val_loss: 0.1422
Epoch 4/20
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7969 - loss: 0.1416 - val_accuracy: 0.7951 - val_loss: 0.1426
Epoch 5/20
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7967 - loss: 0.1401 - val_accuracy: 0.8037 - val_loss: 0.1411
Epoch 6/20
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7881 - loss: 0.1407 - val_accuracy: 0.7951 - val_loss: 0.1419
Epoch 7/20
[1m174/174[0m 

In [7]:
# Load the test data
test_df = pd.read_csv('/kaggle/input/summer-school-24-25-hackathon/test.csv')
test_df = dfchange(test_df)

# Separate the features from the test dataset
X_test_final = test_df.drop(columns=['Name', 'PassengerId', 'Group', 'Cabin'])

# Preprocess the test features
X_test_final_preprocessed = preprocessor.transform(X_test_final)

# Make predictions using the trained model
predictions = model.predict(X_test_final_preprocessed)

# Convert predictions to boolean values (True/False)
predicted_transportation = (predictions >= 0.5).astype(bool).flatten()

# Create a DataFrame with PassengerId and Transported columns
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': predicted_transportation
})

# Save the DataFrame to a CSV file
submission_df.to_csv('/kaggle/working/susmission leakyrelu+crossentropy.csv', index=False)

[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
