In [1]:
import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

# Read the two Kaggle CSVs: `data` is the labelled set, `test` the competition set.
data, test = map(
    pd.read_csv,
    ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv'),
)

# --- Feature Engineering ---

# 1. Title from Name
def _label_codes(values, classes):
    """Return, for each label in ``values``, its position within ``classes``.

    Labels that are missing or do not occur in ``classes`` are encoded as -1.
    """
    return pd.Categorical(values, categories=classes).codes.astype('int64')


def extractdata(data, fitted=None):
    """Build the numeric feature matrix for the model from a raw Titanic frame.

    The input frame is NOT modified; all work happens on a copy, so the
    function can be called repeatedly on the same frame with the same result.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Name``, ``Ticket``, ``Age``, ``Fare``,
        ``Embarked``, ``Sex``, ``SibSp`` and ``Parch``.
    fitted : dict, optional
        Container for the statistics learned from the data (imputation values
        and label vocabularies).
        * ``None`` (default): everything is learned from ``data`` itself.
        * empty dict: everything is learned from ``data`` and stored in it.
        * populated dict (from an earlier call): the stored values are reused,
          so another frame is imputed and encoded exactly like the first one.
          Titles / ticket prefixes not seen in the first frame are encoded -1.

        Typical use::

            params = {}
            X = extractdata(train_df, params)       # learn + transform
            X_new = extractdata(test_df, params)    # transform only

    Returns
    -------
    pandas.DataFrame
        Columns: Sex, Age, SibSp, Parch, Fare, Embarked, Title, NameLength,
        TicketPrefix, TicketLength, GroupSize, IsGroupTicket.
    """
    fitted = {} if fitted is None else fitted
    # Work on a copy: mutating the caller's frame made a second call map the
    # already-encoded Sex/Embarked columns to NaN.
    data = data.copy()

    # 1. Title from Name (raw string: "\." is an invalid escape in a normal literal)
    data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Group rare titles and normalise spelling variants
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    data['Title'] = data['Title'].replace(rare_titles, 'Rare')
    data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss')
    data['Title'] = data['Title'].replace('Mme', 'Mrs')

    # 2. Name Length
    data['NameLength'] = data['Name'].str.len()

    # 3. Ticket Prefix: strip '.', ' ' and '/', then take the first run of letters
    data['TicketPrefix'] = data['Ticket'].str.replace('[. /]', '', regex=True).str.extract('([A-Za-z]+)', expand=False)
    data['TicketPrefix'] = data['TicketPrefix'].fillna('None')

    # 4. Ticket Length
    data['TicketLength'] = data['Ticket'].str.len()

    # 5. Group Size and IsGroupTicket (counted within this frame only)
    ticket_counts = data['Ticket'].value_counts()
    data['GroupSize'] = data['Ticket'].map(ticket_counts)
    data['IsGroupTicket'] = (data['GroupSize'] > 1).astype(int)

    # An age of 0 is treated as missing.
    data['Age'] = data['Age'].replace(0, np.nan)

    # --- Learn imputation values and vocabularies (only when none were supplied) ---
    if not fitted:
        fitted.update({
            'age_median': data['Age'].median(),
            'fare_median': data['Fare'].median(),
            'embarked_mode': data['Embarked'].mode()[0],
            # Sorted unique labels give the same codes LabelEncoder would produce.
            'title_classes': sorted(data['Title'].dropna().unique()),
            'prefix_classes': sorted(data['TicketPrefix'].unique()),
        })

    # --- Handle Missing Values ---
    data['Age'] = data['Age'].fillna(fitted['age_median'])
    data['Fare'] = data['Fare'].fillna(fitted['fare_median'])
    data['Embarked'] = data['Embarked'].fillna(fitted['embarked_mode'])

    # --- Encode Categorical Variables ---
    data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
    data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    data['Title'] = _label_codes(data['Title'], fitted['title_classes'])
    data['TicketPrefix'] = _label_codes(data['TicketPrefix'], fitted['prefix_classes'])

    # --- Feature Selection ---
    features = [
        'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
        'Title', 'NameLength', 'TicketPrefix', 'TicketLength', 'GroupSize', 'IsGroupTicket'
    ]
    X = data[features]
    return X

# Target labels and engineered feature matrix for the labelled passengers.
y = data['Survived']
X = extractdata(data)

# --- Hold out 20% of the labelled rows (fixed seed keeps the split repeatable) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# --- Standardise: learn mean/std on the training part only, apply to both parts ---
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
X_test



2025-04-25 04:01:43.106000: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745553703.345942      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745553703.417303      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


array([[-0.7243102 , -0.09263364,  0.37992316, ..., -1.01514299,
         0.12847889,  1.23121934],
       [-0.7243102 ,  0.13815631, -0.47072241, ...,  1.18622293,
        -0.5918122 , -0.81220297],
       [-0.7243102 , -0.7080735 , -0.47072241, ...,  3.38758885,
        -0.5918122 , -0.81220297],
       ...,
       [ 1.38062393,  0.67666619,  0.37992316, ..., -0.28135435,
         1.56906107,  1.23121934],
       [ 1.38062393, -0.93886345, -0.47072241, ...,  1.18622293,
        -0.5918122 , -0.81220297],
       [ 1.38062393, -1.93895323,  0.37992316, ...,  0.08553997,
         0.12847889,  1.23121934]])

In [3]:
# --- Build the ANN ---
from tensorflow.keras.activations import gelu
# Seed the Python, NumPy and TensorFlow random generators so weight initialisation
# and batch shuffling (and therefore the reported accuracy) are repeatable.
set_random_seed(42)

# Declare the input with an explicit Input layer; passing `input_dim` to the first
# Dense layer is deprecated in Keras 3 and emits a UserWarning.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),  # single probability output for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# --- Train the ANN ---
# Keep the History object (per-epoch metrics) instead of leaking its repr as cell output.
history = model.fit(X_train, y_train, epochs=50, batch_size=60, validation_data=(X_test, y_test))


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-04-25 04:01:57.593393: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.6582 - loss: 0.6598 - val_accuracy: 0.6872 - val_loss: 0.6098
Epoch 2/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6686 - loss: 0.6329 - val_accuracy: 0.7765 - val_loss: 0.5637
Epoch 3/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7119 - loss: 0.5957 - val_accuracy: 0.8045 - val_loss: 0.5282
Epoch 4/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7677 - loss: 0.5541 - val_accuracy: 0.8212 - val_loss: 0.4998
Epoch 5/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7767 - loss: 0.5282 - val_accuracy: 0.8268 - val_loss: 0.4789
Epoch 6/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8010 - loss: 0.4913 - val_accuracy: 0.8268 - val_loss: 0.4636
Epoch 7/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7ba493c79b90>

In [4]:
# --- Evaluate ---
# Score the network on the held-out split; evaluate() returns the loss followed by
# the compiled metrics (here: accuracy).
eval_scores = model.evaluate(X_test, y_test)
loss, accuracy = eval_scores
print(f'Test Accuracy: {accuracy:.3f}')


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8566 - loss: 0.4086 
Test Accuracy: 0.849


In [5]:
# Engineer the same feature columns for the competition set.
# Note: from here on `X_test` refers to the Kaggle test features, no longer the hold-out split.
X_test = extractdata(test)
X_test.to_numpy()

array([[ 0. , 34.5,  0. , ...,  6. ,  1. ,  0. ],
       [ 1. , 47. ,  1. , ...,  6. ,  1. ,  0. ],
       [ 0. , 62. ,  0. , ...,  6. ,  1. ,  0. ],
       ...,
       [ 0. , 38.5,  0. , ..., 18. ,  1. ,  0. ],
       [ 0. , 27. ,  0. , ...,  6. ,  1. ,  0. ],
       [ 0. , 27. ,  1. , ...,  4. ,  1. ,  0. ]])

In [7]:
# --- Predict on the Kaggle test set and write the submission file ---
# Scale into a new variable rather than overwriting `X_test`: assigning the scaled
# result back to `X_test` meant that re-running this cell scaled the features a
# second time and silently changed the predictions.
kaggle_features = scaler.transform(X_test)

# The sigmoid output is a probability; threshold at 0.5 to obtain 0/1 labels.
kaggle_predictions = (model.predict(kaggle_features) > 0.5).astype(int).flatten()  # processed `test.csv`
assert len(kaggle_predictions) == len(test), "one prediction per test passenger expected"

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": kaggle_predictions
})
submission.to_csv('submission.csv', index=False)
print(submission)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]




In [8]:
import os
# Confirm that the submission file is present in the Kaggle working directory.
working_files = os.listdir('/kaggle/working')
print(working_files)

['submission.csv', '.virtual_documents']


In [9]:
from IPython.display import FileLink

# Write the submission to the Kaggle working directory, then end the cell with a
# FileLink so the notebook renders a clickable download link for that file.
submission_path = '/kaggle/working/submission.csv'
submission.to_csv(submission_path, index=False)
FileLink(submission_path)