In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Titanic training and testing splits into separate frames.
training_data, testing_data = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv")
)

# Stack both splits into one frame: training rows first, then testing rows
# (each split keeps its own original index values).
all_data = pd.concat((training_data, testing_data))

# Data exploration

In [None]:
# Preview the first rows. The prefix (title) embedded in the Name column looks
# like a candidate feature.
training_data.head()

In [None]:
# Check column dtypes and non-null counts to spot columns with missing values.
training_data.info()

In [None]:
# Count how often each Ticket value occurs. The ticket number seems to act as a
# kind of passenger ID: judging by the number of distinct tickets relative to
# the number of rows, it is close to a unique identifier.
training_data["Ticket"].value_counts()

In [None]:
# Show the rows whose ticket number is purely numeric. Whether a ticket is
# numeric becomes a new feature, as the most practical way to use the Ticket column.
has_numeric_ticket = training_data["Ticket"].str.isnumeric().eq(True)
training_data.loc[has_numeric_ticket]

In [None]:
# Inspect the Cabin values per passenger. The raw column is not used as a
# feature because of its large number of NaN values; prepare_data only derives
# a cabin count from it.
training_data["Cabin"].value_counts()

In [None]:
# Print passenger counts per survival outcome, broken down by one categorical
# column at a time. Ticket only serves as the column whose entries are counted.
breakdown_columns = ("Sex", "Pclass", "Embarked")

for position, breakdown in enumerate(breakdown_columns, start=1):
    survival_counts = training_data.pivot_table(
        index="Survived", columns=breakdown, values="Ticket", aggfunc="count"
    )
    # Every table except the last is followed by an extra "\n" separator.
    separator = () if position == len(breakdown_columns) else ("\n",)
    print(survival_counts, *separator)

In [None]:
# Plot the Fare distribution as a density histogram. Fare is kept as a feature
# because it complements TicketNumeric.
fig, ax = plt.subplots()
ax.hist(training_data["Fare"], density=True)
# Label the figure so it can be read on its own when skimming the notebook.
ax.set(title="Fare distribution (training set)", xlabel="Fare", ylabel="Density")
plt.show()

In [None]:
# Plot the Age distribution as a density histogram.
fig, ax = plt.subplots()
# Age contains missing values (prepare_data imputes them later); drop them
# explicitly so the histogram never depends on how the plotting backend treats NaN.
ax.hist(training_data["Age"].dropna(), density=True)
ax.set(title="Age distribution (training set)", xlabel="Age", ylabel="Density")
plt.show()

In [None]:
# Plot the distribution of relatives on board (SibSp + Parch) as a density
# histogram. A value of 0 means the passenger has no relatives recorded,
# anything larger means they travel in a group.
familiars_data = training_data["SibSp"] + training_data["Parch"]

fig, ax = plt.subplots()
ax.hist(familiars_data, density=True)
ax.set(
    title="Relatives on board (training set)",
    xlabel="SibSp + Parch",
    ylabel="Density",
)
plt.show()

In [None]:
# Bar plot of the mean of Survived for each Sex value.
sns.barplot(data=training_data, x="Sex", y="Survived")
plt.show()

In [None]:
# Bar plot of the mean of Survived for each Embarked value.
sns.barplot(data=training_data, x="Embarked", y="Survived")
plt.show()

In [None]:
# Bar plot of the mean of Survived for each Pclass value.
sns.barplot(data=training_data, x="Pclass", y="Survived")
plt.show()

# Prepare training and testing data

In [None]:
# Raw training rows, shown for reference before feature preparation.
training_data.head()

In [None]:
# Raw testing rows, shown for reference before feature preparation.
testing_data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

def prepare_data(df, is_training=True):
    """Build the model-ready feature frame from a raw Titanic passenger frame.

    Steps, in order: add ``GroupSize`` (``SibSp + Parch + 1``), ``TicketNumeric``
    (1 when ``str.isnumeric()`` holds for the ticket, else 0) and ``CabinNumber``
    (number of space-separated cabin entries, 0 when missing); fill missing
    ``Age`` and ``Fare`` with the column mean; drop the raw columns that are no
    longer needed; handle missing ``Embarked``; label-encode ``Pclass``,
    ``Embarked`` and ``Sex``; standardize ``Age`` and ``Fare``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. It is copied first, so the caller's frame is left
        untouched.
    is_training : bool, default True
        When True, rows with a missing ``Embarked`` are dropped. When False,
        those rows are kept and filled with the most frequent port instead, so
        the result keeps exactly one row per input row.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the engineered, encoded and scaled columns.
    """
    # Work on a private copy: nothing below may leak into the caller's frame.
    df = df.copy()

    # SibSp and Parch both describe whether a passenger travels with a group,
    # so merge them into a single size (the +1 counts the passenger).
    df["GroupSize"] = df["SibSp"] + df["Parch"] + 1

    # Ticket works like an ID, although several passengers can share one.
    # Keep only whether the ticket number is purely numeric.
    df["TicketNumeric"] = df["Ticket"].str.isnumeric().astype(int)

    # A name-prefix feature was tried and left out: it made the model overfit.

    # Number of cabins listed for the passenger (0 when Cabin is missing).
    df["CabinNumber"] = df["Cabin"].map(
        lambda cabin: 0 if pd.isna(cabin) else len(cabin.split(" "))
    )

    # Fill missing Age / Fare values with the column mean. Plain assignment is
    # used instead of `df[col].fillna(..., inplace=True)`, which operates on a
    # temporary object and does nothing under pandas Copy-on-Write.
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    df["Fare"] = df["Fare"].fillna(df["Fare"].mean())

    # Drop the raw columns that are not used as features.
    df = df.drop(columns=["SibSp", "Parch", "PassengerId", "Name", "Ticket", "Cabin"])

    if is_training:
        # Only a couple of training rows lack Embarked, so they are dropped.
        df = df.dropna(subset=["Embarked"])
    elif df["Embarked"].isna().any():
        # Never drop rows outside training: predictions must stay aligned with
        # the input rows. Fall back to the most frequent port instead.
        df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode().iloc[0])

    # Convert the categorical columns into integer codes.
    # NOTE(review): encoders and scaler are fitted on whatever frame is passed
    # in, so separate calls for train and test learn separate codes and scales.
    for column in ("Pclass", "Embarked", "Sex"):
        df[column] = LabelEncoder().fit_transform(df[column])

    # Standardize the continuous columns only; CabinNumber and GroupSize are
    # kept as raw counts.
    df[["Age", "Fare"]] = StandardScaler().fit_transform(df[["Age", "Fare"]])

    return df

# Prepare training and testing data. Copies are passed so the raw frames stay
# available unchanged for later cells.
training_data_encoded = prepare_data(training_data.copy(), is_training=True)
testing_data_encoded = prepare_data(testing_data.copy(), is_training=False)

# The submission pairs predictions with testing_data["PassengerId"] row by row,
# so preparation must not have removed any test row.
assert len(testing_data_encoded) == len(testing_data), "test rows were dropped during preparation"

In [None]:
# First rows of the prepared training frame returned by prepare_data.
training_data_encoded.head()

In [None]:
# First rows of the prepared testing frame returned by prepare_data.
testing_data_encoded.head()

# Split data

In [None]:
def split_data(df, is_training=True):
    """Split a prepared frame into a feature matrix and, optionally, labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared data. It is only read, never modified.
    is_training : bool, default True
        When True, ``df`` must contain a ``Survived`` column: it becomes the
        label vector and every other column becomes a feature. When False,
        every column is treated as a feature and no labels are returned.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix, one row per row of ``df``.
    Y : numpy.ndarray or None
        Integer ``Survived`` labels when ``is_training`` is True, else None.

    Raises
    ------
    KeyError
        If ``is_training`` is True and ``df`` has no ``Survived`` column.
    """
    if not is_training:
        return df.to_numpy(), None

    # Pick the label by name instead of assuming it is the first column, and
    # cast a copy so the caller's frame keeps its original dtype.
    Y = df["Survived"].astype(int).to_numpy()
    X = df.drop(columns=["Survived"]).to_numpy()
    return X, Y

# Build the model inputs: features and labels from the training frame,
# features only from the testing frame.
X_train, Y_train = split_data(training_data_encoded, is_training=True)
X_test, _ = split_data(testing_data_encoded, is_training=False)

In [None]:
# Sanity check of the array shapes: training features, testing features, training labels.
print(X_train.shape, X_test.shape, Y_train.shape)

# Model
Try several models and compare their cross-validation accuracy to choose the best one.

In [None]:
# Value passed as `cv=` to every cross_val_score call in the model cells.
cross_valid_param = 10

## KNN

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with scikit-learn's default settings.
knn_classifier = KNeighborsClassifier()

# Cross-validate on the training data; one score is returned per fold.
knn_cross_valid = cross_val_score(
    estimator=knn_classifier,
    X=X_train,
    y=Y_train,
    cv=cross_valid_param,
)

# Report the mean fold score as a percentage.
knn_mean_accuracy = round(knn_cross_valid.mean() * 100, 2)
print(f"KNN Cross Validation accuracy: {knn_mean_accuracy}")

In [None]:
# Fit on the full training set (fit returns the estimator itself), then
# predict the integer labels for the test features.
knn_predicted = knn_classifier.fit(X_train, Y_train).predict(X_test).astype(int)
print(knn_predicted)

## SVM (RBF kernel)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel (not a linear one);
# probability=True additionally enables probability estimates.
svc_classifier = SVC(probability=True, C=1, gamma=0.1, kernel="rbf")

# Train and cross-validate; one score is returned per fold.
svc_cross_valid = cross_val_score(svc_classifier, X_train, Y_train, cv=cross_valid_param)

# Print accuracy. The label names the kernel actually configured above.
print(f"SVM (RBF kernel) Cross Validation accuracy: {round(svc_cross_valid.mean() * 100, 2)}")

In [None]:
# Fit on the full training set (fit returns the estimator itself), then
# predict the integer labels for the test features.
svc_predicted = svc_classifier.fit(X_train, Y_train).predict(X_test).astype(int)
print(svc_predicted)

## Multilayer Perceptron

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

# Small multilayer perceptron: two hidden layers (5 and 2 units), L-BFGS
# solver, and a fixed random_state so runs are repeatable.
mlp_nn = MLPClassifier(
    solver='lbfgs',
    alpha=2e-5,
    hidden_layer_sizes=(5, 2),
    activation="relu",
    random_state=1,
    max_iter=1269,
)

# Train and cross-validate; one score is returned per fold.
mlp_cross_valid = cross_val_score(mlp_nn, X_train, Y_train, cv=cross_valid_param)

# Print accuracy. This must report the MLP scores; the SVM scores were
# previously printed here by mistake.
print(f"MLP Neural Network Cross Validation accuracy: {round(mlp_cross_valid.mean() * 100, 2)}")

In [None]:
# Fit on the full training set (fit returns the estimator itself), then
# predict the integer labels for the test features.
mlp_predicted = mlp_nn.fit(X_train, Y_train).predict(X_test).astype(int)
print(mlp_predicted)

In [None]:
# Assemble the submission: the passenger IDs from the raw test frame next to
# the MLP predictions, then preview the first rows.
final_data = dict(PassengerId=testing_data["PassengerId"], Survived=mlp_predicted)
final_df = pd.DataFrame(data=final_data)
final_df.head()

In [None]:
# Save the submission to submission.csv in the current working directory,
# without writing the DataFrame index as a column.
final_df.to_csv('submission.csv', index=False)