In [None]:
# Import necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

## Load the Titanic dataset

In [None]:
# Read the training split first, then the test split, into separate frames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

## Explore dataset information

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

## Data Visualization

In [None]:
# Use seaborn's dark grid theme for every figure that follows.
sns.set_theme(style="darkgrid")

In [None]:
# Class balance of the target: bar chart followed by the exact counts
sns.countplot(x='Survived', data=train_df)
plt.gca().set(xlabel='Categories', ylabel='Count')
plt.show()
train_df['Survived'].value_counts()

In [None]:
# Histogram of passenger ages in the training frame
plt.figure(figsize=(10, 6))
sns.histplot(data=train_df, x='Age')
plt.gca().set(title="Age Distribution", xlabel="Age", ylabel="Frequency")
plt.show()

In [None]:
# Histogram of the SibSp column (siblings / spouses aboard)
plt.figure(figsize=(10, 6))
sns.histplot(data=train_df, x='SibSp')
plt.gca().set(
    title="Families (siblings and spouses)",
    xlabel="Number",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Histogram of the Parch column (parents / children aboard)
plt.figure(figsize=(10, 6))
sns.histplot(data=train_df, x='Parch')
plt.gca().set(
    title="Families (parents and children)",
    xlabel="Number",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Histogram of the ticket class column
plt.figure(figsize=(10, 6))
sns.histplot(data=train_df, x='Pclass')
plt.gca().set(title="Ticket Class", xlabel="Class", ylabel="Frequency")
plt.show()

In [None]:
# Histogram of the Sex column
plt.figure(figsize=(10, 6))
sns.histplot(data=train_df, x='Sex')
plt.gca().set(title="Sex distribution", xlabel="Sex", ylabel="Frequency")
plt.show()

In [None]:
# Histogram of the port-of-embarkation column
plt.figure(figsize=(10, 6))
sns.histplot(data=train_df, x='Embarked')
plt.gca().set(title="Port of Embarkation", xlabel="Port", ylabel="Frequency")
plt.show()

## Data Preprocessing

In [None]:
# Missing values per column in the training frame, largest count first.
train_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Missing values per column in the test frame, largest count first.
test_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Encoder that maps categorical labels to integer codes
label_encoder = LabelEncoder()

In [None]:
def encoding(data):
    """Label-encode every object-dtype column of ``data`` in place.

    Each column is encoded with its own freshly fitted ``LabelEncoder``; no
    module-level state is read.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``object`` columns are overwritten with integer codes.

    Returns
    -------
    None
        ``data`` is modified in place.

    Notes
    -----
    The codes are fitted on ``data`` alone.
    NOTE(review): calling this separately on two frames gives each frame its
    own label-to-code mapping — confirm that is acceptable for columns the
    frames share.
    """
    for column in data.select_dtypes("object").columns:
        # A new encoder per column: fit_transform refits from scratch anyway.
        data[column] = LabelEncoder().fit_transform(data[column])

In [None]:
# Encode the categorical columns of both frames in place.
encoding(train_df)
encoding(test_df)

In [None]:
# Min-max scale numerical features
def scaler(data, column_to_exclude):
    """Rescale the numeric columns of ``data`` with a ``MinMaxScaler``, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns are overwritten with their scaled values.
    column_to_exclude : hashable, or list / set of hashables
        Column name(s) to leave untouched (for example an identifier column).
        Names that are not numeric columns of ``data`` are ignored.

    Returns
    -------
    None
        ``data`` is modified in place.

    Notes
    -----
    A new ``MinMaxScaler`` (default settings) is fitted on every call, so each
    frame is scaled by its own per-column minimum and maximum. Every numeric
    column that is not excluded is scaled, including a numeric target column
    if the frame contains one.
    """
    if isinstance(column_to_exclude, (list, set, frozenset)):
        excluded = set(column_to_exclude)
    else:
        excluded = {column_to_exclude}

    numeric_columns = [
        name
        for name in data.select_dtypes(include=['number']).columns
        if name not in excluded
    ]

    # Nothing left to scale: MinMaxScaler rejects input with zero features.
    if not numeric_columns:
        return

    # Named differently from the function so the function name is not shadowed.
    min_max = MinMaxScaler()
    data[numeric_columns] = min_max.fit_transform(data[numeric_columns])

In [None]:
# Scale the numeric columns of both frames in place, leaving PassengerId untouched.
scaler(train_df, 'PassengerId')
scaler(test_df, 'PassengerId')

In [None]:
# Inspect the training frame after encoding and scaling.
train_df.head()

In [None]:
# Replace missing values with mean (Age) - training set
# fillna(..., inplace=True) returns None, so assign the filled column back to
# the frame and keep a reference to it for display.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
A_filled = train_df['Age']

print(A_filled)

In [None]:
# Replace missing values with median (Cabin) - training set
# Store the result in the frame: a fill that is only bound to a separate
# variable leaves train_df['Cabin'] unchanged.
train_df['Cabin'] = train_df['Cabin'].fillna(train_df['Cabin'].median())
C_filled = train_df['Cabin']

print(C_filled)

In [None]:
# Replace missing values with median (Embarked) - training set
# Store the result in the frame: a fill that is only bound to a separate
# variable leaves train_df['Embarked'] unchanged.
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].median())
E_filled = train_df['Embarked']

print(E_filled)

In [None]:
# Replace missing values with mean (Age) - test set
# fillna(..., inplace=True) returns None, so assign the filled column back to
# the frame and keep a reference to it for display.
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())
A_filled = test_df['Age']

print(A_filled)

In [None]:
# Replace missing values with median (Cabin) - test set
# Store the result in the frame: a fill that is only bound to a separate
# variable leaves test_df['Cabin'] unchanged.
test_df['Cabin'] = test_df['Cabin'].fillna(test_df['Cabin'].median())
C_filled = test_df['Cabin']

print(C_filled)

In [None]:
# Replace missing values with mean (Fare) - test set
# fillna(..., inplace=True) returns None, so assign the filled column back to
# the frame and keep a reference to it for display.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())
F_filled = test_df['Fare']

print(F_filled)

In [None]:
# Re-check the per-column missing-value counts of the training frame.
train_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Re-check the per-column missing-value counts of the test frame.
test_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Pairwise correlation matrix of the training columns.
train_df.corr()

In [None]:
# Correlation of every other column with Survived (the target's own entry is dropped),
# ranked by absolute value so the strongest relationships come first.
correlations = train_df.corr()['Survived'].drop('Survived')
sorted_correlations = correlations.abs().sort_values(ascending=False)
sorted_correlations

In [None]:
# Bar chart of the ranked absolute correlations, one bar per feature
sns.barplot(y=sorted_correlations, x=sorted_correlations.index)
plt.xticks(rotation=90)
plt.gca().set(xlabel='Features', ylabel='Absolute Correlation')
plt.show()

## Evaluate the Classifier

In [None]:
# Separate the features from the target, then hold out a stratified 20% of the
# labelled rows. The evaluation helpers read X_test / y_test, and test_df has
# no Survived column, so the labelled hold-out has to come from train_df.
features = train_df.drop('Survived', axis=1)
# Cast to int: the scaling step rescales every numeric column except
# PassengerId, which leaves the 0/1 labels stored as floats.
target = train_df['Survived'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

In [None]:
# Shape (rows, columns) of the training features
X_train.shape

In [None]:
# Shape (rows, columns) of the test frame
test_df.shape

In [None]:
# Confusion-matrix heatmap helper
def Confusion_Matrix(model):
    """Fit ``model`` and plot the confusion matrix of its predictions.

    The estimator is fitted in place on the module-level ``X_train`` /
    ``y_train`` and its predictions for the module-level ``X_test`` are
    compared with ``y_test``; all four names must exist before the call.
    """
    model.fit(X_train, y_train)
    matrix = confusion_matrix(y_test, model.predict(X_test))

    # Heatmap annotated with the integer counts
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt="d", ax=ax)
    ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
    plt.show()

In [None]:
# Metric summary helper
def evaluate_classifier(classifier):
    """Fit ``classifier`` and score its predictions.

    The estimator is fitted in place on the module-level ``X_train`` /
    ``y_train`` and evaluated against the module-level ``X_test`` / ``y_test``.

    Returns
    -------
    tuple
        ``(metrics, report)`` where ``metrics`` is a dict with the keys
        ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and ``'F1-Score'``
        (the last three weighted-averaged) and ``report`` is the text
        produced by ``classification_report``.
    """
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    # Scores that take the weighted average over classes
    weighted_scorers = {
        'Precision': precision_score,
        'Recall': recall_score,
        'F1-Score': f1_score,
    }

    metrics = {'Accuracy': accuracy_score(y_test, predicted)}
    for label, scorer in weighted_scorers.items():
        metrics[label] = scorer(y_test, predicted, average='weighted')

    return metrics, classification_report(y_test, predicted)

## Decision Tree Classifier 

In [None]:
# Base decision tree with a fixed random_state
model = DecisionTreeClassifier(random_state=42)

In [None]:
# Hyper-parameter grid for the tree, searched exhaustively with 5-fold
# cross-validation and ranked by accuracy.
parameters = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
dt_grid = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)

In [None]:
# Run the cross-validated grid search on the training data.
dt_grid.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the search, and the estimator built with it.
best_params = dt_grid.best_params_
best_estimator = dt_grid.best_estimator_

In [None]:
# Display the selected estimator.
best_estimator

In [None]:
# Confusion Matrix for Decision Tree Classifier (the helper refits the estimator before plotting)
Confusion_Matrix(best_estimator)

In [None]:
# Evaluate the Decision Tree Classifier; the result is a (metrics dict, report text) tuple
evaluate_dt_grid = evaluate_classifier(best_estimator)

In [None]:
# Classification report (second element of the tuple returned by evaluate_classifier)
print(evaluate_dt_grid[1])

In [None]:
# Predict on the Kaggle test frame itself: its rows are the ones the
# submission's PassengerId column refers to. Selecting X_train's columns keeps
# the feature order identical to the one used for fitting. The result stays
# under the name `y_test` because the submission cell reads it from there.
y_test = best_estimator.predict(test_df[X_train.columns])

In [None]:
# Generate Output DataFrame
output = pd.DataFrame(
    data={
        'PassengerId': test_df['PassengerId'],
        # Write the labels as integers: predictions come back as floats
        # (0.0 / 1.0) whenever the model was trained on a float-valued target.
        'Survived': np.asarray(y_test).astype(int)
    }
)
output.sample(15, random_state=42)

In [None]:
# Column dtypes and non-null counts of the submission frame.
output.info()

In [None]:
# Write the submission frame to submission.csv in the working directory,
# without the DataFrame index column.
output.to_csv('submission.csv', index=False)

# Confirm that the write step has run.
print("The submission has been successfully saved.")