In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are expected in the local "data/titanic" directory (on Kaggle they live in the read-only "../input/" directory)
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under that directory

import os
# Walk the data directory recursively and print the path of every file found.
found_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('data/titanic')
    for name in names
)
for found in found_paths:
    print(found)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Typical workflow: 
1. data exploration (visualization, understanding the data/problem)
2. cleaning/imputation (e.g. `fillna`, removal of redundant/least important columns etc.). 
3. model selection and training (decide whether this is a classification or a regression problem; select the most effective model)
4. submit prediction

# 1. EDA - Exploratory Data Analysis

## Read the competition's data card

https://www.kaggle.com/competitions/titanic/data

## Read data

In [10]:
from pathlib import Path

# Sanity check: is the training file reachable from the current working
# directory? The relative path is resolved against the kernel's cwd, so a
# False here means the read below will fail with FileNotFoundError.
Path('./data/titanic/train.csv').exists()

False

In [14]:
# Show the kernel's working directory; the relative data paths used in this
# notebook are resolved against it. Plain Python is used instead of the bare
# `pwd` automagic so the cell also works when automagic is disabled or the
# notebook is exported as a script.
os.getcwd()

'C:\\Users\\dawid\\projects\\kaggle-tutorial'

In [13]:
# Load both competition splits, using the passenger id as the row index.
train_path = Path('./data/titanic/train.csv')
test_path = Path('./data/titanic/test.csv')
train, test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_path, test_path)
)
# Preview the first ten training rows.
train.head(10)

FileNotFoundError: [Errno 2] No such file or directory: 'data\\titanic\\train.csv'

In [None]:
train.describe() # summary statistics (count, mean, std, min, quartiles, max); by default pandas reports numeric columns only

In [None]:
# Inspect the dtype of every column; the non-numeric features kept for modelling
# ('Sex', 'Embarked') are encoded as integers later, in the preprocessing step.
train.dtypes # any suggestions/conclusions at this step?

In [None]:
# Count missing values per column, first for the training split, then for the test split.
train_missing = train.isna().sum()
test_missing = test.isna().sum()
print(train_missing, '\n')
print(test_missing)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def eda_titanic_data(df):
    """Show a fixed sequence of exploratory plots for a Titanic passenger frame.

    Reads the 'Survived', 'Age', 'Fare', 'Sex' and 'Pclass' columns of `df`.
    Each chart is drawn on its own figure and shown immediately, in this order:
    survival counts, age histogram, fare histogram, log(1+fare) histogram,
    survival counts by sex, survival counts by class, age box plot by survival,
    and a correlation heatmap of all numeric columns.

    Returns None; the function is called for its plotting side effects.
    """
    small, large = (6, 4), (8, 6)

    # One entry per chart, in display order: (figure size, drawing callback, title).
    # The callbacks are lambdas so that nothing is drawn until the matching
    # figure has been created inside the loop below.
    charts = [
        (small, lambda: sns.countplot(x='Survived', data=df),
         'Survival Distribution'),
        (small, lambda: sns.histplot(df['Age'], kde=True, bins=10),
         'Age Distribution'),
        (small, lambda: sns.histplot(df['Fare'], kde=True, bins=10),
         'Fare Distribution'),
        (small, lambda: sns.histplot(np.log1p(df['Fare']), kde=True, bins=30),
         'log(1+Fare) Distribution'),
        (small, lambda: sns.countplot(x='Sex', hue='Survived', data=df),
         'Survival Rate by Sex'),
        (small, lambda: sns.countplot(x='Pclass', hue='Survived', data=df),
         'Survival Rate by Pclass'),
        (small, lambda: sns.boxplot(x='Survived', y='Age', data=df, palette='coolwarm'),
         'Age Distribution by Survival'),
        (large, lambda: sns.heatmap(df.select_dtypes(include=[np.number]).corr(),
                                    annot=True, cmap='coolwarm', fmt='.2f'),
         'Correlation Heatmap'),
    ]

    for size, draw, title in charts:
        plt.figure(figsize=size)
        draw()
        plt.title(title)
        plt.show()

eda_titanic_data(train)

# 2. Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

def preprocess_data(train_df, test_df):
    """Clean, encode and scale the Titanic train and test frames.

    Steps, applied to both frames:
      1. drop the 'Name', 'Cabin' and 'Ticket' columns;
      2. fill missing 'Age' with the training median, missing 'Embarked' with
         the training mode, and missing 'Fare' with the training median,
         then replace 'Fare' by log(1 + Fare);
      3. label-encode 'Sex' and 'Embarked' as integers;
      4. standardise 'Age' and 'Fare'.

    Every statistic (fill values, label classes, scaler parameters) is
    computed from `train_df` only and then applied to `test_df`.

    Returns the processed `(train_df, test_df)` pair. The frames passed in are
    not modified, because the initial `drop` returns new objects.
    """
    unused_columns = ['Name', 'Cabin', 'Ticket']
    train_df = train_df.drop(columns=unused_columns)
    test_df = test_df.drop(columns=unused_columns)

    # Fill values are taken from the training frame before anything is imputed.
    age_fill = train_df['Age'].median()
    embarked_fill = train_df['Embarked'].mode()[0]
    fare_fill = train_df['Fare'].median()

    for frame in (train_df, test_df):
        frame['Age'] = frame['Age'].fillna(age_fill)
        frame['Embarked'] = frame['Embarked'].fillna(embarked_fill)
        frame['Fare'] = np.log1p(frame['Fare'].fillna(fare_fill))

    # Encode categorical features: classes are learned on train, reused on test.
    for column in ('Sex', 'Embarked'):
        encoder = LabelEncoder()
        train_df[column] = encoder.fit_transform(train_df[column])
        test_df[column] = encoder.transform(test_df[column])

    # Scale numeric features with mean/std estimated on the training frame.
    numeric_columns = ['Age', 'Fare']
    scaler = StandardScaler().fit(train_df[numeric_columns])
    train_df[numeric_columns] = scaler.transform(train_df[numeric_columns])
    test_df[numeric_columns] = scaler.transform(test_df[numeric_columns])

    return train_df, test_df

train, test = preprocess_data(train, test)

In [None]:
# Features and labels
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 30% of the rows for validation.
y = train['Survived']
X = train.drop(columns='Survived')
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
# Feature matrix: every preprocessed training column except the target 'Survived'.
X

In [None]:
# Target vector: the 'Survived' column of the preprocessed training data.
y

# 3. Model selection & training

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

# Logistic Regression model (max_iter caps the number of solver iterations)
logreg = LogisticRegression(max_iter=200)

# Train on the training split only; the validation split stays unseen
logreg.fit(X_train, y_train)

# Predict the held-out validation split
y_pred = logreg.predict(X_valid)

# Evaluate the model
print("Logistic Regression Accuracy: ", accuracy_score(y_valid, y_pred))
print("Classification Report:\n", classification_report(y_valid, y_pred))

# Confusion-matrix cells are integer counts, so annotate them as integers
# (fmt='d') rather than as two-decimal floats. Rows are the true labels and
# columns the predicted labels.
ax = sns.heatmap(confusion_matrix(y_valid, y_pred), annot=True, cmap='coolwarm', fmt='d')
ax.set(title='Logistic Regression Confusion Matrix', xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest model: 100 trees, fixed seed so results are reproducible
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train on the training split only; the validation split stays unseen
rf.fit(X_train, y_train)

# Predict the held-out validation split
y_pred = rf.predict(X_valid)

# Evaluate the model
print("Random Forest Accuracy: ", accuracy_score(y_valid, y_pred))
print("Classification Report:\n", classification_report(y_valid, y_pred))

# Confusion-matrix cells are integer counts, so annotate them as integers
# (fmt='d') rather than as two-decimal floats. Rows are the true labels and
# columns the predicted labels.
ax = sns.heatmap(confusion_matrix(y_valid, y_pred), annot=True, cmap='coolwarm', fmt='d')
ax.set(title='Random Forest Confusion Matrix', xlabel='Predicted label', ylabel='True label')
plt.show()

# 4. Submit prediction

In [None]:
# Predict survival for the preprocessed test passengers with the logistic regression model.
test_preds = logreg.predict(test)

# Assemble the two-column table the competition expects and write it without the row index.
submission = pd.DataFrame({"PassengerId": test.index, "Survived": test_preds})
submission.to_csv('./submission.csv', index=False)

In [None]:
# Read the submission file back from disk to check what was written.
pd.read_csv('./submission.csv')