Titanic: Machine Learning from Disaster

- Defining the problem statement
- Collecting the data
- Exploratory data analysis
- Feature engineering
- Modelling
- Testing


1. Defining the problem statement
- Analyse what sorts of people were most likely to survive the sinking of the Titanic.

2. Collecting the data
- The data comes from the Kaggle Titanic competition:
https://www.kaggle.com/competitions/titanic/data

In [57]:
import pandas as pd

# Read the training and test splits from the local titanic/ folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic/train.csv', 'titanic/test.csv')
)


3. Exploratory data analysis

In [None]:
# Preview the first five rows of the training data.
train.head(5)

In [None]:
# Preview the first five rows of the test data.
test.head(5)

In [None]:
# (rows, columns) of the training data.
train.shape

In [None]:
# (rows, columns) of the test data.
test.shape

In [None]:
# Column dtypes and non-null counts for the training data.
train.info()

In [None]:
# Column dtypes and non-null counts for the test data.
test.info()

In [None]:
# Summary statistics of the numeric training columns.
train.describe()

In [None]:
# Summary statistics of the numeric test columns.
test.describe()

In [None]:
# Number of missing values per column in the training data.
train.isnull().sum()

In [None]:
# Number of missing values per column in the test data.
test.isnull().sum()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Re-read the training split from disk before plotting.
train = pd.read_csv('titanic/train.csv')

# Count of passengers per outcome (0 / 1).
sns.countplot(data=train, x='Survived')
plt.show()

# Fare distribution for each outcome.
sns.boxplot(data=train, x='Survived', y='Fare')
plt.show()

# The same outcome counts, drawn with pandas' histogram.
survived_column = train['Survived']
survived_column.hist()
plt.show()

In [None]:
# Load the test split into `test`. The file was previously bound to `train`,
# which overwrote the training frame with test data (no 'Survived' column)
# while the plots below read `test`.
test = pd.read_csv('titanic/test.csv')

# Number of test passengers at each recorded age.
sns.countplot(x='Age', data=test)
plt.show()

# Fare distribution for each recorded age.
sns.boxplot(x='Age', y='Fare', data=test)
plt.show()

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the training split and clean it in a single chain:
#   - fill missing Age with the median and missing Embarked with the mode,
#   - encode Sex as 1 (male) / 0 (female),
#   - drop Cabin,
#   - one-hot encode Embarked (first level dropped).
train = (
    pd.read_csv('titanic/train.csv')
    .assign(
        Age=lambda frame: frame['Age'].fillna(frame['Age'].median()),
        Embarked=lambda frame: frame['Embarked'].fillna(frame['Embarked'].mode()[0]),
        Sex=lambda frame: frame['Sex'].map({'male': 1, 'female': 0}),
    )
    .drop(columns='Cabin')
    .pipe(pd.get_dummies, columns=['Embarked'], drop_first=True)
)

# Standardise the two continuous columns; `scaler` is kept so the same
# transformation can be applied to other data later.
scaler = StandardScaler()
numeric_columns = ['Age', 'Fare']
train[numeric_columns] = scaler.fit_transform(train[numeric_columns])

# Derived features: family size (self included) and a travelling-alone flag.
train = train.assign(
    FamilySize=lambda frame: frame['SibSp'] + frame['Parch'] + 1,
    IsAlone=lambda frame: (frame['FamilySize'] == 1).astype(int),
)

# Preview of the frame used for modelling.
print(train.head())




In [None]:
def charts(feature, figsize=(8, 6)):
    """Draw a stacked bar chart of survivors vs. non-survivors for one feature.

    Reads the notebook-level ``train`` DataFrame and counts the values of
    ``feature`` separately for rows with ``Survived == 1`` and ``Survived == 0``.

    Parameters
    ----------
    feature : str
        Name of the column in ``train`` to break the counts down by.
    figsize : tuple of (float, float), default (8, 6)
        Width and height of the figure in inches.
    """
    survived = train.loc[train['Survived'] == 1, feature].value_counts()
    dead = train.loc[train['Survived'] == 0, feature].value_counts()
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']

    # DataFrame.plot opens its own figure, so the size must be passed to it
    # directly. A separate plt.figure(figsize=...) call only produced an extra
    # empty figure and the requested size was ignored.
    df.plot(kind='bar', stacked=True, figsize=figsize, title=f'Survival by {feature}')
    plt.show()

charts('Sex', figsize=(5, 4))

Women survived at a higher rate than men, as women boarded the lifeboats first.

In [None]:
charts('Pclass')

First-class passengers survived at a higher rate than passengers in the other classes.

In [None]:
# Mean survival rate per category, one figure for each grouping column.
for grouping_column in ('Pclass', 'Sex'):
    sns.barplot(data=train, x=grouping_column, y='Survived')
    plt.show()


4.  Feature engineering

In [None]:
# Define Feature Matrix (X) and Target Variable (y)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Model inputs (X) and the survival label (y).
feature_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'IsAlone']
X = train[feature_columns]
y = train['Survived']

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the sizes of both partitions.
print(f"X_train shape: {X_train.shape}, X_val shape: {X_val.shape}")
print(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")



In [None]:
# TODO(review): does this location make sense? `Title` is added to `train`
# here and is only used for the plot below.

def _title_from_name(full_name):
    """Return the text between the first comma and the following period, stripped."""
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()

train['Title'] = train['Name'].apply(_title_from_name)

# Mean survival rate per extracted title.
sns.barplot(data=train, x='Title', y='Survived')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Logistic regression baseline

# Build the model with default settings and fit it on the training split
# (fit returns the estimator itself, so the chained form yields the fitted model).
logreg = LogisticRegression().fit(X_train, y_train)

# Predict the held-out validation rows.
y_pred = logreg.predict(X_val)

# Share of validation rows predicted correctly.
accuracy = accuracy_score(y_val, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")


Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Counts of actual (rows) vs. predicted (columns) classes on the validation set.
conf_matrix = confusion_matrix(y_val, y_pred)

# Annotated heatmap of the counts on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


True Negatives: model correctly predicted that 91 passengers did not survive.
False Positives: model incorrectly predicted that 14 passengers survived, but they did not.
False Negatives: model incorrectly predicted that 21 passengers did not survive, but they actually did.
True Positives: model correctly predicted that 53 passengers survived.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Compute every validation metric first, then report them together.
conf_matrix = confusion_matrix(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

# Raw confusion-matrix counts.
print("Confusion Matrix:")
print(conf_matrix)

# Scalar metrics, four decimal places each.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")



Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 100-tree random forest (fixed seed) on the training split;
# fit returns the estimator, so rf_model is the fitted model.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict the validation rows and report accuracy.
y_pred_rf = rf_model.predict(X_val)
print(f"Random Forest Accuracy: {accuracy_score(y_val, y_pred_rf):.4f}")


In [None]:


# Importances from the fitted random forest, paired with their column names.
feature_importance = rf_model.feature_importances_
features = X_train.columns

# Tabulate and order from most to least important.
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart on an explicit axes object.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
ax.set_title('Random Forest Feature Importance')
ax.invert_yaxis()  # most important feature at the top
plt.show()


This demonstrates that Fare is the most important feature for predicting survival in the Random Forest model.

In [None]:
# Age histograms, one panel per survival outcome.
age_grid = sns.FacetGrid(train, col="Survived")
age_grid.map(plt.hist, "Age", bins=20)
plt.show()

# Age vs. Fare scatter per survival outcome, coloured by Sex.
scatter_grid = sns.FacetGrid(
    train,
    col="Survived",
    hue="Sex",
    palette="Set1",
    col_wrap=2,
)
scatter_grid.map(plt.scatter, "Age", "Fare", alpha=.7)
scatter_grid.add_legend()
plt.show()

# Disabled box-plot grid, kept for reference.
# NOTE(review): presumably disabled because order=['male', 'female'] expects
# string labels for Sex — confirm before re-enabling.
# # Create a FacetGrid with specified order for 'Sex'
# g = sns.FacetGrid(train, row="Pclass", col="Survived", margin_titles=True)
# g.map_dataframe(sns.boxplot, x="Sex", y="Age", order=['male', 'female'])
# plt.show()

# Pairwise view of Age, Fare and Pclass coloured by survival outcome:
# histograms on the diagonal, scatter plots elsewhere.
g = sns.PairGrid(train, vars=["Age", "Fare", "Pclass"], hue="Survived")
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend()
plt.show()


In [None]:
# Kernel density of Fare, one curve per survival outcome.
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Fare', fill=True)
# Span the observed range rather than starting at 0: once Fare has been
# standardised it takes negative values, and a lower limit of 0 would hide
# every below-average fare.
facet.set(xlim=(train['Fare'].min(), train['Fare'].max()))
facet.add_legend()

plt.show()

In [None]:
# Kernel density of Age, one curve per survival outcome.
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age', fill=True)
# Span the observed range rather than starting at 0: once Age has been
# standardised it takes negative values, and a lower limit of 0 would hide
# every below-average age.
facet.set(xlim=(train['Age'].min(), train['Age'].max()))
facet.add_legend()

plt.show()

In [None]:
# Kernel density of FamilySize, one curve per survival outcome.
# NOTE(review): the original mapped a column named 'F', which does not exist
# in `train` and raised an error, with an x-limit taken from Age. FamilySize
# is assumed to be the intended feature — confirm.
facet = sns.FacetGrid(train, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'FamilySize', fill=True)
facet.set(xlim=(0, train['FamilySize'].max()))
facet.add_legend()

plt.show()

In [None]:
# Prepare the test dataset with the same steps that were applied to `train`.
test = test.drop(columns='Cabin')
test['Sex'] = test['Sex'].map({'male': 1, 'female': 0})
test = pd.get_dummies(test, columns=['Embarked'], drop_first=True)

# Feature scaling with the scaler fitted on the training data.
# This must happen BEFORE imputation: train['Age'] / train['Fare'] are already
# standardised, so their medians live in scaled space. Filling the raw test
# columns with those medians and scaling afterwards (the previous order) put
# wrong values into every imputed row. StandardScaler.transform leaves NaNs
# in place, so missing values survive this step and are filled next.
test[['Age', 'Fare']] = scaler.transform(test[['Age', 'Fare']])

# Impute missing values with the (scaled) training medians.
test['Age'] = test['Age'].fillna(train['Age'].median())
test['Fare'] = test['Fare'].fillna(train['Fare'].median())

# Feature engineering: family size (self included) and travelling-alone flag.
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1
test['IsAlone'] = (test['FamilySize'] == 1).astype(int)

# Feature set for the test dataset, in the same column order used for training.
X_test = test[['Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'IsAlone']]
assert not X_test.isnull().values.any(), "X_test still contains missing values"

# Predict with the random forest fitted earlier.
test_predictions = rf_model.predict(X_test)

# Submission frame: one prediction per PassengerId.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_predictions
})

# Save the submission to a CSV file.
submission.to_csv('titanic_submission.csv', index=False)

print("Testing and submission file creation completed.")