# Titanic Survival Prediction
This notebook performs data preprocessing, model training, and evaluation on the Titanic dataset.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib


In [None]:
# Load the Titanic train/test CSVs from the notebook's working directory.
# Relative paths replace the previous machine-specific absolute paths so the
# notebook runs on any checkout, and they match the "train.csv" / "test.csv"
# locations that the later cells already read from.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')


In [None]:
# Stack train and test so both receive identical cleaning and encoding.
full_df = pd.concat([train_df, test_df], sort=False)

# Imputation values are derived from the training rows only, so nothing about
# the test set leaks into the data the model is fit on.
age_fill = train_df['Age'].median()
fare_fill = train_df['Fare'].median()
embarked_fill = train_df['Embarked'].mode()[0]

# Assign the result back instead of calling
# `full_df[col].fillna(..., inplace=True)`: that chained form mutates a
# temporary Series, is deprecated in pandas 2.x and has no effect on the
# frame once Copy-on-Write is enabled.
full_df = full_df.fillna({
    'Age': age_fill,
    'Fare': fare_fill,
    'Embarked': embarked_fill,
})
full_df = full_df.drop(columns=['Name', 'Ticket', 'Cabin'])

# One encoder per column so each fitted mapping stays available afterwards.
# LabelEncoder assigns integer codes in sorted order of the class labels.
sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()
full_df['Sex'] = sex_encoder.fit_transform(full_df['Sex'])
full_df['Embarked'] = embarked_encoder.fit_transform(full_df['Embarked'])
# Backward-compatible alias: `le` previously ended up fitted on 'Embarked'.
le = embarked_encoder

# Split back by position; `.copy()` gives independent frames so later edits
# to one do not trigger chained-assignment warnings against `full_df`.
n_train = len(train_df)
train_clean = full_df.iloc[:n_train].copy()
test_clean = full_df.iloc[n_train:].copy()


In [None]:
# Target is the 'Survived' column; features are every remaining cleaned column
# except the row identifier.
y = train_clean['Survived']
X = train_clean.drop(columns=['Survived', 'PassengerId'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 100-tree random forest, seeded so repeated runs build the same model.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows and report accuracy plus per-class metrics.
y_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", val_accuracy)
print(classification_report(y_val, y_pred))


In [None]:
# Persist the fitted classifier to disk so it can be reloaded without retraining.
model_file = 'model.pkl'
joblib.dump(model, model_file)
print(f"Model saved as {model_file}")


In [None]:

from pathlib import Path
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Paths
train_path = "train.csv"
test_path = "test.csv"
output_dir = Path("visualizations")
output_dir.mkdir(parents=True, exist_ok=True)

# Load data
train_df = pd.read_csv(train_path)

# Preprocess training data.
# Each column is reassigned rather than filled with
# `train_df[col].fillna(..., inplace=True)`: that chained in-place call acts on
# a temporary Series, is deprecated in pandas 2.x and leaves the frame
# unchanged once Copy-on-Write is enabled.
train_df['Sex'] = train_df['Sex'].map({'male': 1, 'female': 0})
train_df['Embarked'] = train_df['Embarked'].fillna('S')
# LabelEncoder assigns integer codes in sorted order of the class labels.
train_df['Embarked'] = LabelEncoder().fit_transform(train_df['Embarked'])
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
train_df['Fare'] = train_df['Fare'].fillna(train_df['Fare'].median())

# Feature columns, in the order they are passed to the model.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = train_df[features]
y = train_df['Survived']

# Load trained model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that you produced yourself or otherwise trust.
rf = joblib.load("model.pkl")

# ---- Batch prediction ----
test_df = pd.read_csv(test_path)
test_df['Sex'] = test_df['Sex'].map({'male': 1, 'female': 0})

# Use fixed codes instead of fitting a fresh LabelEncoder on the test data:
# a new encoder numbers whatever categories happen to be present, so a test
# file missing one category would silently get shifted codes. These values
# are the sorted-order codes LabelEncoder yields for 'C', 'Q' and 'S'.
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
test_df['Embarked'] = test_df['Embarked'].fillna('S').map(embarked_codes)

# Fill gaps with the training medians. Columns are reassigned because the
# chained `test_df[col].fillna(..., inplace=True)` form is deprecated and does
# not update the frame under pandas Copy-on-Write.
test_df['Age'] = test_df['Age'].fillna(train_df['Age'].median())
test_df['Fare'] = test_df['Fare'].fillna(train_df['Fare'].median())

X_test = test_df[features]
# The classifier returns labels in the dtype it was fit on, which may be
# float; cast so the submission holds integer 0/1 values.
batch_predictions = rf.predict(X_test).astype(int)

# Save submission file
submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": batch_predictions
})
submission.to_csv("submission.csv", index=False)
print("✅ Batch predictions saved to submission.csv")

# ---- Confusion Matrix ----
# Rebuild an 80/20 hold-out split with seed 42 and score the loaded model on
# the validation portion.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
y_pred_val = rf.predict(X_val)

cm = confusion_matrix(y_val, y_pred_val)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title("Confusion Matrix (Random Forest)")

# Write the figure to disk, then close it so it does not stay open.
cm_file = output_dir / "confusion_matrix.png"
plt.savefig(cm_file)
plt.close()

# ---- ROC Curve ----
# Column 1 of predict_proba is used as the score for the ROC computation.
y_proba = rf.predict_proba(X_val)[:, 1]
fpr, tpr, _ = roc_curve(y_val, y_proba)
roc_auc = auc(fpr, tpr)

curve_label = f"Random Forest (AUC = {roc_auc:.2f})"
plt.plot(fpr, tpr, label=curve_label)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()

roc_file = output_dir / "roc_curve.png"
plt.savefig(roc_file)
plt.close()

# ---- Survival Count ----
# Bar chart of how many training rows fall under each 'Survived' value.
survival_counts = train_df['Survived'].value_counts()
survival_counts.plot.bar()
plt.title("Survival Count")
plt.ylabel("Count")
plt.xlabel("Survived")

survival_file = output_dir / "survival_count.png"
plt.savefig(survival_file)
plt.close()

# ---- Age Distribution ----
# 20-bin histogram of passenger age. Missing ages were filled with the median
# earlier in this cell, so the bin containing the median also counts those
# filled-in rows.
train_df['Age'].hist(bins=20)
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Count")
plt.savefig(output_dir / "age_distribution.png")
plt.close()

# The check-mark emoji is restored here; it had been garbled by a text
# encoding mix-up.
print("✅ Visualizations saved in 'visualizations' folder")


In [None]:
import pandas as pd
import joblib

# Step 1: Load the trained model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that you produced yourself or otherwise trust.
rf = joblib.load("model.pkl")  # or "random_forest_model.pkl"

# Step 2: Load test.csv
test_df = pd.read_csv("test.csv")

# Step 3: Apply the same preprocessing as in training.
# The training cell encodes 'Sex' and 'Embarked' with LabelEncoder, which
# numbers classes in sorted order: female=0 / male=1 and C=0 / Q=1 / S=2.
# The previous mappings here (male=0, female=1 and S=0, C=1, Q=2) fed the
# model swapped codes and therefore produced wrong predictions.
test_df['Sex'] = test_df['Sex'].map({'female': 0, 'male': 1})
test_df['Embarked'] = test_df['Embarked'].fillna('S').map({'C': 0, 'Q': 1, 'S': 2})

# Impute with medians taken from the training data rather than from the batch
# being scored, so the fill values do not change with the test file.
# Columns are reassigned because the chained
# `test_df[col].fillna(..., inplace=True)` form is deprecated and does not
# update the frame under pandas Copy-on-Write.
train_reference = pd.read_csv("train.csv", usecols=['Age', 'Fare'])
test_df['Age'] = test_df['Age'].fillna(train_reference['Age'].median())
test_df['Fare'] = test_df['Fare'].fillna(train_reference['Fare'].median())

# Select the same features used in training, in the same order.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X_test = test_df[features]

# Step 4: Make predictions.
# The classifier returns labels in the dtype it was fit on, which may be
# float; cast so the submission holds integer 0/1 values.
y_pred = rf.predict(X_test).astype(int)
y_proba = rf.predict_proba(X_test)[:, 1]  # Probability of survival

# Step 5: Save Kaggle-ready CSV
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred
})
submission.to_csv("submission.csv", index=False)

# Step 6: Save the "fun" CSV with a readable status, message and confidence.
# The emoji and apostrophe below are restored from mis-encoded text.
fun_submission = submission.copy()
fun_submission['Survival_Status'] = fun_submission['Survived'].map({1: 'Survived ✅', 0: 'Not Survived ❌'})
fun_submission['Message'] = fun_submission['Survived'].map({
    1: "You’re a survivor! 🎉",
    0: "Better luck next time 🚢💦"
})
fun_submission['Confidence_Score'] = y_proba.round(2)

fun_submission.to_csv("submission_fun.csv", index=False)

print("✅ submission.csv (Kaggle-ready) and submission_fun.csv (creative) have been saved!")
