<a href="https://colab.research.google.com/github/jatingahlyan/Codsoft-datascience/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# --- 1. Load the Dataset ---
# Read the Titanic passenger records from a CSV expected in the working
# directory; every later step of this script depends on `df`.
try:
    df = pd.read_csv('Titanic-Dataset.csv')
except FileNotFoundError:
    print("Error: 'Titanic-Dataset.csv' not found. Please make sure the file is in the correct directory.")
    # Stop with a non-zero status so callers can detect the failure.
    # `exit()` is an interactive-shell helper injected by the `site` module
    # and is not meant to be used in programs.
    raise SystemExit(1)
else:
    # Only report success once the read has actually succeeded.
    print("Dataset loaded successfully!")



print("\n--- Data Preprocessing ---")

# Input columns the model learns from, and the label it has to predict.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'

# Work on an independent copy so the raw frame `df` is left untouched.
df_processed = df.loc[:, [*features, target]].copy()

# Fill gaps in 'Age' with the column median.
age_median = df_processed['Age'].median()
df_processed['Age'] = df_processed['Age'].fillna(age_median)

# Fill gaps in 'Embarked' with the most frequent port (the mode).
most_common_port = df_processed['Embarked'].mode()[0]
df_processed['Embarked'] = df_processed['Embarked'].fillna(most_common_port)

# The model needs numbers, not text: encode 'Sex' as 0 (male) / 1 (female).
sex_codes = {'male': 0, 'female': 1}
df_processed['Sex'] = df_processed['Sex'].map(sex_codes)

# One-hot encode 'Embarked'. With drop_first=True the first category gets no
# column of its own; the rest of this script relies on the remaining
# 'Embarked_Q' and 'Embarked_S' indicator columns.
df_processed = pd.get_dummies(df_processed, columns=['Embarked'], drop_first=True)

# Status line, preview heading and the first rows, one item per output line.
print(
    "Missing values handled and categorical features converted.",
    "Processed data preview:",
    df_processed.head(),
    sep="\n",
)


# --- 3. Define Features (X) and Target (y) ---
# y is the label we want to predict; X is every remaining column (the inputs).
y = df_processed['Survived']
# Cast all inputs to float so every feature column is numeric
# (this includes the one-hot indicator columns).
X = df_processed.drop(columns=['Survived']).astype(float)


# --- 4. Split into Training and Testing Sets ---
# Hold out 20% of the rows for testing so we can tell whether the model is
# actually learning or just memorizing. The fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\nData split into training ({len(X_train)} rows) and testing ({len(X_test)} rows) sets.")


# --- 5. Build and Train the Model ---
# Logistic Regression: a simple and effective choice for this kind of
# classification problem.
print("\n--- Model Training ---")

# max_iter=1000 gives the solver more iterations to converge.
# fit() returns the estimator itself, so construction and training on the
# training split can be chained into a single expression.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Model training complete!")


# --- 6. Evaluate the Model ---
# Measure how the trained model does on the held-out test rows.
print("\n--- Model Evaluation ---")

# Predicted labels for the test set.
y_pred = model.predict(X_test)

# Fraction of test rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# The confusion matrix shows where the model was right and where it was wrong:
# [[True Negative, False Positive], [False Negative, True Positive]]
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:", cm, sep="\n")

# Precision, recall and f1-score per class, for a more detailed picture.
report = classification_report(y_test, y_pred)
print("\nClassification Report:", report, sep="\n")


# --- Example: predict for a single made-up passenger ---
# One record using the same encoded columns the model was trained on.
passenger_record = {
    'Pclass': 3,
    'Sex': 0,  # 0 for male
    'Age': 25,
    'SibSp': 0,
    'Parch': 0,
    'Fare': 10,
    'Embarked_Q': 0,
    'Embarked_S': 1,
}
hypothetical_passenger = pd.DataFrame([passenger_record])

# Put the columns in exactly the order used for training.
hypothetical_passenger = hypothetical_passenger.loc[:, X_train.columns]

# Hard class label plus the per-class probabilities for that one row.
prediction = model.predict(hypothetical_passenger)
prediction_proba = model.predict_proba(hypothetical_passenger)

if prediction[0] == 1:
    survival_status = "Survived"
else:
    survival_status = "Did Not Survive"

print("\n--- Example Prediction ---")
print(f"Prediction for the hypothetical passenger: {survival_status}")
# Column 1 of predict_proba is the probability of class 1 (survived).
print(f"Survival Probability: {prediction_proba[0, 1]:.2f}")

Dataset loaded successfully!

--- Data Preprocessing ---
Missing values handled and categorical features converted.
Processed data preview:
   Pclass  Sex   Age  SibSp  Parch     Fare  Survived  Embarked_Q  Embarked_S
0       3    0  22.0      1      0   7.2500         0       False        True
1       1    1  38.0      1      0  71.2833         1       False       False
2       3    1  26.0      0      0   7.9250         1       False        True
3       1    1  35.0      1      0  53.1000         1       False        True
4       3    0  35.0      0      0   8.0500         0       False        True

Data split into training (712 rows) and testing (179 rows) sets.

--- Model Training ---
Model training complete!

--- Model Evaluation ---
Model Accuracy: 0.8101

Confusion Matrix:
[[90 15]
 [19 55]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

  