In [None]:
pip install scikit-learn

In [10]:

import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [11]:
# Load the Titanic CSV into `df` and print a first look at it.
#
# The file location can be overridden with the TITANIC_CSV environment
# variable; otherwise it falls back to the path this notebook was originally
# written against, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    'TITANIC_CSV',
    'C:/Users/Akash/OneDrive - Erin.N.Nagarvala Day School/Desktop/jupyter notebook/Titanic-Dataset.csv',
)

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() asks IPython to
    # shut the kernel down, which discards all state and hides the real cause.
    # Raising stops "Run All" at this cell with a readable traceback.
    raise FileNotFoundError(
        f"Error: 'Titanic-Dataset.csv' not found at {DATA_PATH!r}. "
        "Please ensure the file is uploaded."
    ) from err

print("Dataset loaded successfully!")

print("\n--- Original Dataset Head ---")
print(df.head())
print("\n--- Dataset Info ---")
df.info()
print("\n--- Missing Values Before Preprocessing ---")
print(df.isnull().sum())

Dataset loaded successfully!

--- Original Dataset Head ---
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123       

In [12]:
# Remove the columns that are not kept as model inputs.
# errors='ignore' makes this cell safe to re-run: without it a second execution
# raises KeyError because the columns have already been dropped from `df`.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [16]:
# Replace missing ages with the column median.
# NOTE(review): the median is taken over all rows, i.e. before the train/test
# split that happens later in this notebook.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

In [17]:
# Report how many nulls remain in each column at this point.
remaining_nulls = df.isna().sum()
print("\n--- Missing Values After Preprocessing ---")
print(remaining_nulls)


--- Missing Values After Preprocessing ---
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64


In [18]:
# Encode the two categorical feature columns as integer codes.
#
# 'Embarked' still contains missing values when it reaches this cell.
# Depending on the scikit-learn version, LabelEncoder either fails on them or
# silently turns NaN into an extra class of its own, so fill them with the
# most frequent value before encoding.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# The fitted encoders are kept so the original labels can be recovered
# later through their `classes_` / `inverse_transform`.
le_sex = LabelEncoder()
df['Sex'] = le_sex.fit_transform(df['Sex'])

le_embarked = LabelEncoder()
df['Embarked'] = le_embarked.fit_transform(df['Embarked'])


In [19]:
# Show the first five rows of the encoded frame, followed by its column summary.
print("\n--- Preprocessed Dataset Head ---")
print(df.head(n=5))

print("\n--- Preprocessed Dataset Info ---")
df.info()


--- Preprocessed Dataset Head ---
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    1  22.0      1      0   7.2500         2
1         1       1    0  38.0      1      0  71.2833         0
2         1       3    0  26.0      0      0   7.9250         2
3         1       1    0  35.0      1      0  53.1000         2
4         0       3    1  35.0      0      0   8.0500         2

--- Preprocessed Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


In [20]:
# Separate the target column from the remaining feature columns.
y = df['Survived']
X = df.drop(columns=['Survived'])

In [21]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition.
print(f"\nTraining features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Testing target shape: {y_test.shape}")


Training features shape: (712, 7)
Testing features shape: (179, 7)
Training target shape: (712,)
Testing target shape: (179,)


In [22]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions so the test rows never influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# Fit a logistic-regression classifier on the scaled training features.
# 'liblinear' solver is good for small datasets
model = LogisticRegression(solver='liblinear', random_state=42).fit(X_train_scaled, y_train)

print("\n--- Model Training Complete ---")

# Predict on the held-out rows and compare against the true labels.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))


--- Model Training Complete ---

Model Accuracy: 0.8101

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [25]:
# Score one hand-written passenger with the fitted scaler and model.
# The columns are listed in the same order as the training features.
# Categorical values are given as integer codes; per the encoded output shown
# earlier in this notebook, Sex 0 corresponds to 'female' and Embarked 2 to 'S'.
new_passenger_data = pd.DataFrame({
    'Pclass': [1],
    'Sex': [0],
    'Age': [30],
    'SibSp': [0],
    'Parch': [0],
    'Fare': [70],
    'Embarked': [2],
})

# Apply the already-fitted scaler; it must not be re-fitted on this single row.
new_passenger_scaled = scaler.transform(new_passenger_data)
prediction = model.predict(new_passenger_scaled)
prediction_proba = model.predict_proba(new_passenger_scaled)  # Probability of each class

print("\n--- Example Prediction for a New Passenger ---")
print(f"New Passenger Data:\n{new_passenger_data}")
print(f"Predicted Survival (0=No, 1=Yes): {prediction[0]}")
print(f"Probability of Not Surviving (0): {prediction_proba[0, 0]:.4f}")
print(f"Probability of Surviving (1): {prediction_proba[0, 1]:.4f}")

if prediction[0] != 1:
    print("The model predicts this passenger **did not survive**.")
else:
    print("The model predicts this passenger **survived**.")



--- Example Prediction for a New Passenger ---
New Passenger Data:
   Pclass  Sex  Age  SibSp  Parch  Fare  Embarked
0       1    0   30      0      0    70         2
Predicted Survival (0=No, 1=Yes): 1
Probability of Not Surviving (0): 0.0727
Probability of Surviving (1): 0.9273
The model predicts this passenger **survived**.
