In [3]:
from pathlib import Path
import pandas as pd
from IPython.display import display

# Locate a single CSV in ../data relative to this notebook
DATA_DIR = Path("../data")
csv_files = sorted(DATA_DIR.glob("*.csv"))
assert len(csv_files) == 1, f"Expected exactly 1 CSV in {DATA_DIR}, found {len(csv_files)}."

data_path = csv_files[0]
print("Using file:", data_path.name)

# Load it
df = pd.read_csv(data_path)

# Basic peek
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head())

# Detect if this file includes the training labels
HAS_TARGET = "Survived" in df.columns
print("HAS_TARGET (Survived present?):", HAS_TARGET)


Using file: Titanic-Dataset.csv
Shape: (891, 12)
Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


HAS_TARGET (Survived present?): True


In [4]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df.drop(columns=["Survived"])   # all columns except target
y = df["Survived"]

print("Features shape:", X.shape)
print("Target distribution:\n", y.value_counts(normalize=True))


Features shape: (891, 11)
Target distribution:
 Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [5]:
# Drop 'Cabin' if it exists (too many missing)
if "Cabin" in X.columns:
    X = X.drop(columns=["Cabin"])

# Fill Age with median
if "Age" in X.columns:
    X["Age"] = X["Age"].fillna(X["Age"].median())

# Fill Embarked with mode
if "Embarked" in X.columns:
    X["Embarked"] = X["Embarked"].fillna(X["Embarked"].mode()[0])

# One-hot encode categoricals
X = pd.get_dummies(X, drop_first=True)

print("Final feature set shape:", X.shape)
X.head()


Final feature set shape: (891, 1579)


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,"Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)","Name_Abelson, Mr. Samuel","Name_Abelson, Mrs. Samuel (Hannah Wizosky)",...,Ticket_W./C. 14258,Ticket_W./C. 14263,Ticket_W./C. 6607,Ticket_W./C. 6608,Ticket_W./C. 6609,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.25,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,2,1,38.0,1,0,71.2833,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,3,26.0,0,0,7.925,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,4,1,35.0,1,0,53.1,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,5,3,35.0,0,0,8.05,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", X_train.shape[0], "Test size:", X_test.shape[0])


Train size: 712 Test size: 179


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Test Accuracy: 0.8100558659217877

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.90      0.85       110
           1       0.81      0.67      0.73        69

    accuracy                           0.81       179
   macro avg       0.81      0.78      0.79       179
weighted avg       0.81      0.81      0.81       179



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
import joblib, pathlib

pathlib.Path("../models").mkdir(exist_ok=True)
joblib.dump(model, "../models/logreg_baseline.joblib")

print("Model saved to ../models/logreg_baseline.joblib")


Model saved to ../models/logreg_baseline.joblib
