In [5]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.51.0-py3-none-any.whl (10.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m10.2/10.2 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.51.0


In [10]:
import io

import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# -------------------------
# Page config
# -------------------------
st.set_page_config(
    page_title="Titanic Survival Predictor",
    # Ship emoji (U+1F6A2). Written as a named escape so the source stays
    # ASCII-only and the icon cannot be mangled by a wrong file encoding.
    page_icon="\N{SHIP}",
    layout="centered",
)
st.title("Titanic Survival Predictor")
st.write("This app trains a model from the public Titanic dataset (from seaborn) and lets you predict survival for a passenger.")

# -------------------------
# Data loading & preprocessing
# -------------------------
@st.cache_data
def _parse_csv_bytes(csv_bytes):
    """Parse raw CSV bytes into a DataFrame; cached per distinct file content."""
    return pd.read_csv(io.BytesIO(csv_bytes))


@st.cache_data
def _load_default_dataset():
    """Load seaborn's Titanic dataset; cached so it is fetched only once."""
    return sns.load_dataset("titanic")


def load_raw_data():
    """Return the dataset to work on: the uploaded CSV, or Titanic by default.

    The uploader widget is rendered here, outside of any cached function:
    widgets must run on every script rerun, and a cached zero-argument
    function would keep returning its first result regardless of what the
    user uploads later. Only the expensive parsing/loading steps are cached.

    Returns:
        pandas.DataFrame: the uploaded CSV parsed with ``pd.read_csv``, or
        seaborn's ``titanic`` dataset when nothing has been uploaded.
    """
    # Allow user to upload their own CSV file
    uploaded_file = st.sidebar.file_uploader("Upload your CSV dataset", type=["csv"])

    if uploaded_file is not None:
        # getvalue() returns the full content regardless of the current file
        # position, so reruns never read from an already-exhausted handle.
        df = _parse_csv_bytes(uploaded_file.getvalue())
        st.sidebar.success("Dataset uploaded successfully!")
    else:
        # Fallback to seaborn's titanic dataset if no file is uploaded
        df = _load_default_dataset()
        st.sidebar.info("No dataset uploaded. Using Titanic dataset as default.")
    return df

@st.cache_data
def preprocess_and_split(df, target_column, features, test_size=0.2, random_state=42):
    """Build the preprocessing pipeline and return transformed train/test splits.

    Numeric features are median-imputed; all other features are imputed with
    the most frequent value and one-hot encoded. The data is split first and
    the preprocessor is fit on the training rows only, so no statistics from
    the test rows leak into the transformation.

    Args:
        df: Raw input DataFrame (not modified).
        target_column: Name of the label column; cast to ``int``.
        features: Names of the columns used as model inputs.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the stratified split.

    Returns:
        ``(preprocessor, feature_names, X_train, X_test, y_train, y_test)``
        where ``preprocessor`` is the fitted ``ColumnTransformer``,
        ``feature_names`` lists the transformed column names, and the X
        values are DataFrames with those columns. If ``target_column`` is not
        in ``df``, an error is shown and six ``None`` values are returned.
    """
    # Ensure target column exists
    if target_column not in df.columns:
        st.error(f"Target column '{target_column}' not found in the dataset.")
        return None, None, None, None, None, None

    features = list(features)

    # Drop rows with a missing target, then keep only the columns we need.
    # dropna returns a new frame, so the caller's DataFrame is left untouched.
    df = df.dropna(subset=[target_column])[features + [target_column]]

    # Identify numeric and categorical features dynamically
    numeric_features = df[features].select_dtypes(include=np.number).columns.tolist()
    categorical_features = df[features].select_dtypes(exclude=np.number).columns.tolist()

    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ])

    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features)
        ],
        remainder="drop",
        # Always produce a dense array: with many one-hot columns the default
        # threshold yields a scipy sparse matrix, which cannot be wrapped in a
        # DataFrame with named columns below.
        sparse_threshold=0,
    )

    X = df[features]
    y = df[target_column].astype(int)

    # Split the raw rows first so the preprocessor only ever sees training data.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    X_train_values = preprocessor.fit_transform(X_train_raw)
    X_test_values = preprocessor.transform(X_test_raw)

    # Build feature names for the transformed array. When no categorical
    # feature was selected the "cat" pipeline is never fitted, so asking its
    # encoder for names would raise; there are simply no one-hot columns then.
    if categorical_features:
        ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
        cat_cols = list(ohe.get_feature_names_out(categorical_features))
    else:
        cat_cols = []
    feature_names = numeric_features + cat_cols

    # Keep the raw row index so X and y stay aligned label-wise as well.
    X_train = pd.DataFrame(X_train_values, columns=feature_names, index=X_train_raw.index)
    X_test = pd.DataFrame(X_test_values, columns=feature_names, index=X_test_raw.index)

    return preprocessor, feature_names, X_train, X_test, y_train, y_test

# -------------------------
# Model training (cached)
# -------------------------
@st.cache_resource
def train_model(_preprocessor, X_train, X_test, y_train, y_test, model_choice="RandomForest", random_state=42):
    """Fit the selected classifier and evaluate it on both splits.

    Args:
        _preprocessor: Not used in the body. The leading underscore is
            Streamlit's convention for excluding an argument from the cache key.
        X_train, X_test: Already-transformed feature tables.
        y_train, y_test: Matching labels.
        model_choice: ``"RandomForest"`` selects a random forest; any other
            value selects logistic regression.
        random_state: Seed passed to the estimator.

    Returns:
        ``(model, train_acc, test_acc, report)`` where ``report`` is the
        dict form of the test-set classification report.
    """
    wants_forest = model_choice == "RandomForest"
    model = (
        RandomForestClassifier(n_estimators=200, random_state=random_state)
        if wants_forest
        else LogisticRegression(max_iter=1000, random_state=random_state)
    )

    # Inputs are already preprocessed, so the estimator is fit on them directly.
    model.fit(X_train, y_train)

    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    scores = {
        "train": accuracy_score(y_train, train_predictions),
        "test": accuracy_score(y_test, test_predictions),
    }
    test_report = classification_report(y_test, test_predictions, output_dict=True)

    return model, scores["train"], scores["test"], test_report

# -------------------------
# Run data pipeline
# -------------------------
raw_df = load_raw_data()

# Sidebar for feature selection
st.sidebar.header("Data Columns")
all_columns = raw_df.columns.tolist()
default_target_index = all_columns.index('survived') if 'survived' in all_columns else 0
target_column = st.sidebar.selectbox("Select Target Column", all_columns, index=default_target_index)

# The target is never offered as a feature: selecting it would duplicate the
# column in the training frame and leak the label into the model inputs.
feature_options = [col for col in all_columns if col != target_column]
default_features = [
    col
    for col in ['class', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
    if col in feature_options
]
feature_columns = st.sidebar.multiselect("Select Feature Columns", feature_options, default=default_features)

if not feature_columns:
    st.warning("Please select at least one feature column.")
    st.stop()

preprocessor, feature_names, X_train, X_test, y_train, y_test = preprocess_and_split(raw_df, target_column, feature_columns)

if preprocessor is None: # Handle case where target column not found
    st.stop()

# Sidebar controls
st.sidebar.header("Training & Model Options")
model_choice = st.sidebar.selectbox("Model", ["RandomForest", "LogisticRegression"])
retrain = st.sidebar.button("Retrain model (force)")

# Train model (cached unless retrain pressed).
# A forced retrain drops the cached entries through the public clear() API and
# then goes through the normal cached call, so the freshly trained model is the
# one stored and reused on later reruns (e.g. when "Predict" is clicked).
if retrain:
    train_model.clear()

model, train_acc, test_acc, report = train_model(preprocessor, X_train, X_test, y_train, y_test, model_choice)

# -------------------------
# Show training summary
# -------------------------
st.subheader("Model training summary")
st.write(f"Model type: **{model_choice}**")
st.write(f"Training accuracy: **{train_acc:.3f}**")
st.write(f"Test accuracy: **{test_acc:.3f}**")

st.write("Classification report (test set):")
report_df = pd.DataFrame(report).transpose()
# Small cleanup before display
# NOTE(review): after the transpose the "macro avg" entry appears to be a row
# label, not a column, so this column filter looks like a no-op. Confirm
# whether the row was meant to be dropped before changing what is displayed.
report_df = report_df.drop(columns=[col for col in report_df.columns if isinstance(col, str) and col.startswith("macro")], errors="ignore")
st.dataframe(report_df.style.format("{:.3f}"))

# Optional: confusion matrix
st.write("Confusion matrix (test set):")
# Take the label set from the fitted model so the table has one row/column per
# class; hard-coded 2x2 labels raise a shape error for non-binary targets.
class_labels = list(model.classes_)
cm = confusion_matrix(y_test, model.predict(X_test), labels=class_labels)
if class_labels == [0, 1]:
    # Keep the established labels for the default binary survival target.
    row_labels = ["true_neg(0)", "true_pos(1)"]
    col_labels = ["pred_neg(0)", "pred_pos(1)"]
else:
    row_labels = [f"true({label})" for label in class_labels]
    col_labels = [f"pred({label})" for label in class_labels]
cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)
st.table(cm_df)

st.markdown("---")

# -------------------------
# Prediction UI
# -------------------------
st.subheader("Make a prediction")
st.write("Enter passenger information and press Predict.")

col1, col2 = st.columns(2)

# Dynamically generate input fields based on selected features
input_data = {}
numeric_features_raw = raw_df[feature_columns].select_dtypes(include=np.number).columns.tolist()
categorical_features_raw = raw_df[feature_columns].select_dtypes(exclude=np.number).columns.tolist()

for i, feature in enumerate(feature_columns):
    label = feature.replace('_', ' ').title()
    # Explicit keys keep widgets distinct even if two column names produce
    # the same display label.
    widget_key = f"input_{feature}"
    # Alternate inputs between the two columns.
    with (col1 if i % 2 == 0 else col2):
        if feature in numeric_features_raw:
            column = raw_df[feature]
            observed_min, observed_max, observed_median = column.min(), column.max(), column.median()
            # Fall back to fixed bounds when the column holds no usable values.
            min_val = observed_min if pd.notna(observed_min) else 0.0
            max_val = observed_max if pd.notna(observed_max) else 1000.0
            default_val = observed_median if pd.notna(observed_median) else 0.0
            input_data[feature] = st.number_input(
                label,
                min_value=float(min_val),
                max_value=float(max_val),
                value=float(default_val),
                step=1.0,
                key=widget_key,
            )
        elif feature in categorical_features_raw:
            # Keep the raw category values (filtering out NaN) and only
            # stringify them for display. Casting the value itself to str
            # would make non-string categories (e.g. booleans) unknown to the
            # fitted encoder, which then silently encodes them as all zeros.
            options = [o for o in raw_df[feature].unique().tolist() if pd.notna(o)]
            if options:
                input_data[feature] = st.selectbox(label, options, format_func=str, key=widget_key)
            else:
                input_data[feature] = st.text_input(label, value="", key=widget_key) # Fallback for empty categorical


# Convert the user inputs into a DataFrame matching the preprocessor's expected raw columns
raw_input = pd.DataFrame([input_data])

if st.button("Predict"):
    # Apply the same preprocessor that was fit on the original data
    X_input_transformed = preprocessor.transform(raw_input)
    if hasattr(X_input_transformed, "toarray"):
        # The transformer may return a scipy sparse matrix; densify it so it
        # can be wrapped in a DataFrame with named columns.
        X_input_transformed = X_input_transformed.toarray()
    X_input_df = pd.DataFrame(X_input_transformed, columns=feature_names)

    pred = model.predict(X_input_df)[0]
    pred_proba = None
    if hasattr(model, "predict_proba"):
        # Probability of the predicted class. predict_proba columns follow
        # model.classes_, so look the position up there instead of assuming
        # the classes are exactly [0, 1].
        pred_proba_array = model.predict_proba(X_input_df)[0]
        pred_proba = pred_proba_array[list(model.classes_).index(pred)]

    proba_suffix = f" (prob ~ {pred_proba:.2f})" if pred_proba is not None else ""
    if pred == 1:
        st.success(f"Prediction: Passenger likely SURVIVED.{proba_suffix}")
    else:
        st.error(f"Prediction: Passenger likely DID NOT SURVIVE.{proba_suffix}")

st.markdown("---")
st.caption("Model trained on provided or seaborn's Titanic dataset. This is for demo and educational use.")

2025-11-07 02:28:00.259 No runtime found, using MemoryCacheStorageManager
2025-11-07 02:28:00.262 No runtime found, using MemoryCacheStorageManager
2025-11-07 02:28:00.265 No runtime found, using MemoryCacheStorageManager
2025-11-07 02:28:00.314 No runtime found, using MemoryCacheStorageManager


DeltaGenerator()