<a href="https://colab.research.google.com/github/josapton/capstone-data-analysis/blob/main/capstone_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# setup environment

import libraries and set display options

In [None]:
# Core data libs
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix
)

# Notebook-wide presentation defaults.
# set_theme() is the preferred spelling of sns.set(); same arguments, same effect.
sns.set_theme(style="whitegrid")
# None lifts the column cap so wide frames are rendered in full.
pd.options.display.max_columns = None

# load and inspect the dataset

For simplicity, we’ll analyze the Titanic dataset via seaborn’s built-in loader.

In [None]:
# Fetch seaborn's bundled "titanic" example dataset as a DataFrame.
# NOTE(review): load_dataset downloads from seaborn's online data repository
# unless a cached copy exists — confirm network access in the target environment.
df = sns.load_dataset(name="titanic")

# Preview the first five rows (head() defaults to n=5).
df.head()

check dimensions

In [None]:
# (rows, columns) of the loaded frame; rendered as the cell's output.
df.shape

summarize

In [None]:
# A cell renders only its final expression, so the describe() table is passed
# to display() explicitly; as a bare statement its result would be computed
# and then silently discarded.
display(df.describe(include="all"))

# info() prints column dtypes and non-null counts to stdout.
df.info()

# Missing values per column (rendered as the cell's output).
df.isnull().sum()

# exploratory data analysis (EDA)

distribution of key variables

In [None]:
# Two side-by-side count plots: passenger counts by sex, and survival
# outcome counts within each passenger class.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

sns.countplot(x="sex", data=df, ax=axes[0])
axes[0].set_title("Passenger Count by Sex")

sns.countplot(x="pclass", hue="survived", data=df, ax=axes[1])
axes[1].set_title("Survival Count by Passenger Class")

# Keep the two panels' labels and titles from overlapping.
fig.tight_layout()
plt.show()

age distribution and survival rate

In [None]:
# Histogram (with KDE overlay) of the known ages; missing ages are dropped
# before plotting.
fig_hist, ax_hist = plt.subplots(figsize=(8, 4))
sns.histplot(df["age"].dropna(), kde=True, bins=30, ax=ax_hist)
ax_hist.set_title("Age Distribution")
plt.show()

# Age spread for each survival outcome, drawn on a fresh default-sized figure.
fig_box, ax_box = plt.subplots()
sns.boxplot(x="survived", y="age", data=df, ax=ax_box)
ax_box.set_title("Age vs Survival")
plt.show()

correlation heatmap (numerical features)

In [None]:
# Pairwise correlations between the numerical columns.
corr_cols = ["survived", "age", "fare", "sibsp", "parch"]
corr = df[corr_cols].corr()

# Correlations live in [-1, 1] and their sign matters, so use a diverging
# colormap centred on 0 with a fixed range instead of the default sequential
# map scaled to the data; annotate with two decimals for readability.
ax = sns.heatmap(
    corr, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1, center=0
)
ax.set_title("Correlation of Numerical Features")
plt.show()

# data cleaning and preprocessing

select relevant features

In [None]:
# Target first, then the predictors kept for modeling. copy() detaches the
# selection from df so later edits to `data` do not touch the raw frame.
target_col = "survived"
feature_cols = ["pclass", "sex", "age", "sibsp", "parch", "fare", "embarked"]
data = df[[target_col, *feature_cols]].copy()

handle missing values

In [None]:
# Fill gaps with a per-column statistic: median for the numeric age,
# most frequent value for the categorical port of embarkation.
# NOTE(review): both statistics are computed on the full frame, before the
# train/test split, so held-out rows influence the imputed values — confirm
# this is acceptable.
age_median = data["age"].median()
embarked_mode = data["embarked"].mode().iloc[0]

data = data.assign(
    age=data["age"].fillna(age_median),
    embarked=data["embarked"].fillna(embarked_mode),
)

encode categorical variables

In [None]:
# One-hot encode the categorical columns; drop_first removes one level per
# column so the indicator columns are not perfectly collinear.
categorical_cols = ["sex", "embarked"]
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

split into features/target

In [None]:
# Target vector: the survival flag.
y = data["survived"]
# Feature matrix: every remaining column.
X = data.drop(columns="survived")

train/test split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# feature scaling and model training

scale numeric features

In [None]:
# Standardize the continuous/count columns only; the remaining columns are
# left untouched.
num_cols = ["age", "sibsp", "parch", "fare"]

# Learn the mean and standard deviation from the training rows alone, then
# apply the same transform to both splits.
scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

train logistic regression

In [None]:
# fit() returns the estimator itself, so construction and training can be
# chained; max_iter=200 caps the solver's iterations.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Show the fitted estimator as the cell's output.
model

# model evaluation

make predictions

In [None]:
# Predicted class labels for the held-out rows. X_test has already been
# scaled with the scaler fitted on the training rows.
y_pred = model.predict(X_test)

metrics and reports

In [None]:
# Overall fraction of correct predictions on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision, recall, F1 and support.
report_text = classification_report(y_test, y_pred)
print(report_text)

confusion matrix visualization

In [None]:
# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)

# fmt="d" renders the cell counts as integers. heatmap() returns the Axes it
# drew on, so the labels and title are set on it directly.
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
plt.show()