In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn import (
    ensemble,
    preprocessing,
    tree,
)
from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
)
from yellowbrick.classifier import (
    ConfusionMatrix,
    ROCAUC,
)
from yellowbrick.model_selection import (
    LearningCurve,
)

# Load the training data from a path relative to the notebook, and keep an
# untouched copy in orig_df: df is reassigned by the cleaning cells further
# down, while orig_df is what tweak_titanic() is later called on.
df = pd.read_csv("train.csv")
orig_df = df.copy()

# Quick look at the first rows and at the dtype pandas inferred per column.
print(df.head())
print(df.dtypes)

# Build a profiling report for the raw frame.
profile = ProfileReport(df, title="Titanic Dataset Profiling Report")
# Optional alternative: write the report to an HTML file.
# profile.to_file("titanic_report.html")
profile.to_notebook_iframe()  # embeds the rendered report in the notebook output





   survived  pclass                                             name     sex  \
0         1       1                    Allen, Miss. Elisabeth Walton  female   
1         1       1                   Allison, Master. Hudson Trevor    male   
2         0       1                     Allison, Miss. Helen Loraine  female   
3         0       1  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   
4         1       1                              Anderson, Mr. Harry    male   

     age  sibsp  parch  ticket      fare    cabin embarked  
0  29.00      0      0   24160  211.3375       B5        S  
1   0.92      1      2  113781  151.5500  C22 C26        S  
2   2.00      1      2  113781  151.5500  C22 C26        S  
3  25.00      1      2  113781  151.5500  C22 C26        S  
4  48.00      0      0   19952   26.5500      E12        S  
survived      int64
pclass        int64
name         object
sex          object
age         float64
sibsp         int64
parch         int64
ticket       

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 11/11 [00:00<00:00, 70117.54it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
# (rows, columns) of the raw training frame.
df.shape

(891, 11)

In [3]:
# Summary statistics, showing only the first two columns of the describe() table.
df.describe().iloc[:, :2]

Unnamed: 0,survived,pclass
count,891.0,891.0
mean,0.383838,2.308642
std,0.486592,0.836071
min,0.0,1.0
25%,0.0,2.0
50%,0.0,3.0
75%,1.0,3.0
max,1.0,3.0


In [4]:
# Number of missing values in each column.
df.isnull().sum()

survived      0
pclass        0
name          0
sex           0
age         177
sibsp         0
parch         0
ticket        0
fare          0
cabin       687
embarked      2
dtype: int64

In [5]:
# Fraction of missing values in each column (mean of the boolean null mask).
df.isnull().mean()

survived    0.000000
pclass      0.000000
name        0.000000
sex         0.000000
age         0.198653
sibsp       0.000000
parch       0.000000
ticket      0.000000
fare        0.000000
cabin       0.771044
embarked    0.002245
dtype: float64

In [6]:
# Number of missing values per row (axis=1), for the first ten rows only.
df.isnull().sum(axis=1).iloc[:10]

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    1
9    0
dtype: int64

In [7]:
# Boolean Series indexed like df: True where the row has at least one missing value.
mask = df.isnull().any(axis=1)

In [8]:
mask.head()

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [9]:
# Category counts for sex; dropna=False would list NaN as its own row if any were present.
df.sex.value_counts(dropna=False)

sex
male      577
female    314
Name: count, dtype: int64

In [10]:
# Category counts for embarked; dropna=False keeps the missing entries visible as a NaN row.
df.embarked.value_counts(dropna=False)

embarked
S      644
C      168
Q       77
NaN      2
Name: count, dtype: int64

In [11]:
# Peek at the free-text name column (it is dropped from df in the next cell).
name = df.name
name.head(3)

0     Allen, Miss. Elisabeth Walton
1    Allison, Master. Hudson Trevor
2      Allison, Miss. Helen Loraine
Name: name, dtype: object

In [12]:
# Drop the name, ticket and cabin columns; the result replaces df.
df = df.drop(columns=["name", "ticket", "cabin"])

In [13]:
# One-hot encode the remaining string columns (sex, embarked); the numeric
# columns pass through unchanged.
df = pd.get_dummies(df)

In [14]:
df.columns

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'sex_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [15]:
# sex has two values and no missing entries, so sex_male is the exact
# complement of sex_female; keep only one of the two indicators.
df = df.drop(columns="sex_male")

In [16]:
# NOTE(review): df was already dummy-encoded two cells earlier, so there are
# no string columns left for get_dummies to encode and this call changes
# nothing (embarked_C is still present in the column listing that follows).
# drop_first=True only has an effect on the un-encoded frame, which is how
# tweak_titanic() applies it.
df = pd.get_dummies(df, drop_first=True)

In [17]:
df.columns

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [18]:
# ---------------------------
# 1. Split data
# ---------------------------
# Target is the "survived" column; every other column of df is a feature.
y = df["survived"]
X = df.drop(columns="survived")

# Hold out 30% of the rows for testing; random_state fixes the shuffle so a
# re-run selects the same rows.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# ---------------------------
# 2. Numeric columns
# ---------------------------
# Only these columns are coerced, imputed and scaled below; the dummy
# indicator columns are left as they are.
num_cols = ["pclass", "age", "sibsp", "parch", "fare"]

# ---------------------------
# 3. Fix numeric types
# ---------------------------
# errors="coerce" turns any value that cannot be parsed as a number into NaN,
# which the imputation step then fills.
X_train[num_cols] = X_train[num_cols].apply(pd.to_numeric, errors="coerce")
X_test[num_cols] = X_test[num_cols].apply(pd.to_numeric, errors="coerce")

# ---------------------------
# 4. Imputation (IterativeImputer rather than a simple median fill)
# ---------------------------
# IterativeImputer is an experimental scikit-learn estimator: the
# enable_iterative_imputer import has to run before it can be used.
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute

imputer = impute.IterativeImputer()

# Fit on the training rows only, then reuse the fitted imputer on the test rows.
X_train[num_cols] = imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = imputer.transform(X_test[num_cols])

# ---------------------------
# 5. Scale numeric columns
# ---------------------------
from sklearn.preprocessing import StandardScaler

sca = StandardScaler()

# Same pattern as the imputer: the scaler is fitted on the training rows and
# only applied to the test rows.
X_train[num_cols] = sca.fit_transform(X_train[num_cols])
X_test[num_cols] = sca.transform(X_test[num_cols])


In [19]:
def tweak_titanic(df):
    """Return a cleaned, dummy-encoded copy of the raw Titanic frame.

    The ``name``, ``ticket`` and ``cabin`` columns are dropped and the
    remaining categorical columns are one-hot encoded with the first level
    of each omitted. The input frame is not modified.

    Returns
    -------
    (DataFrame, list of str)
        The encoded frame, and the names of all of its numeric-dtype
        columns. The list is taken from the whole frame, so a numeric
        target column such as ``survived`` is included when present.
    """
    encoded = pd.get_dummies(
        df.drop(columns=["name", "ticket", "cabin"]),
        drop_first=True,
    )
    numeric_names = list(encoded.select_dtypes(include=["number"]).columns)
    return encoded, numeric_names

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

def get_train_test_X_y(df, y_col, test_size=0.3, num_cols=None, random_state=42):
    """Split ``df`` into train/test sets, then impute and standardize numeric columns.

    The imputer and the scaler are fitted on the training rows only and are
    then applied to the test rows, so no statistics from the test split are
    used while fitting.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the target column ``y_col``.
    y_col : str
        Name of the target column.
    test_size : float, default 0.3
        Passed through to ``train_test_split``.
    num_cols : sequence of str, optional
        Columns to coerce to numeric, impute and scale. When omitted, the
        Titanic columns ``["pclass", "age", "sibsp", "parch", "fare"]`` are
        used, which is what this function previously hard-coded.
    random_state : int, default 42
        Seed passed through to ``train_test_split``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The two feature frames (with ``num_cols`` imputed and standardized)
        and the matching target Series.

    Raises
    ------
    KeyError
        If ``y_col`` or any entry of ``num_cols`` is not a column of ``df``.
    """
    if num_cols is None:
        num_cols = ["pclass", "age", "sibsp", "parch", "fare"]
    # A tuple key would be looked up as one column label, so always index with a list.
    num_cols = list(num_cols)

    # Split
    y = df[y_col]
    X = df.drop(columns=y_col)

    missing = [col for col in num_cols if col not in X.columns]
    if missing:
        raise KeyError(f"num_cols not found in feature columns: {missing}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Nothing to impute or scale: return the plain split.
    if not num_cols:
        return X_train, X_test, y_train, y_test

    # Work on explicit copies so the column assignments below write to frames
    # this function owns rather than to selections derived from the caller's frame.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Ensure numeric types; unparseable values become NaN and are imputed next.
    X_train[num_cols] = X_train[num_cols].apply(pd.to_numeric, errors="coerce")
    X_test[num_cols] = X_test[num_cols].apply(pd.to_numeric, errors="coerce")

    # Impute: fit on train, apply to test.
    imputer = IterativeImputer()
    X_train[num_cols] = imputer.fit_transform(X_train[num_cols])
    X_test[num_cols] = imputer.transform(X_test[num_cols])

    # Scale: fit on train, apply to test.
    scaler = StandardScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    return X_train, X_test, y_train, y_test


In [21]:
# Re-run the cleaning on the untouched copy of the raw data. std_cols receives
# the numeric column names of the encoded frame; because tweak_titanic selects
# them from the whole frame, the integer target column "survived" is among them.
ti_df, std_cols = tweak_titanic(orig_df)

In [22]:
# Flat list of four column names, built by splitting one comma-separated string.
std_cols = "pclass,age,sibsp,fare".split(",")

In [23]:
# Rebuild the train/test split from the tweak_titanic() frame. This rebinds
# the X_train / X_test / y_train / y_test globals that the manual
# split-impute-scale cell created earlier.
X_train, X_test, y_train, y_test = get_train_test_X_y(
    ti_df, "survived",
)

In [24]:
from sklearn.dummy import DummyClassifier

In [25]:
# Baseline model with default settings; the outputs of the following cells
# show that it predicts class 0 for every test row.
bm = DummyClassifier()

In [26]:
# Fit the baseline on the training split only.
bm.fit(X_train, y_train)

In [27]:
# Accuracy of the baseline on the held-out split. Since it always predicts
# class 0, this equals the share of class-0 rows in the test set (168 of 268).
bm.score(X_test, y_test)

0.6268656716417911

In [28]:
# Tip: on an imbalanced dataset such as Titanic, accuracy alone can hide a
# degenerate model, so always inspect the confusion matrix as well
# (rows = true class, columns = predicted class).
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, bm.predict(X_test))


array([[168,   0],
       [100,   0]])

In [29]:
from sklearn import metrics

In [30]:
# Tally the baseline's predictions per class to see which labels it ever emits.
preds = bm.predict(X_test)
print(pd.Series(preds).value_counts())


0    268
Name: count, dtype: int64


In [31]:
# The baseline never predicts the positive class, so precision is 0/0 here.
# zero_division=0 states the intended result (0.0) explicitly; the default
# setting returns the same value but emits an undefined-metric warning.
metrics.precision_score(y_test, bm.predict(X_test), zero_division=0)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.0

In [32]:
# Stack the preprocessed train and test feature frames back into one frame
# (training rows first). Presumably this is for the cross-validation helpers
# imported next; confirm against the later cells.
X = pd.concat([X_train, X_test])

In [33]:
# Stack the train and test targets in the same order (training rows first) as
# the X = pd.concat([X_train, X_test]) cell, so rows and labels stay aligned.
y = pd.concat([y_train, y_test])

In [34]:
from sklearn import model_selection

In [35]:
from sklearn.dummy import DummyClassifier

In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
from sklearn.tree import DecisionTreeClassifier

In [38]:
from sklearn.neighbors import KNeighborsClassifier

In [39]:
from sklearn.naive_bayes import GaussianNB

In [40]:
from sklearn.svm import SVC

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
import xgboost