In [1]:
from google.colab import files, drive
# Open Colab's upload dialog. The chosen file is saved into the current working
# directory, where the credential-setup cell expects to find it as "kaggle.json".
uploaded = files.upload()  # choose kaggle.json when prompted

Saving kaggle.json to kaggle.json


In [2]:
import os, shutil, json, pathlib
pathlib.Path("~/.kaggle").expanduser().mkdir(exist_ok=True)
shutil.move("kaggle.json", os.path.expanduser("~/.kaggle/kaggle.json"))
os.chmod(os.path.expanduser("~/.kaggle/kaggle.json"), 0o600)

!pip -q install kaggle

In [3]:
# Download the "titanic" competition archive into /content (saved as titanic.zip).
!kaggle competitions download -c titanic -p /content
# Extract into /content/titanic; -o overwrites existing files without prompting,
# so the cell can be re-run.
!unzip -o /content/titanic.zip -d /content/titanic
# List the extracted files as a quick sanity check.
!ls -l /content/titanic

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 84.3MB/s]
Archive:  /content/titanic.zip
  inflating: /content/titanic/gender_submission.csv  
  inflating: /content/titanic/test.csv  
  inflating: /content/titanic/train.csv  
total 92
-rw-r--r-- 1 root root  3258 Dec 11  2019 gender_submission.csv
-rw-r--r-- 1 root root 28629 Dec 11  2019 test.csv
-rw-r--r-- 1 root root 61194 Dec 11  2019 train.csv


In [12]:
import pandas as pd

# Read the two competition CSVs that were extracted under /content/titanic.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/titanic/train.csv", "/content/titanic/test.csv")
)

# Show (rows, columns) of each frame, then preview the first training rows.
print(train.shape, test.shape)
train.head()

(891, 12) (418, 11)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [15]:
# Raw input columns for the model; Name, Ticket and Cabin are kept because the
# feature-engineering step derives new columns from them.
base_cols = [
    "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked",
    "Name", "Ticket", "Cabin",
]
y = train["Survived"].astype(int)
x = train.loc[:, base_cols].copy()

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# num_cols = ["Age", "Fare"]
# cat_cols = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]

# numeric = Pipeline([
#     ("imputer", SimpleImputer(strategy="median")),
#     ("scaler", StandardScaler())  # LR benefits a bit from scaling
# ])

# categorical = Pipeline([
#     ("imputer", SimpleImputer(strategy="most_frequent")),
#     ("ohe", OneHotEncoder(handle_unknown="ignore"))
# ])

# prep = ColumnTransformer([
#     ("num", numeric, num_cols),
#     ("cat", categorical, cat_cols)
# ])

# clf = Pipeline([
#     ("prep", prep),
#     ("lr", LogisticRegression(max_iter=500))
# ])

# clf.fit(x_train, y_train)
# pred = clf.predict(x_test)
# acc = accuracy_score(y_test, pred)
# print(f"validation accuracy: {acc:.4f}")

In [7]:
!pip install xgboost



In [16]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with engineered Titanic features added.

    Reads the ``Name``, ``SibSp``, ``Parch``, ``Ticket`` and ``Cabin`` columns
    and adds:

    * ``Title`` - honorific parsed from ``Name`` (the text between the comma
      and the first period), with rare titles collapsed via ``rare_map``.
    * ``FamilySize`` - ``SibSp + Parch + 1``; missing counts are treated as 0.
    * ``IsAlone`` - 1 when ``FamilySize`` is 1, otherwise 0.
    * ``TicketGroup`` - number of rows in ``df`` that share the same ``Ticket``.
    * ``CabinKnown`` - 1 when ``Cabin`` is present, otherwise 0.
    * ``Deck`` - first character of ``Cabin``; missing cabins stay missing.

    The input frame is not modified.
    """
    df = df.copy()

    # expand=False makes extract() return a Series (one capture group) instead
    # of a one-column DataFrame.
    titles = df["Name"].str.extract(r",\s*([^\.]+)\.", expand=False).str.strip()
    # "Rothes, the Countess. of ..." parses as "the Countess"; drop the leading
    # article so the title matches the "Countess" key below instead of becoming
    # its own one-off category.
    titles = titles.str.replace(r"^the\s+", "", regex=True)
    rare_map = {
        "Mlle":"Miss", "Ms":"Miss", "Mme":"Mrs",
        "Lady":"Royalty","Countess":"Royalty","Sir":"Royalty","Jonkheer":"Royalty",
        "Don":"Royalty","Dona":"Royalty",
        "Capt":"Officer","Col":"Officer","Major":"Officer","Dr":"Officer","Rev":"Officer"
    }
    df["Title"] = titles.replace(rare_map)

    df["FamilySize"] = df["SibSp"].fillna(0) + df["Parch"].fillna(0) + 1
    df["IsAlone"]   = (df["FamilySize"] == 1).astype(int)

    # Counted within the frame that is passed in, so the value depends on which
    # rows are present in this particular call.
    df["TicketGroup"] = df.groupby("Ticket", dropna=False)["Ticket"].transform("count")

    df["CabinKnown"] = df["Cabin"].notna().astype(int)
    df["Deck"] = df["Cabin"].str[0]   # e.g., 'C85' -> 'C'; NaN stays NaN

    return df

# Wrap add_features as a scikit-learn transformer so it can serve as the first
# step of a Pipeline. validate=False skips scikit-learn's input check, so
# add_features receives the DataFrame exactly as passed.
fe = FunctionTransformer(add_features, validate=False)

In [18]:
# Columns the model consumes once the engineered features have been added.
num_cols = [
    "Age", "SibSp", "Parch", "Fare", "Pclass",
    "FamilySize", "TicketGroup", "IsAlone", "CabinKnown",
]
cat_cols = ["Sex", "Embarked", "Title", "Deck"]

# Numeric branch: missing values are replaced by the column median.
numeric_tf = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Categorical branch: missing values take the most frequent category, then the
# column is one-hot encoded; categories unseen during fit are ignored.
categorical_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=True, handle_unknown="ignore")),
])

# Send each column group through its branch. Every other column (the raw
# Name/Ticket/Cabin text, for example) is dropped.
prep = ColumnTransformer(
    [("num", numeric_tf, num_cols), ("cat", categorical_tf, cat_cols)],
    remainder="drop",
)

In [20]:
from xgboost import XGBClassifier

# End-to-end model: feature engineering -> imputation/encoding -> boosted trees.
xgb = Pipeline(steps=[
    ("fe", fe),
    ("prep", prep),
    ("xgb", XGBClassifier(
        eval_metric="logloss",
        n_jobs=-1,
        random_state=42,
        reg_lambda=1.0,
        colsample_bytree=0.8,  # fraction of columns sampled per tree
        subsample=0.9,         # fraction of rows sampled per tree
        max_depth=4,           # maximum depth of each tree
        learning_rate=0.05,
        n_estimators=800,      # number of boosting rounds
    )),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
pred = xgb.fit(x_train, y_train).predict(x_test)
acc = accuracy_score(y_test, pred)
print(f"validation accuracy: {acc:.4f}")

validation accuracy: 0.7933


In [21]:
# Score the Kaggle test set using the same raw columns the pipeline was fitted on.
X_test = test.loc[:, base_cols].copy()
pred = xgb.predict(X_test).astype(int)

# Submission frame with two columns: PassengerId and the predicted Survived.
sub = test[["PassengerId"]].assign(Survived=pred)

out_path = "/content/submission.csv"
sub.to_csv(out_path, index=False)

# Sanity checks: expect shape (418, 2) and predictions drawn from [0, 1].
print("Submission shape:", sub.shape)
print("Unique predictions:", sorted(sub["Survived"].unique().tolist()))
print(sub.head())

# Trigger a browser download of the CSV from the Colab runtime.
files.download(out_path)

Submission shape: (418, 2)
Unique predictions: [0, 1]
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>