In [None]:
import json
import joblib 
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [9]:
# 1) Load the raw columns of the source CSV.
#    "fare" is discarded right away, so it is never available as a feature.
train_raw = (
    pd.read_csv("titanic_train.csv")
    .drop(columns=["fare"])
)
# Preview the first five rows (head() defaults to n=5).
train_raw.head()

Unnamed: 0,id,survived,pclass,sex,age,sibsp,parch,embarked
0,3,1,1,female,35.0,1,0,S
1,4,0,3,male,35.0,0,0,S
2,7,0,3,male,2.0,3,1,S
3,9,1,2,female,14.0,1,0,C
4,11,1,1,female,58.0,0,0,S


In [10]:
# 2) Target variable and the columns used for training.
target = "survived"
# Expected Kaggle Titanic features: pclass, sex, age, sibsp, parch, embarked
# ("fare" is dropped when the CSV is loaded).
# Identifier-like columns carry no predictive signal, so exclude them here.
_id_names = {"id", "passengerid"}
id_like_cols = [col for col in train_raw.columns if col.lower() in _id_names]
_excluded = set(id_like_cols) | {target}
# Every remaining column, in original order, is treated as a feature.
feature_cols = [col for col in train_raw.columns if col not in _excluded]

In [11]:
# 3) Split the feature columns into numeric / categorical groups.
#    Classify by "is the dtype numeric?" rather than `dtype == "object"`:
#    text columns stored as `string`, `category`, or the pandas 3 default
#    `str` dtype are not equal to "object" and would otherwise be routed into
#    the numeric pipeline, where mean imputation and PowerTransformer fail.
num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(train_raw[c])]
# Everything that is not numeric is handled as categorical.
cat_cols = [c for c in feature_cols if c not in num_cols]

In [12]:
# 4) Preprocessing: reproduce the policy used in the original notebook.
#    Categorical: impute with the most frequent value -> one-hot encode
#                 (categories unseen during fit are ignored at transform time).
#    Numeric:     impute with the mean -> PowerTransformer (Yeo-Johnson,
#                 standardize=True) for scaling.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    # "yeo-johnson" is the library default; spelled out to match the stated policy.
    ("power", PowerTransformer(method="yeo-johnson", standardize=True)),
])

# Route each column group through its own pipeline; columns listed in neither
# group are dropped.
preprocess = ColumnTransformer(
    [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)],
    remainder="drop",
)


In [13]:
# 5) Classifier: SVC with a linear kernel. Tune class_weight or C if needed.
#    probability=True is requested so the fitted model can output probabilities.
clf = SVC(kernel="linear", probability=True, random_state=42)

# Full model = preprocessing followed by the classifier, so raw columns can be
# passed straight in at both fit and predict time.
pipe = Pipeline(steps=[("prep", preprocess), ("clf", clf)])


In [14]:
# 6) Training with a simple hold-out split.
X, y = train_raw[feature_cols], train_raw[target]
# 80/20 split, stratified on the target, with a fixed seed for reproducibility.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
pipe.fit(X_tr, y_tr)

# Score on both partitions to compare fit quality against held-out performance.
train_score = pipe.score(X_tr, y_tr)
test_score = pipe.score(X_te, y_te)
print("Train score:", train_score)
print("Test  score:", test_score)

Train score: 0.797752808988764
Test  score: 0.7640449438202247


In [15]:
# 7) Persist the fitted pipeline (preprocessing included) as a single artifact.
joblib.dump(value=pipe, filename="model.pkl")
print("Saved model.pkl")


Saved model.pkl


In [16]:
# 8) Save metadata alongside the model (for future compatibility and observability).
meta = {
    # Library version the model was trained with.
    "sklearn_version": sklearn.__version__,
    "task": "classification",
    # Original "raw" column names expected as input.
    "feature_cols": feature_cols,
    "num_cols": num_cols,
    "cat_cols": cat_cols,
    "target": target,
    "estimator": "SVC(kernel='linear', probability=True)",
}
# Serialize first, then write; ensure_ascii=False keeps non-ASCII text readable.
meta_json = json.dumps(meta, ensure_ascii=False, indent=2)
with open("model_meta.json", mode="w", encoding="utf-8") as meta_file:
    meta_file.write(meta_json)
print("Saved model_meta.json")
print(sklearn.__version__)

Saved model_meta.json
1.7.2
