In [30]:
# train.py
import os, joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [5]:
import pandas as pd

In [19]:
df = pd.read_csv("smoke/data/train.csv")
df

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159251,159251,40,155,45,69.0,1.5,2.0,1,1,127,...,72,159,14.5,1,0.8,25,26,13,0,0
159252,159252,50,155,75,82.0,1.0,1.0,1,1,120,...,64,108,14.5,1,0.6,21,20,18,0,0
159253,159253,40,160,50,66.0,1.5,1.0,1,1,114,...,87,93,10.9,1,0.6,15,9,12,0,0
159254,159254,50,165,75,92.0,1.2,1.0,1,1,121,...,55,80,14.4,1,1.1,22,17,37,0,1


In [23]:
df.shape

(159256, 24)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159256 entries, 0 to 159255
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   159256 non-null  int64  
 1   age                  159256 non-null  int64  
 2   height(cm)           159256 non-null  int64  
 3   weight(kg)           159256 non-null  int64  
 4   waist(cm)            159256 non-null  float64
 5   eyesight(left)       159256 non-null  float64
 6   eyesight(right)      159256 non-null  float64
 7   hearing(left)        159256 non-null  int64  
 8   hearing(right)       159256 non-null  int64  
 9   systolic             159256 non-null  int64  
 10  relaxation           159256 non-null  int64  
 11  fasting blood sugar  159256 non-null  int64  
 12  Cholesterol          159256 non-null  int64  
 13  triglyceride         159256 non-null  int64  
 14  HDL                  159256 non-null  int64  
 15  LDL              

In [21]:
y = df["smoking"]
X = df.drop(columns=["smoking"])

In [26]:
X

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,300,40,75,16.5,1,1.0,22,25,27,0
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,55,57,126,16.2,1,1.1,27,23,37,1
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,197,45,93,17.4,1,0.8,27,31,53,0
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,203,38,102,15.9,1,1.0,20,27,30,1
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,87,44,93,15.4,1,0.8,19,13,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159251,159251,40,155,45,69.0,1.5,2.0,1,1,127,...,47,72,159,14.5,1,0.8,25,26,13,0
159252,159252,50,155,75,82.0,1.0,1.0,1,1,120,...,202,64,108,14.5,1,0.6,21,20,18,0
159253,159253,40,160,50,66.0,1.5,1.0,1,1,114,...,45,87,93,10.9,1,0.6,15,9,12,0
159254,159254,50,165,75,92.0,1.2,1.0,1,1,121,...,148,55,80,14.4,1,1.1,22,17,37,0


In [27]:
# 文字列ラベルなら 0/1 に変換
if y.dtype == bool:
    y = y.astype(int)
elif y.dtype.kind not in "iu":  # 整数/ブールじゃないとき
    classes = y.dropna().unique().tolist()
    assert len(classes) == 2, f"ラベルが2値じゃない: {classes}"
    y = y.map({classes[0]: 0, classes[1]: 1}).astype(int)


In [28]:
# 3. 数値列とカテゴリ列に分ける
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

In [31]:
# 4. 前処理パイプライン
pre = ColumnTransformer([
    ("num", Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler())
    ]), num_cols),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols),
])

In [32]:
# 5. モデル
clf = LogisticRegression(max_iter=1000, class_weight="balanced")
pipe = Pipeline([("pre", pre), ("clf", clf)])

In [34]:
# 6. train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

In [35]:
# 7. 学習
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('pre', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [36]:
# 8. 評価
proba = pipe.predict_proba(X_test)[:, 1]
pred  = pipe.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))
print("AUC     :", roc_auc_score(y_test, proba))


Accuracy: 0.749748838377496
AUC     : 0.8296042484227719


In [37]:
# 9. 保存
os.makedirs("models", exist_ok=True)
joblib.dump(pipe, "models/model.joblib")
print("モデルを保存しました -> models/model.joblib")

モデルを保存しました -> models/model.joblib
