## このファイルのコードに限り、Google Colabで動かすことを前提としています。

In [None]:
# ======================== RF 5-fold CV + 最終モデル保存 完全版 ========================
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, confusion_matrix, 
    precision_recall_fscore_support, classification_report
)
import pandas as pd
import numpy as np
from google.colab import files
from joblib import dump, load
import json
import sklearn, sys

# --- File upload ---
# files.upload() shows Colab's upload widget and returns a dict keyed by the
# names of the uploaded files; the dict is empty when nothing was uploaded.
uploaded = files.upload()
if not uploaded:
    # Stop here with a clear message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Only the first uploaded file is used; any further files are ignored.
file_name = next(iter(uploaded))
# Reads by name, i.e. relies on Colab having saved the upload under this
# name in the current working directory.
data = pd.read_csv(file_name)

# --- データ整形 ---
def prepare_stroke_data(df):
    """Return an independent copy of the columns used by this script.

    Args:
        df: DataFrame that contains at least the label column
            ``now_sheet`` and the five feature columns listed below.

    Returns:
        A new DataFrame holding only those six columns, in the order
        listed; modifying it does not affect ``df``.

    Raises:
        KeyError: If any of the required columns is missing from ``df``.
    """
    wanted_columns = [
        'now_sheet',
        'peak_count',
        'interval_1',
        'interval_1_5',
        'interval_2',
        'interval_2_5',
    ]
    selected = df[wanted_columns]
    return selected.copy()

# Label values (from the 'now_sheet' column) that are kept for training.
labels_sorted = [0, 10, 15, 20, 25]

# Pick the relevant columns, drop every row whose label is not listed above,
# and renumber the remaining rows from 0.
features = (
    prepare_stroke_data(data)
    .loc[lambda frame: frame['now_sheet'].isin(labels_sorted)]
    .reset_index(drop=True)
)

# Split into the feature matrix X and the target vector y.
X_cols = ['peak_count', 'interval_1', 'interval_1_5', 'interval_2', 'interval_2_5']
y = features['now_sheet']
X = features[X_cols]

# Quick sanity report: shapes and number of rows per label.
print(f"Data: X={X.shape}, y={y.shape}")
print("Label counts:\n", y.value_counts().sort_index())

# --- Cross-validation setup ---
# Stratified 5-fold split, shuffled with a fixed seed so the folds are reproducible.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- Containers for the per-fold results ---
all_confusion_matrices = []
all_accuracies = []
all_precisions = []
all_recalls = []
all_f1s = []

# --- Train and evaluate on each fold (fold numbers start at 1) ---
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_test = y.iloc[test_idx]

    # A fresh forest for every fold, always with the same hyperparameters and seed.
    model = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Weighted-average precision/recall/F1; zero_division=0 scores undefined cases as 0.
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='weighted', zero_division=0
    )
    acc = accuracy_score(y_test, y_pred)

    # Confusion matrix with rows and columns in labels_sorted order.
    cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
    # Divide each row by its total; rows whose total is zero stay all-zero
    # instead of producing a division-by-zero result.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(
        cm,
        row_totals,
        out=np.zeros_like(cm, dtype=float),
        where=(row_totals != 0),
    )

    all_confusion_matrices.append(cm)
    all_accuracies.append(acc)
    all_precisions.append(prec)
    all_recalls.append(rec)
    all_f1s.append(f1)

    # Labelled views of both matrices, used for the CSV export and the printout.
    cm_df = pd.DataFrame(cm, index=labels_sorted, columns=labels_sorted)
    cm_norm_df = pd.DataFrame(cm_norm, index=labels_sorted, columns=labels_sorted)
    cm_norm_df.to_csv(f"recall_matrix_fold_{fold}.csv")

    print(f"\n【Fold {fold}】")
    print(f"Accuracy: {acc:.3f}")
    print(f"Precision: {prec:.3f}, Recall: {rec:.3f}, F1: {f1:.3f}")
    print("\nConfusion matrix:")
    print(cm_df)
    print("\nRecall matrix (行正規化):")
    print(cm_norm_df)

# --- Averages over the folds ---
# Element-wise mean of the per-fold confusion matrices.
mean_cm = np.mean(all_confusion_matrices, axis=0)
# Row-normalise the averaged matrix; rows that sum to zero remain all-zero.
mean_row_totals = mean_cm.sum(axis=1, keepdims=True)
mean_recall_matrix = np.divide(
    mean_cm,
    mean_row_totals,
    out=np.zeros_like(mean_cm, dtype=float),
    where=(mean_row_totals != 0),
)

# Plain arithmetic mean of each per-fold score list.
mean_acc, mean_prec, mean_rec, mean_f1 = (
    np.mean(scores)
    for scores in (all_accuracies, all_precisions, all_recalls, all_f1s)
)

print("\n==============================")
print("【5-Fold Cross Validation Mean】")
print(f"Accuracy: {mean_acc:.3f}")
print(f"Precision: {mean_prec:.3f}")
print(f"Recall: {mean_rec:.3f}")
print(f"F1 Score: {mean_f1:.3f}")

# Attach the label values as row/column headers for display and export.
mean_cm_df = pd.DataFrame(mean_cm, index=labels_sorted, columns=labels_sorted)
mean_recall_df = pd.DataFrame(
    mean_recall_matrix, index=labels_sorted, columns=labels_sorted
)

print("\nMean Confusion Matrix:")
print(mean_cm_df)
print("\nMean Recall Matrix:")
print(mean_recall_df)

# Write both averaged matrices next to the per-fold CSV files.
mean_cm_df.to_csv("mean_confusion_matrix.csv")
mean_recall_df.to_csv("mean_recall_matrix.csv")
print("\nCSV exported: mean_confusion_matrix.csv / mean_recall_matrix.csv")

# ================= Final model (trained on all data) =================
# Fit on the complete X / y rather than on a cross-validation split.
rf_final = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)
rf_final.fit(X, y)

# --- Save the model ---
MODEL_PATH = "rf_model.joblib"
META_PATH = "rf_model_meta.json"
dump(rf_final, MODEL_PATH)

# --- Also save metadata (feature order, label order, library versions) ---
META = {
    "feature_order": X_cols,
    "label_order": labels_sorted,
    # Classes the fitted model actually learned, in the model's own order.
    # This can be shorter than label_order when a label has no rows in the
    # training data, so consumers mapping model outputs to labels should
    # use this list.
    "classes": rf_final.classes_.tolist(),
    "sklearn_version": sklearn.__version__,
    "python_version": sys.version,
    "model_type": "RandomForestClassifier",
    "params": rf_final.get_params(),
}
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(META, f, ensure_ascii=False, indent=2)

print(f"\nSaved model to: {MODEL_PATH}")
print(f"Saved meta to:  {META_PATH}")

# ================= Reload & inference sample =================
# Read both saved artifacts back from disk, then predict on the first 5 rows.
with open("rf_model_meta.json", "r", encoding="utf-8") as f:
    meta = json.load(f)
clf = load(MODEL_PATH)

# Select the columns in the saved feature order so the column order fed to
# the model stays fixed even if future data arrives with a different layout.
feature_order = meta["feature_order"]
X_infer = X[feature_order].head(5)
pred = clf.predict(X_infer)

print("\nPrediction sample (first 5 rows):")
prediction_table = pd.DataFrame({"pred": pred}, index=X_infer.index)
print(prediction_table)
