In [7]:
# 必要ライブラリのインポート
import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [8]:
# config取得
base_dir = Path.cwd().parent.parent
config_path = base_dir / "config"
sys.path.append(str(config_path))

from config import (
    RANDOM_SEED,
    CLASS_PRIORS,          # {"H":0.3,"E":0.3,"D":0.4} など
    FEATURE_RANGES,        # レンジ定義（クラス×特徴）
    NOISE,                 # 特徴ごとのノイズσなど
    CORRELATION,           # 相関注入の係数
)

In [9]:
# csvデータの取得
base_dir = Path(os.getcwd())
common_dir = base_dir.parent.parent / "assets" / "csv_data" 
input_path = common_dir / "主要被害公園エリア_座標結合済み.csv"
df = pd.read_csv(input_path)
df["緯度"] = np.floor(df["緯度"] * 1e4) / 1e4
df["経度"] = np.floor(df["経度"] * 1e4) / 1e4
# df["grid_id"] = df["緯度"].map(lambda x: f"{x:.4f}") + "_" + df["経度"].map(lambda x: f"{x:.4f}")
display(df)


Unnamed: 0,park_name,経度,緯度
0,桜ヶ丘公園,139.4567,35.6390
1,桜ヶ丘公園,139.4567,35.6387
2,桜ヶ丘公園,139.4571,35.6403
3,桜ヶ丘公園,139.4571,35.6400
4,桜ヶ丘公園,139.4571,35.6398
...,...,...,...
5941,八王子霊園,139.2762,35.6599
5942,八王子霊園,139.2764,35.6607
5943,八王子霊園,139.2764,35.6601
5944,八王子霊園,139.2764,35.6599


In [10]:
# 特徴量
features = [
    "green_ratio",
    "yellow_brown_ratio",
    "color_std",
    "hue_mean",
    "ndvi_avg",
    "ndvi_std",
    "ndvi_min",
    "ndvi_max",
    "leaf_temp_mean",
]

# 比率特徴量
ratio_like = [
    "green_ratio",
    "yellow_brown_ratio",
    "color_std",
    "ndvi_avg",
    "ndvi_std",
    "ndvi_min",
    "ndvi_max"
]

# ランダムシード
rng = np.random.default_rng(RANDOM_SEED)

# 正解ラベルをサンプリング
labels = rng.choice(list(CLASS_PRIORS.keys()), size=len(df), p=list(CLASS_PRIORS.values()))
df["label"] = labels

# クラスごとの行マスク
mask = {c: (df["label"] == c).values for c in CLASS_PRIORS.keys()}

# 指定レンジの一様分布から乱数を作る
def sample_uniform(low, high, size):
    return rng.uniform(low, high, size)

# ガウスノイズ（平均0、標準偏差σ）を足す
def add_gaussian_noise(x, sigma):
    if sigma is None or sigma == 0:
        return x
    return x + rng.normal(0, sigma, size=x.shape)

# 比率系の値を [0,1] に切り詰め
def clip01(x): 
    return np.clip(x, 0.0, 1.0)

# 特徴量の初期化
vals = {f: np.empty(len(df)) for f in features}

# 各特徴量にノイズで生成
for f in features:
    for c in CLASS_PRIORS.keys():
        low, high = FEATURE_RANGES[c][f]
        vals[f][mask[c]] = sample_uniform(low, high, mask[c].sum())
    sigma = NOISE.get(f, 0.0)
    vals[f] = add_gaussian_noise(vals[f], sigma)

# 物理的制約でクリップ（比率系は0–1、角度/温度は範囲内）
for f in ratio_like:
    vals[f] = clip01(vals[f])
vals["hue_mean"] = np.mod(vals["hue_mean"], 180.0)
vals["leaf_temp_mean"] = np.clip(vals["leaf_temp_mean"], 14.0, 45.0)

# 軽い相関注入
corr_cfg = CORRELATION or {}
if "yellow_brown_vs_ndvi" in corr_cfg:
    a = corr_cfg["yellow_brown_vs_ndvi"]
    vals["yellow_brown_ratio"] += a * np.clip(0.6 - vals["ndvi_avg"], 0, None)
    vals["yellow_brown_ratio"] = clip01(vals["yellow_brown_ratio"])
if "leaf_temp_vs_ndvi" in corr_cfg:
    b = corr_cfg["leaf_temp_vs_ndvi"]
    vals["leaf_temp_mean"] += b * np.clip(0.6 - vals["ndvi_avg"], 0, None)
    vals["leaf_temp_mean"] = np.clip(vals["leaf_temp_mean"], 15.0, 50.0)

# NDVI と 葉温 に追加の大きめを注入。
spill_p = NOISE.get("spillover_rate", 0.10)  # 10%など
spill_idx = rng.choice(len(df), size=int(spill_p * len(df)), replace=False)
if spill_idx.size > 0:
    if "ndvi_avg_spill" in NOISE:
        vals["ndvi_avg"][spill_idx] = add_gaussian_noise(vals["ndvi_avg"][spill_idx], NOISE["ndvi_avg_spill"])
        vals["ndvi_avg"] = clip01(vals["ndvi_avg"])
    if "leaf_temp_mean_spill" in NOISE:
        vals["leaf_temp_mean"][spill_idx] = add_gaussian_noise(vals["leaf_temp_mean"][spill_idx], NOISE["leaf_temp_mean_spill"])
        vals["leaf_temp_mean"] = np.clip(vals["leaf_temp_mean"], 15.0, 50.0)

# dfに格納
for f in features:
    df[f] = vals[f]

display(df)


Unnamed: 0,park_name,経度,緯度,label,green_ratio,yellow_brown_ratio,color_std,hue_mean,ndvi_avg,ndvi_std,ndvi_min,ndvi_max,leaf_temp_mean
0,桜ヶ丘公園,139.4567,35.6390,E,0.459612,0.785722,0.192519,59.384835,0.304189,0.248984,0.238098,0.880689,33.298713
1,桜ヶ丘公園,139.4567,35.6387,H,0.479500,0.084902,0.086805,60.375179,0.630559,0.364002,0.609107,0.797351,27.397116
2,桜ヶ丘公園,139.4571,35.6403,D,0.485491,0.753272,0.220034,63.004569,0.591053,0.000000,0.097239,0.907153,31.642257
3,桜ヶ丘公園,139.4571,35.6400,E,0.400513,0.568641,0.125061,83.833420,0.595624,0.000000,0.706339,0.779235,30.533073
4,桜ヶ丘公園,139.4571,35.6398,H,0.585060,0.250759,0.072748,97.396637,0.753502,0.156121,0.539966,1.000000,33.048419
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5941,八王子霊園,139.2762,35.6599,E,0.348541,0.377199,0.196278,91.826972,0.618802,0.087845,0.192817,0.760627,33.987499
5942,八王子霊園,139.2764,35.6607,H,0.690633,0.110610,0.098884,88.994973,0.858774,0.150090,0.947386,0.826191,29.898037
5943,八王子霊園,139.2764,35.6601,D,0.204938,0.827446,0.190168,73.427794,0.208723,0.235991,0.323419,0.568565,42.396146
5944,八王子霊園,139.2764,35.6599,H,0.641633,0.114654,0.203125,97.612755,0.907921,0.000000,0.671873,1.000000,31.278356


In [11]:
# 元ラベル退避
if "latent_label" not in df.columns and "label" in df.columns:
    df.rename(columns={"label": "latent_label"}, inplace=True)

# 正規化
def norm01(x, lo, hi):
    return np.clip((x - lo) / (hi - lo), 0.0, 1.0)

def midpref(x, lo, hi):
    """lo〜hiの“真ん中”に近いほど1.0、端に行くほど0.0"""
    z = norm01(x, lo, hi)
    return 1.0 - np.clip(np.abs(z - 0.5) * 2.0, 0.0, 1.0)

# 説明変数
X_cols = [
    "green_ratio",
    "yellow_brown_ratio",
    "color_std",
    "hue_mean",
    "ndvi_avg",
    "ndvi_std",
    "ndvi_min",
    "ndvi_max",
    "leaf_temp_mean"
]
X = df[X_cols].to_numpy()
y = df["latent_label"].to_numpy()

# 標準化してロジスティック回帰
scaler = StandardScaler()
Xz = scaler.fit_transform(X)
clf = LogisticRegression(multi_class="ovr", max_iter=500, random_state=0)
clf.fit(Xz, y)
classes = clf.classes_
coef = clf.coef_

# 係数→重み（負は0にして単調性担保 → 正の寄与で正規化）
def to_weights(beta):
    beta = np.maximum(beta, 0.0)
    s = beta.sum()
    if s == 0:
        return np.ones_like(beta) / len(beta)
    return beta / s

# 重みをクラスごとに辞書化
weights = {cls: to_weights(coef[i]) for i, cls in enumerate(classes)}
for cls in ["H", "E", "D"]:
    if cls not in weights:
        weights[cls] = np.ones(len(X_cols)) / len(X_cols)

# 特徴量のインデックス
idx = {c: i for i, c in enumerate(X_cols)}

# ロジスティック由来の重みで H/E/D スコアを構築
wH = weights["H"]
H_score = (
    wH[idx["ndvi_avg"]]           *      norm01(df["ndvi_avg"], 0.50, 0.90) +
    wH[idx["yellow_brown_ratio"]] * (1 - norm01(df["yellow_brown_ratio"], 0.10, 0.60)) +
    wH[idx["leaf_temp_mean"]]     * (1 - norm01(df["leaf_temp_mean"], 26, 38)) +
    wH[idx["green_ratio"]]        *      norm01(df["green_ratio"], 0.40, 0.85) +
    wH[idx["ndvi_std"]]           * (1 - norm01(df["ndvi_std"], 0.05, 0.18)) +
    wH[idx["color_std"]]          * (1 - norm01(df["color_std"], 0.08, 0.20)) +
    wH[idx["hue_mean"]]           *      midpref(df["hue_mean"], 80, 140) +     # 緑寄り中心を好む
    wH[idx["ndvi_min"]]           *      norm01(df["ndvi_min"], 0.40, 0.78) +
    wH[idx["ndvi_max"]]           *      norm01(df["ndvi_max"], 0.80, 0.96)
)

wD = weights["D"]
D_score = (
    wD[idx["ndvi_avg"]]           * (1 - norm01(df["ndvi_avg"], 0.20, 0.70)) +
    wD[idx["leaf_temp_mean"]]     *      norm01(df["leaf_temp_mean"], 28, 40) +
    wD[idx["yellow_brown_ratio"]] *      norm01(df["yellow_brown_ratio"], 0.15, 0.80) +
    wD[idx["green_ratio"]]        * (1 - norm01(df["green_ratio"], 0.20, 0.70)) +
    wD[idx["ndvi_std"]]           *      norm01(df["ndvi_std"], 0.05, 0.18) +
    wD[idx["color_std"]]          *      norm01(df["color_std"], 0.08, 0.20) +
    wD[idx["hue_mean"]]           *      midpref(df["hue_mean"], 35, 90) +      # 黄〜茶中心を好む
    wD[idx["ndvi_min"]]           * (1 - norm01(df["ndvi_min"], 0.18, 0.60)) +
    wD[idx["ndvi_max"]]           * (1 - norm01(df["ndvi_max"], 0.62, 0.90))
)

wE = weights["E"]
E_score = (
    wE[idx["ndvi_avg"]]           *      midpref(df["ndvi_avg"], 0.35, 0.80) +  # 中間好み
    wE[idx["ndvi_std"]]           *      norm01(df["ndvi_std"], 0.05, 0.18) +
    wE[idx["yellow_brown_ratio"]] *      norm01(df["yellow_brown_ratio"], 0.10, 0.45) +
    wE[idx["leaf_temp_mean"]]     *      midpref(df["leaf_temp_mean"], 26, 33) +
    wE[idx["green_ratio"]]        *      midpref(df["green_ratio"], 0.35, 0.70) +
    wE[idx["color_std"]]          *      norm01(df["color_std"], 0.10, 0.22) +
    wE[idx["hue_mean"]]           *      midpref(df["hue_mean"], 70, 115) +
    wE[idx["ndvi_min"]]           *      midpref(df["ndvi_min"], 0.20, 0.55) +
    wE[idx["ndvi_max"]]           *      midpref(df["ndvi_max"], 0.60, 0.90)
)

# 最終ラベル
scores = np.vstack([H_score, E_score, D_score]).T
labels_pred = np.array(["H", "E", "D"])[np.argmax(scores, axis=1)]
df["label"] = labels_pred
df.drop(columns=["latent_label"], errors='ignore', inplace=True)
display(df)



Unnamed: 0,park_name,経度,緯度,green_ratio,yellow_brown_ratio,color_std,hue_mean,ndvi_avg,ndvi_std,ndvi_min,ndvi_max,leaf_temp_mean,label
0,桜ヶ丘公園,139.4567,35.6390,0.459612,0.785722,0.192519,59.384835,0.304189,0.248984,0.238098,0.880689,33.298713,E
1,桜ヶ丘公園,139.4567,35.6387,0.479500,0.084902,0.086805,60.375179,0.630559,0.364002,0.609107,0.797351,27.397116,E
2,桜ヶ丘公園,139.4571,35.6403,0.485491,0.753272,0.220034,63.004569,0.591053,0.000000,0.097239,0.907153,31.642257,D
3,桜ヶ丘公園,139.4571,35.6400,0.400513,0.568641,0.125061,83.833420,0.595624,0.000000,0.706339,0.779235,30.533073,D
4,桜ヶ丘公園,139.4571,35.6398,0.585060,0.250759,0.072748,97.396637,0.753502,0.156121,0.539966,1.000000,33.048419,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5941,八王子霊園,139.2762,35.6599,0.348541,0.377199,0.196278,91.826972,0.618802,0.087845,0.192817,0.760627,33.987499,E
5942,八王子霊園,139.2764,35.6607,0.690633,0.110610,0.098884,88.994973,0.858774,0.150090,0.947386,0.826191,29.898037,H
5943,八王子霊園,139.2764,35.6601,0.204938,0.827446,0.190168,73.427794,0.208723,0.235991,0.323419,0.568565,42.396146,D
5944,八王子霊園,139.2764,35.6599,0.641633,0.114654,0.203125,97.612755,0.907921,0.000000,0.671873,1.000000,31.278356,H


In [12]:
# CSV出力
df = pd.DataFrame(df)
output_path = common_dir / "没：主要被害公園エリア_乱数値結合済み.csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"CSVを出力完了: {output_path}")

CSVを出力完了: c:\Users\kyous\OneDrive\デスクトップ\ハッカソン\tokyo-tree-doctor_program\tokyo-tree-doctor\ml\assets\csv_data\没：主要被害公園エリア_乱数値結合済み.csv
