In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# pandasとtensorflowのインストール（必要に応じて）
!pip install pandas tensorflow scikit-learn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import numpy as np
import random as random
import os
import sys
import copy



In [None]:
# Fix the random seeds so that runs are repeatable

def seed_everything(seed: int = 42):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable. When
    ``torch`` has already been imported, the CPU generator and all CUDA
    generators are seeded too, and cuDNN is set to deterministic mode with
    benchmarking turned off.

    Args:
        seed: Seed value applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    # Nothing more to do when torch was never imported.
    if "torch" not in sys.modules:
        return

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_everything(42)

In [None]:
# Compasデータセットのダウンロード
!wget https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv

# pandasでデータを読み込む
data = pd.read_csv("compas-scores-two-years.csv")

# データの最初の5行を表示して確認
print(data.head())


--2025-05-18 18:24:00--  https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2546489 (2.4M) [text/plain]
Saving to: ‘compas-scores-two-years.csv’


2025-05-18 18:24:00 (85.5 MB/s) - ‘compas-scores-two-years.csv’ saved [2546489/2546489]

   id                name   first         last compas_screening_date   sex  \
0   1    miguel hernandez  miguel    hernandez            2013-08-14  Male   
1   3         kevon dixon   kevon        dixon            2013-01-27  Male   
2   4            ed philo      ed        philo            2013-04-14  Male   
3   5         marcu brown   marcu        brown            2013-01-13  Male   
4   6  bouthy pierrelouis  bouthy  pierrelouis   

In [None]:
# Keep the label in its own Series before any columns are removed.
target = data['two_year_recid']

# Names of the columns that contain at least one missing value.
nan_columns = data.columns[data.isnull().any()].tolist()
print("NaNを含む列:", nan_columns)

# Missing values are handled by dropping whole COLUMNS, not rows.
# (A previous `data.dropna()` assignment was overwritten on the very next line
# and therefore never had any effect; it has been removed.)
# First drop identifier / bookkeeping columns, then every column with NaNs.
df_data = data.drop(columns=['id', 'first', 'last', 'start', 'end', 'event'])
df_data = df_data.drop(columns=nan_columns)

# One-hot encode every remaining non-numeric column.
# NOTE(review): string columns such as `name` are still present at this point
# and get one dummy column per distinct value; presumably this is why X ends up
# with thousands of columns. Consider dropping such columns before encoding.
df_encoded = pd.get_dummies(df_data)

# Attach the label to the encoded frame so features and label share one index.
df_encoded['target'] = target

# Features: everything except the label and the other recidivism outcome
# columns, which would leak the answer.
X = df_encoded.drop(columns=['two_year_recid', 'is_recid', 'is_violent_recid', 'target'])

# 'target' was assigned just above, so it is always present in df_encoded.
y = df_encoded['target']



NaNを含む列: ['days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number', 'c_offense_date', 'c_arrest_date', 'c_days_from_compas', 'c_charge_desc', 'r_case_number', 'r_charge_degree', 'r_days_from_arrest', 'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid', 'vr_case_number', 'vr_charge_degree', 'vr_offense_date', 'vr_charge_desc', 'in_custody', 'out_custody']


In [None]:
# Split into train / validation / test sets.
# (Assignment 1) 50% of all rows are held out as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Split the remaining 50% in half: 25% of all rows for training and
# 25% of all rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.50, random_state=42)

# Standardise the features.
scaler = StandardScaler()

# Fit the mean / standard deviation on the training data only, then transform it.
X_train = scaler.fit_transform(X_train)
# Reuse the training statistics for validation and test data. Calling
# fit_transform here would refit the scaler on the validation set, leak its
# statistics and scale the test set with the wrong mean / standard deviation.
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [None]:
# Convert the NumPy training data into PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)  # labels are stored as long (int64) class indices for the binary classification task


In [None]:
# Validation split: float32 features, long (int64) labels.
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)


# Test split, converted the same way.
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

In [None]:
class CompasDataset:
    """Minimal map-style dataset pairing feature rows with their labels.

    It provides ``__len__`` and ``__getitem__``, which is what the
    ``DataLoader`` calls in this notebook rely on.
    """

    def __init__(self, features, labels):
        """Store the features and labels.

        Args:
            features: Indexable collection with one entry per sample.
            labels: Indexable collection of labels, aligned with ``features``.

        Raises:
            ValueError: If ``features`` and ``labels`` differ in length.
        """
        # Fail at construction time instead of with an IndexError (or silently
        # ignored rows) somewhere in the middle of an epoch.
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.features[idx], self.labels[idx]

In [None]:
batch_size = 128

# Wrap each split's feature tensor and label tensor in a dataset object.
train_dataset, val_dataset, test_dataset = [
    CompasDataset(features, labels)
    for features, labels in (
        (X_train_tensor, y_train_tensor),
        (X_val_tensor, y_val_tensor),
        (X_test_tensor, y_test_tensor),
    )
]

# Create the data loaders: only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = [
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (val_dataset, test_dataset)
]

print(train_dataset)

<__main__.CompasDataset object at 0x7c3f038e6350>


In [None]:
class CompasDataset:
    """Map-style dataset that pairs feature rows with labels.

    NOTE(review): a class with this same name is already defined earlier in
    the notebook; this later definition replaces it for any code run afterwards.
    """

    def __init__(self, features, labels):
        """Store the features and labels as given (no ``.values`` conversion).

        Args:
            features: Indexable collection with one entry per sample.
            labels: Indexable collection of labels, aligned with ``features``.

        Raises:
            ValueError: If ``features`` and ``labels`` differ in length.
        """
        # Catch misaligned inputs up front rather than during iteration.
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.features[idx], self.labels[idx]

In [None]:
# Number of feature columns after one-hot encoding.
length = X.shape[1]
print(length)

14710


In [None]:
# Train on the GPU (cuda) when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# modelを定義します
def create_model(trial, input_dim):
    """Build a one-hidden-layer classifier from a trial's hyperparameters.

    The trial supplies three values: ``hidden_size`` (int, 64-512),
    ``dropout_rate`` (float, 0.0-0.5) and ``activation`` (one of "ReLU",
    "Sigmoid", "Tanh").

    Args:
        trial: Object exposing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical``.
        input_dim: Number of input features.

    Returns:
        An ``nn.Sequential`` of Linear -> activation -> Dropout -> Linear with
        two outputs, moved to the module-level ``device``.
    """
    # Hyperparameters to tune.
    hidden_size = trial.suggest_int("hidden_size", 64, 512)
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5)
    activation_name = trial.suggest_categorical("activation", ["ReLU", "Sigmoid", "Tanh"])

    # Look up the activation class by name and instantiate only the chosen one.
    activation_classes = {"ReLU": nn.ReLU, "Sigmoid": nn.Sigmoid, "Tanh": nn.Tanh}
    activation_layer = activation_classes[activation_name]()

    layers = [
        nn.Linear(input_dim, hidden_size),
        activation_layer,
        nn.Dropout(dropout_rate),
        nn.Linear(hidden_size, 2),
    ]
    network = nn.Sequential(*layers)
    return network.to(device)

def objective(trial):
    """Optuna objective: train a candidate model briefly and score it.

    Builds a model with ``create_model``, lets the trial pick the optimizer
    ("Adam" or "SGD") and a log-uniform learning rate in [1e-4, 1e-2], trains
    for 5 epochs on ``train_loader`` with cross-entropy loss, and evaluates on
    ``val_loader``.

    Args:
        trial: The Optuna trial supplying the hyperparameters.

    Returns:
        Validation accuracy as a fraction of ``len(val_loader.dataset)``.
    """
    n_features = X_train_tensor.shape[1]
    model = create_model(trial, input_dim=n_features)

    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "SGD"])
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    optimizer_cls = getattr(torch.optim, optimizer_name)
    optimizer = optimizer_cls(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Few epochs only, to keep the search time down.
    for _ in range(5):
        model.train()
        for features, labels in train_loader:
            features = features.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(features), labels)
            batch_loss.backward()
            optimizer.step()

    # Measure validation accuracy.
    model.eval()
    n_correct = 0
    with torch.no_grad():
        for features, labels in val_loader:
            features = features.to(device)
            labels = labels.to(device)
            predicted = model(features).argmax(1)
            n_correct += (predicted == labels).sum().item()

    return n_correct / len(val_loader.dataset)

import optuna


# Seed the sampler so the search proposes the same hyperparameter sequence on
# every run; the global seeds set earlier do not cover Optuna's own sampler.
# TPESampler is Optuna's default sampler, so only the seed is new here.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# Build a fresh network with the best trial's architecture. The weights are
# newly initialised: nothing learned during the search is carried over.
best_model = create_model(study.best_trial, input_dim=X_train_tensor.shape[1])

[I 2025-05-18 18:24:07,796] A new study created in memory with name: no-name-3a441b92-628e-44e8-be95-df8dfc3026c9


Using cuda device


[I 2025-05-18 18:24:15,353] Trial 0 finished with value: 0.5177383592017738 and parameters: {'hidden_size': 232, 'dropout_rate': 0.41068047119643336, 'activation': 'ReLU', 'optimizer': 'SGD', 'lr': 0.0013198131606321705}. Best is trial 0 with value: 0.5177383592017738.
[I 2025-05-18 18:24:21,966] Trial 1 finished with value: 0.48558758314855877 and parameters: {'hidden_size': 120, 'dropout_rate': 0.4461437067330809, 'activation': 'ReLU', 'optimizer': 'SGD', 'lr': 0.00022371200578435988}. Best is trial 0 with value: 0.5177383592017738.
[I 2025-05-18 18:24:32,333] Trial 2 finished with value: 0.6491130820399114 and parameters: {'hidden_size': 457, 'dropout_rate': 0.3634747346968012, 'activation': 'Sigmoid', 'optimizer': 'Adam', 'lr': 0.00043548136501758304}. Best is trial 2 with value: 0.6491130820399114.
[I 2025-05-18 18:24:36,467] Trial 3 finished with value: 0.6413525498891353 and parameters: {'hidden_size': 285, 'dropout_rate': 0.2797541510337412, 'activation': 'Sigmoid', 'optimizer'

In [None]:
# Print the mean and standard deviation of every parameter tensor in the model.
for name, param in best_model.named_parameters():
    values = param.data
    mean_value = values.mean().item()
    std_value = values.std().item()
    print(f"{name}: mean={mean_value}, std={std_value}")

0.weight: mean=-2.4553304456276237e-07, std=0.00476008141413331
0.bias: mean=-7.246190307341749e-06, std=0.0047655184753239155
3.weight: mean=0.0015891260700300336, std=0.026325831189751625
3.bias: mean=-0.04101016744971275, std=0.0037996647879481316


In [None]:
# Cross-entropy loss: passed to train() and also read as a module-level global inside test().
loss_fn = nn.CrossEntropyLoss()
# NOTE(review): `optimizer` is reassigned in the training cell before the training loop runs,
# so this SGD instance is never used for an optimisation step in the cells shown here.
optimizer = torch.optim.SGD(best_model.parameters(), lr=1e-2)

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches with a ``.dataset``
            attribute (used only to report progress).
        model: Module to optimise; it is switched to training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer holding ``model``'s parameters.

    The batch loss is printed every 10 batches. Batches are moved to the
    module-level ``device``.
    """
    size = len(dataloader.dataset)
    # test() leaves the model in eval mode; without this call every epoch after
    # the first evaluation would train with dropout disabled.
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Forward pass and loss.
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 10 == 0:
            current = batch * len(X)
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")

In [None]:
def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    avg_loss, accuracy = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            avg_loss += loss_fn(pred, y).item()
            accuracy += (pred.argmax(1) == y).type(torch.float).sum().item()
    avg_loss /= size
    accuracy /= size
    return avg_loss, accuracy


In [None]:
# Early-stopping configuration and state read by the training loop.
epochs = 100  # upper bound on the number of training epochs
patience = 5  # training stops once patience_counter reaches this value
best_val_loss = float("inf")  # lowest validation loss seen so far
patience_counter = 0  # consecutive epochs without a validation-loss improvement

In [None]:

# Train with the optimizer type and learning rate chosen by the Optuna search,
# so the final run uses the same configuration that was validated during tuning.
best_params = study.best_params
optimizer = getattr(torch.optim, best_params["optimizer"])(best_model.parameters(), lr=best_params["lr"])

# Reset the early-stopping state here so re-running this cell starts clean
# instead of inheriting values from a previous run.
best_val_loss = float("inf")
patience_counter = 0
# Start from a snapshot of the current weights so load_state_dict below always
# has a state to restore, even if the validation loss never improves (e.g. NaN).
best_model_state = copy.deepcopy(best_model.state_dict())

for t in range(epochs):
    train(train_loader, best_model, loss_fn, optimizer)
    val_loss, val_accuracy = test(val_loader, best_model)
    print(f"Val Error: \n Accuracy: {(100*val_accuracy):>0.1f}%, Val loss: {val_loss:>8f} \n")

    # Keep a copy of the best weights seen so far and drive early stopping.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        best_model_state = copy.deepcopy(best_model.state_dict())
    else:
        patience_counter += 1

    # Stop once the validation loss has failed to improve `patience` times in a row.
    if patience_counter >= patience:
        print("Early stopping triggered!")
        break

# Restore the weights from the epoch with the lowest validation loss.
best_model.load_state_dict(best_model_state)
print("Done!")

# Evaluate on the held-out test data and report the final accuracy and loss.
test_loss, test_accuracy = test(test_loader, best_model)
print(f"Test Error: \n Accuracy: {(100*test_accuracy):>0.1f}%, Test loss: {test_loss:>8f} \n")


In [None]:
# Predict a class index for every test sample. Eval mode is set explicitly
# (dropout off) rather than relying on whichever cell ran last, and no_grad
# avoids building an autograd graph during inference.
best_model.eval()
with torch.no_grad():
    y_pred = best_model(X_test_tensor.to(device)).argmax(1)

In [None]:
# Evaluate the predictions made on the test data.

# Per-class precision / recall / F1 for the test set.
from sklearn.metrics import classification_report

print(classification_report(y_test_tensor.cpu(), y_pred.cpu(), digits=3))

# Build the confusion matrix and draw it with seaborn.
from sklearn.metrics import confusion_matrix
import seaborn as sns

cm = confusion_matrix(y_test_tensor.cpu(), y_pred.cpu())
# fmt="d" prints the cell counts as plain integers; the default format would
# show large counts in scientific notation (e.g. 1.2e+03).
ax = sns.heatmap(cm, annot=True, fmt="d", cmap='Blues')
# confusion_matrix puts the true labels on the rows and the predictions on the columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label");
