In [None]:
import gc
import time
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [None]:
# Load the cleaned CSV into `our_data`.
# NOTE(review): this is a machine-specific absolute path; the cell only runs
# on a machine that has the file at exactly this location.
our_data = pd.read_csv("/Users/hoengbird/Downloads/pybaseball_data_cleaned_v2.csv")
# Show the first rows as the cell output.
our_data.head()

In [None]:
# Load the raw parquet dump into `raw_data`.
# NOTE(review): machine-specific absolute path, same caveat as the CSV load.
raw_data = pd.read_parquet("/Users/hoengbird/Downloads/pybaseball_data.parquet")

In [None]:
# Display the column labels of the raw frame as the cell output.
raw_data.columns

In [None]:
# Narrow the raw frame to the columns listed below (one thematic group per line).
filtered_data = raw_data.loc[:, [
    'pitch_name', 'release_speed', 'release_pos_x', 'release_pos_z',
    'pitcher', 'batter', 'zone', 'balls', 'strikes', 'pfx_x', 'pfx_z',
    'plate_x', 'plate_z', 'outs_when_up', 'at_bat_number', 'pitch_number',
    'post_bat_score', 'post_away_score', 'home_team', 'away_team', 'game_date',
    'inning_topbot', 'inning', 'on_1b', 'on_2b', 'on_3b', 'type',
]]

In [None]:
# Discard rows whose pitch_name is missing.
filtered_data = filtered_data.dropna(subset=['pitch_name'])

In [None]:
# Batting-order calculation.
def calculate_batting_order(group):
    """Attach batting-order slots (1-9) to every pitch row of one game.

    The slot is derived from the plate-appearance sequence of each team:
    all rows sharing an ``at_bat_number`` get the same slot, and the slot
    advances by one per plate appearance, wrapping from 9 back to 1. The
    result does not depend on the row order of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single game. Must contain ``inning_topbot`` ('Top' means
        the away team bats, 'Bot' means the home team bats) and
        ``at_bat_number``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` (the input is not modified) with three added
        integer columns: ``away_order`` (0 while the home team bats),
        ``home_order`` (0 while the away team bats) and ``order``
        (their sum, i.e. the slot of whoever is batting).

    Notes
    -----
    Assumes every plate appearance of a team has at least one row in
    ``group``; a plate appearance with no rows would shift later slots.
    TODO confirm against the upstream filtering.
    """
    group = group.copy()
    at_bat = group['at_bat_number'].to_numpy()

    away_order = np.zeros(len(group), dtype='int64')
    home_order = np.zeros(len(group), dtype='int64')

    for half_label, slots in (('Top', away_order), ('Bot', home_order)):
        batting = group['inning_topbot'].eq(half_label).fillna(False).to_numpy(dtype=bool)
        if not batting.any():
            continue
        # np.unique sorts the distinct at-bat numbers; the inverse index is
        # therefore the 0-based plate-appearance index within this team.
        _, pa_index = np.unique(at_bat[batting], return_inverse=True)
        slots[batting] = np.asarray(pa_index).ravel() % 9 + 1

    # Rows that are neither 'Top' nor 'Bot' keep 0 in all three columns.
    group['away_order'] = away_order
    group['home_order'] = home_order
    group['order'] = away_order + home_order
    return group

# One group per game: a game is identified by its date and the two teams.
grouped = filtered_data.groupby(
    ['game_date', 'home_team', 'away_team'],
    group_keys=False,
)
progress_bar = tqdm(grouped, total=grouped.ngroups, desc="Processing Groups")

# Annotate each game independently, then stitch the games back together.
result = pd.concat(
    calculate_batting_order(game_rows) for _, game_rows in progress_bar
)

In [None]:
# Columns shared by both frames that identify a pitch for the join.
# NOTE(review): if these keys are not unique in `merge_`, the left join
# repeats `our_data` rows — confirm uniqueness.
join_keys = ['inning', 'outs_when_up', 'release_speed', 'release_pos_x', 'release_pos_z']

merge_ = result[join_keys + ['order', 'type']]
data = our_data.merge(merge_, how='left', on=join_keys)

# Remove this once the batting-order problem has been resolved.
data = data.dropna(subset=['order'])

In [None]:
# Select the situation features plus the raw columns the targets are built from.
dnn_dataset = data.loc[:, [
    'inning', 'on_1b_1', 'on_2b_1', 'on_3b_1',
    'balls', 'strikes', 'outs_when_up',
    'winning', 'losing', 'tied',
    'stand_R', 'zone', 'pitch_name', 'order', 'type',
]]

In [None]:
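# Disabled code, kept for reference. The second and third lines originally
# assigned to 'on_1b_1' as well (copy-paste slip); the targets are corrected here.
# dnn_dataset.loc[:, 'on_1b_1'] = dnn_dataset.loc[:, 'on_1b_1'].notnull().astype(int)
# dnn_dataset.loc[:, 'on_2b_1'] = dnn_dataset.loc[:, 'on_2b_1'].notnull().astype(int)
# dnn_dataset.loc[:, 'on_3b_1'] = dnn_dataset.loc[:, 'on_3b_1'].notnull().astype(int)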

In [None]:
# Switch to the short feature names used by the model cells below.
dnn_dataset = dnn_dataset.rename(
    columns={
        'inning': 'InnNum',
        'on_1b_1': 'Base1',
        'on_2b_1': 'Base2',
        'on_3b_1': 'Base3',
        'balls': 'Ball',
        'strikes': 'Strike',
        'outs_when_up': 'Out',
        'order': 'Order',
        'stand_R': 'LR',
    }
)

In [None]:
# Strike / ball flags.
## A ball put in play ('X') is counted as a strike.
dnn_dataset['T_Strike'] = dnn_dataset['type'].isin(['X', 'S']).astype('int64')

dnn_dataset['T_Ball'] = dnn_dataset['type'].eq('B').astype('int64')

In [None]:
# Target pitch type: 1 in 'Fastball' for the two fastball names below,
# and 1 in 'Nonfastball' for every other value (missing names included).
dnn_dataset['Fastball'] = (
    dnn_dataset['pitch_name'].isin(['4-Seam Fastball', 'Sinker']).astype('int64')
)

dnn_dataset['Nonfastball'] = (
    (~dnn_dataset['pitch_name'].isin(['4-Seam Fastball', 'Sinker'])).astype('int64')
)

In [None]:
# Target location processing.
print(dnn_dataset['zone'].unique())
print("전체 투구: ", len(dnn_dataset))
# Keep zones below 10 only. The explicit .copy() makes the filtered frame
# independent, so the column assignments below do not write to a slice
# (avoids SettingWithCopyWarning).
dnn_dataset = dnn_dataset[dnn_dataset['zone'] < 10].copy()
print("스트라이크 존만 필터링: ", len(dnn_dataset))

# Column of the zone grid: 1/4/7 -> Left, 2/5/8 -> Center, anything else -> Right.
dnn_dataset['horizontal'] = np.select(
    [
        dnn_dataset['zone'].isin([1, 4, 7]).to_numpy(),
        dnn_dataset['zone'].isin([2, 5, 8]).to_numpy(),
    ],
    ['Left', 'Center'],
    default='Right',
)

# Row of the zone grid: 1/2/3 -> Up, 4/5/6 -> Middle, anything else -> Down.
dnn_dataset['vertical'] = np.select(
    [
        dnn_dataset['zone'].isin([1, 2, 3]).to_numpy(),
        dnn_dataset['zone'].isin([4, 5, 6]).to_numpy(),
    ],
    ['Up', 'Middle'],
    default='Down',
)

# One-hot indicator columns, created in the same order as before.
for axis_name, axis_labels in (
    ('horizontal', ('Left', 'Center', 'Right')),
    ('vertical', ('Up', 'Middle', 'Down')),
):
    for axis_label in axis_labels:
        dnn_dataset[f'{axis_name}_{axis_label}'] = (
            dnn_dataset[axis_name] == axis_label
        ).astype(int)

In [None]:
# Batter handedness: a value of 1 becomes 2, every other value becomes 1.
# NOTE(review): re-running this cell on its own output swaps the two codes.
dnn_dataset['LR'] = dnn_dataset['LR'].eq(1).map({True: 2, False: 1})

In [None]:
# Game situation: 3 when winning == 1, else 2 when tied == 1, else 1.
dnn_dataset['LDW'] = np.select(
    [
        dnn_dataset['winning'].eq(1).to_numpy(),
        dnn_dataset['tied'].eq(1).to_numpy(),
    ],
    [3, 2],
    default=1,
).astype('int64')

In [None]:
# Drop the helper and raw columns that have already been encoded.
dnn_dataset = dnn_dataset.drop(
    [
        'winning', 'losing', 'tied',
        'horizontal', 'vertical',
        'zone', 'pitch_name', 'type',
    ],
    axis=1,
)
dnn_dataset

In [None]:
def e2dnn_transform(row, columns):
    """Compress a three-way one-hot triple into a two-element code.

    ``row[columns]`` is read as a tuple and matched against the three valid
    one-hot patterns: (1, 0, 0) -> [1, 0], (0, 1, 0) -> [0, 1] and
    (0, 0, 1) -> [1, 1]. Any other tuple yields [None, None].
    """
    observed = tuple(row[columns].values)
    codebook = (
        ((1, 0, 0), [1, 0]),
        ((0, 1, 0), [0, 1]),
        ((0, 0, 1), [1, 1]),
    )
    for pattern, code in codebook:
        if observed == pattern:
            return code
    return [None, None]

In [None]:
horizontal_columns = ["horizontal_Left", "horizontal_Center", "horizontal_Right"]
vertical_columns = ["vertical_Up", "vertical_Middle", "vertical_Down"]

# Encode each one-hot triple as two columns; result_type="expand" spreads the
# two-element list returned per row across two columns.
for code_columns, onehot_columns in (
    (["H1", "H2"], horizontal_columns),
    (["V1", "V2"], vertical_columns),
):
    dnn_dataset[code_columns] = dnn_dataset.apply(
        e2dnn_transform,
        axis=1,
        columns=onehot_columns,
        result_type="expand",
    )

In [None]:
# Persist the prepared frame to 'dnn_dataset.parquet' (relative path, so it
# lands in the current working directory).
dnn_dataset.to_parquet('dnn_dataset.parquet')

## 모델 학습

In [None]:
# Reload the prepared dataset. Prefer the file the preprocessing section writes
# ('dnn_dataset.parquet' in the working directory) so the notebook runs end to
# end on one machine; fall back to the original hard-coded location otherwise.
_local_parquet = Path('dnn_dataset.parquet')
_legacy_parquet = Path("C:/Users/gangmin/Documents/카카오톡 받은 파일/dnn_dataset.parquet")
dnn_dataset = pd.read_parquet(
    _local_parquet if _local_parquet.exists() else _legacy_parquet
)

In [None]:
# Use CUDA when torch reports it as available, otherwise the CPU; print the
# availability flag.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device.type == "cuda")

In [None]:
# Model inputs (10 columns) and targets (6 columns), in the order the
# network cells index them.
input_cols = [
    "InnNum", "LDW",
    "Base1", "Base2", "Base3",
    'Ball', "Strike", "Out",
    'Order', "LR",
]
target_cols = [
    "T_Strike", "Fastball",
    "H1", "H2",
    "V1", "V2",
]

X = dnn_dataset[input_cols].to_numpy()
y = dnn_dataset[target_cols].to_numpy()

In [None]:
class PitchDataset(Dataset):
    """Map-style dataset pairing input rows with target rows.

    Both arguments are converted to ``float32`` tensors once, at construction.

    Parameters
    ----------
    inputs : array-like
        Feature rows; the first dimension indexes samples.
    targets : array-like
        Target rows; must have the same first dimension as ``inputs``.

    Raises
    ------
    ValueError
        If ``inputs`` and ``targets`` disagree on the number of rows.
    """

    def __init__(self, inputs, targets):
        self.inputs = torch.tensor(inputs, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)
        # Fail early: a mismatch would otherwise surface only while iterating.
        if self.inputs.shape[:1] != self.targets.shape[:1]:
            raise ValueError(
                "inputs and targets must have the same number of rows, got "
                f"{tuple(self.inputs.shape[:1])} and {tuple(self.targets.shape[:1])}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        return self.inputs[idx], self.targets[idx]

In [None]:
n_splits = 10
# Fixed seed so the same train/validation/test partitions are produced on
# every run.
SPLIT_SEED = 42
splitter = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.4, random_state=SPLIT_SEED)

# Stratification label: index of the first maximal target column per row.
# NOTE(review): this is not the joint label of all targets — confirm that
# this is the intended stratification.
strata = np.argmax(y, axis=1)

# Each entry is (train_X, train_y, val_X, val_y, test_X, test_y): 60% train,
# and the remaining 40% halved into validation and test.
splits = []
for split_no, (train_index, temp_index) in enumerate(splitter.split(X, strata)):
    temp_X, temp_y = X[temp_index], y[temp_index]

    # A distinct but deterministic seed per outer split.
    val_test_split = StratifiedShuffleSplit(
        n_splits=1, test_size=0.5, random_state=SPLIT_SEED + split_no
    )
    val_index, test_index = next(val_test_split.split(temp_X, strata[temp_index]))

    splits.append((
        X[train_index], y[train_index],
        temp_X[val_index], temp_y[val_index],
        temp_X[test_index], temp_y[test_index],
    ))

In [None]:
class BSFNF_DNN(nn.Module):
    """Fully connected network with 10 inputs and 2 sigmoid outputs.

    Nine hidden linear layers (``fc1`` .. ``fc9``) with ReLU activations are
    followed by the ``output`` layer, whose two units pass through a sigmoid.
    """

    # Layer widths from the input through the last hidden layer.
    _WIDTHS = (10, 15, 20, 25, 30, 35, 30, 25, 15, 8)

    def __init__(self):
        super().__init__()
        # Register fc1..fc9 under their original attribute names and in the
        # original order, so state_dict keys are unchanged.
        for layer_no, (n_in, n_out) in enumerate(zip(self._WIDTHS, self._WIDTHS[1:]), start=1):
            setattr(self, f"fc{layer_no}", nn.Linear(n_in, n_out))
        self.output = nn.Linear(self._WIDTHS[-1], 2)

    def forward(self, x):
        for layer_no in range(1, len(self._WIDTHS)):
            x = F.relu(getattr(self, f"fc{layer_no}")(x))
        return torch.sigmoid(self.output(x))

class HL_DNN(nn.Module):
    """Fully connected network with 10 inputs and 2 sigmoid outputs.

    Seven hidden linear layers (``fc1`` .. ``fc7``) with ReLU activations are
    followed by the ``output`` layer, whose two units pass through a sigmoid.
    """

    # Layer widths from the input through the last hidden layer.
    _WIDTHS = (10, 15, 20, 30, 30, 20, 15, 8)

    def __init__(self):
        super().__init__()
        # Register fc1..fc7 under their original attribute names and in the
        # original order, so state_dict keys are unchanged.
        for layer_no, (n_in, n_out) in enumerate(zip(self._WIDTHS, self._WIDTHS[1:]), start=1):
            setattr(self, f"fc{layer_no}", nn.Linear(n_in, n_out))
        self.output = nn.Linear(self._WIDTHS[-1], 2)

    def forward(self, x):
        for layer_no in range(1, len(self._WIDTHS)):
            x = F.relu(getattr(self, f"fc{layer_no}")(x))
        return torch.sigmoid(self.output(x))

class VL_DNN(nn.Module):
    """Fully connected network with 10 inputs and 2 sigmoid outputs.

    Twelve hidden linear layers (``fc1`` .. ``fc12``) with ReLU activations
    are followed by the ``output`` layer, whose two units pass through a
    sigmoid.
    """

    # Layer widths from the input through the last hidden layer.
    _WIDTHS = (10, 15, 20, 25, 30, 35, 40, 40, 35, 30, 25, 15, 8)

    def __init__(self):
        super().__init__()
        # Register fc1..fc12 under their original attribute names and in the
        # original order, so state_dict keys are unchanged.
        for layer_no, (n_in, n_out) in enumerate(zip(self._WIDTHS, self._WIDTHS[1:]), start=1):
            setattr(self, f"fc{layer_no}", nn.Linear(n_in, n_out))
        self.output = nn.Linear(self._WIDTHS[-1], 2)

    def forward(self, x):
        for layer_no in range(1, len(self._WIDTHS)):
            x = F.relu(getattr(self, f"fc{layer_no}")(x))
        return torch.sigmoid(self.output(x))

class E3_DNN(nn.Module):
    """Ensemble that runs three sub-networks on the same input.

    ``forward`` feeds ``x`` to ``bsfnf``, ``hl`` and ``vl`` and concatenates
    their outputs along dimension 1, in that order.
    """

    def __init__(self):
        super().__init__()
        self.bsfnf = BSFNF_DNN()
        self.hl = HL_DNN()
        self.vl = VL_DNN()

    def forward(self, x):
        heads = (self.bsfnf, self.hl, self.vl)
        return torch.cat([head(x) for head in heads], dim=1)

In [None]:
def encode_targets(targets):
    """Split a 2-D target array column-wise into three groups.

    Returns a tuple of columns ``[0, 2)``, columns ``[2, 4)`` and columns
    ``[4, end)``; the training code labels them as the BSFNF (T_Strike,
    Fastball), HL (H1, H2) and VL (V1, V2) targets respectively.
    """
    column_bounds = ((None, 2), (2, 4), (4, None))
    bsf_part, hl_part, vl_part = (targets[:, lo:hi] for lo, hi in column_bounds)
    return bsf_part, hl_part, vl_part

def init_weights(m):
    """Initialise one module; intended for use with ``Module.apply``.

    ``nn.Linear`` modules get Xavier-uniform weights and, when they have a
    bias, a zero bias. Every other module type is left untouched.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight)
    if m.bias is None:
        return
    nn.init.zeros_(m.bias)

In [None]:
# One binary cross-entropy criterion per output head.
criterion_bsf = nn.BCELoss()
criterion_hl = nn.BCELoss()
criterion_vl = nn.BCELoss()

def compute_loss(outputs, targets):
    """Return the sum of the three per-head BCE losses.

    ``outputs`` is sliced into columns ``[0, 2)``, ``[2, 4)`` and
    ``[4, end)``; ``targets`` is split by ``encode_targets``. Each output
    slice is scored against its target group with its own criterion.
    """
    head_outputs = (outputs[:, :2], outputs[:, 2:4], outputs[:, 4:])
    head_targets = encode_targets(targets)
    head_criteria = (criterion_bsf, criterion_hl, criterion_vl)

    head_losses = [
        criterion(prediction, expected)
        for criterion, prediction, expected in zip(head_criteria, head_outputs, head_targets)
    ]
    return head_losses[0] + head_losses[1] + head_losses[2]

In [None]:
def train_model(train_loader, val_loader, model, optimizer, num_epochs=1000, patience=3):
    """Train ``model`` with early stopping on the validation loss.

    Each epoch runs one pass over ``train_loader`` (with gradient updates)
    and one pass over ``val_loader`` (without gradients). Training stops
    once the mean validation loss has failed to improve for ``patience``
    consecutive epochs, or after ``num_epochs`` epochs. On exit the model
    holds the weights of the epoch with the lowest validation loss.

    Parameters
    ----------
    train_loader, val_loader : iterable of (inputs, targets) batches
        Must support ``len()`` and be non-empty.
    model : torch.nn.Module
        Moved to the module-level ``device`` and trained in place.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    num_epochs : int
        Upper bound on the number of epochs.
    patience : int
        Number of consecutive non-improving epochs tolerated.

    Returns
    -------
    list of dict
        One ``{"epoch", "train_loss", "val_loss"}`` record per completed
        epoch (losses are per-batch means).

    Raises
    ------
    ValueError
        If either loader is empty.
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    best_loss = float('inf')
    best_state = None
    early_stop_count = 0
    history = []
    model = model.to(device)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        with tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training", leave=True, dynamic_ncols=True, smoothing=0.1) as train_bar:
            for inputs, targets in train_bar:
                inputs, targets = inputs.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = compute_loss(outputs, targets)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
        train_loss /= len(train_loader)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            with tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation", leave=True, dynamic_ncols=True, smoothing=0.1) as val_bar:
                for inputs, targets in val_bar:
                    inputs, targets = inputs.to(device), targets.to(device)
                    outputs = model(inputs)
                    val_loss += compute_loss(outputs, targets).item()
        val_loss /= len(val_loader)

        history.append({"epoch": epoch + 1, "train_loss": train_loss, "val_loss": val_loss})

        if val_loss < best_loss:
            best_loss = val_loss
            early_stop_count = 0
            # Snapshot the best weights; clone so later updates do not alias it.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            early_stop_count += 1

        if early_stop_count >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

        print(f"Epoch {epoch+1}, Train Loss: {train_loss}, Val Loss: {val_loss}")

    # Roll back to the best epoch instead of keeping the last (worse) weights.
    if best_state is not None:
        model.load_state_dict(best_state)

    return history

In [None]:
def evaluate_model(test_loader, model):
    """Compute exact-match accuracy per head and overall on ``test_loader``.

    Model outputs are thresholded at 0.5. A sample counts as correct for a
    head when both of that head's thresholded outputs equal the targets, and
    as correct overall when all three heads are correct.

    Parameters
    ----------
    test_loader : iterable of (inputs, targets) batches
    model : torch.nn.Module
        Moved to the module-level ``device`` and switched to eval mode.

    Returns
    -------
    dict
        Keys ``"BSFNF Accuracy"``, ``"HL Accuracy"``, ``"VL Accuracy"`` and
        ``"Overall Accuracy"``, each a fraction in [0, 1].

    Raises
    ------
    ValueError
        If ``test_loader`` yields no samples.
    """
    head_names = ("BSFNF", "HL", "VL")
    total_predictions = 0
    correct_predictions = {"BSFNF": 0, "HL": 0, "VL": 0, "Overall": 0}

    # Inputs are moved to `device` below, so the model has to live there too.
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            head_outputs = (outputs[:, :2], outputs[:, 2:4], outputs[:, 4:])
            head_targets = encode_targets(targets)

            overall_hit = torch.ones(targets.size(0), dtype=torch.bool, device=targets.device)
            for head, head_output, head_target in zip(head_names, head_outputs, head_targets):
                head_hit = ((head_output > 0.5).float() == head_target).all(dim=1)
                correct_predictions[head] += head_hit.sum().item()
                overall_hit &= head_hit

            correct_predictions['Overall'] += overall_hit.sum().item()
            total_predictions += targets.size(0)

    if total_predictions == 0:
        raise ValueError("test_loader yielded no samples; accuracy is undefined")

    bsf_accuracy = correct_predictions['BSFNF'] / total_predictions
    hl_accuracy = correct_predictions['HL'] / total_predictions
    vl_accuracy = correct_predictions["VL"] / total_predictions
    overall_accuracy = correct_predictions['Overall'] / total_predictions

    print(f"BSFNF Accuracy: {bsf_accuracy:.4f}")
    print(f"HL Accuracy: {hl_accuracy:.4f}")
    print(f"VL Accuracy: {vl_accuracy:.4f}")
    print(f"E3DNN Accuracy: {overall_accuracy:.4f}")

    return {
        "BSFNF Accuracy": bsf_accuracy,
        "HL Accuracy": hl_accuracy,
        "VL Accuracy": vl_accuracy,
        "Overall Accuracy": overall_accuracy
    }

In [None]:
results = []

for split_idx, split in enumerate(splits, 1):
    print(f"\nProcessing Split {split_idx}/{len(splits)}")

    train_X, train_y, val_X, val_y, test_X, test_y = split

    train_set = PitchDataset(train_X, train_y)
    val_set = PitchDataset(val_X, val_y)
    test_set = PitchDataset(test_X, test_y)

    train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=16, shuffle=False)
    test_loader = DataLoader(test_set, batch_size=16, shuffle=False)

    # Fresh model and optimizer for every split.
    model = E3_DNN()
    model.apply(init_weights)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Train.
    train_model(train_loader, val_loader, model, optimizer)

    # Evaluate on the held-out test part.
    accuracies = evaluate_model(test_loader, model)
    results.append(accuracies)

# Mean of each metric over all splits.
metric_names = ("BSFNF", "HL", "VL", "Overall")
metric_means = {
    name: np.mean([split_result[f"{name} Accuracy"] for split_result in results])
    for name in metric_names
}
bsfnf_avg = metric_means["BSFNF"]
hl_avg = metric_means["HL"]
vl_avg = metric_means["VL"]
overall_avg = metric_means["Overall"]

print("\n=== split-wise Results ===")
for split_idx, accuracies in enumerate(results, 1):
    print(f"Split {split_idx}:")
    for name in metric_names:
        metric_key = f"{name} Accuracy"
        print(f"{metric_key}: {accuracies[metric_key]:.4f}")

print("\n=== Final Results ===")
for name in metric_names:
    print(f"Average {name} Accuracy: {metric_means[name]:.4f}")