In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

import math
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

# Directory that holds the input CSV files.
data_folder = "./data/"

# Names of the data files inside data_folder (only the second one is in use).
# pitch_14_17_file = "pitcher_2014_2017.csv"
bball_data_2_file = "Baseball Data-2.csv"

In [4]:
class PitchingDataset(Dataset):
    """Pitch-level baseball data, preprocessed and cut into one of three splits.

    The CSV is loaded, three derived columns are added ('Zone',
    'BallStrikeNum', 'GroundTruth'), the columns that were only needed to
    derive them are dropped, the rows are shuffled and one contiguous slice
    (training 60%, validation 20%, testing = the remainder) is kept in
    ``self.bball_data``.

    The shuffle is seeded, so instances built from the same file with the same
    ``seed`` share one permutation and their splits do not overlap.
    """

    # Plate-location classification.
    # heart (strike) = 0, shadow (strike) = 1, shadow (ball) = 2,
    # chase (ball) = 3, waste (ball) = 4
    def PlateZone(self, PlateLocHeight, PlateLocSide):
        """Classify where a pitch crossed the plate.

        Args:
            PlateLocHeight: height of the pitch; compared against thresholds
                expressed as inches/12, i.e. the value is treated as feet.
            PlateLocSide: horizontal offset, same unit, 0 = centre of the box.

        Returns:
            int in [0, 4], see the table above this method. A NaN coordinate
            fails every comparison and therefore ends up as 3 (chase).
        """
        foot = 12

        # waste: outside 6in..84in vertically OR outside -20in..20in
        # horizontally (strike zone * 200%). Being outside on either axis is
        # enough; requiring both axes mislabelled such pitches as chase.
        if (PlateLocHeight > 7 or PlateLocHeight < 0.5
                or PlateLocSide < -(20/foot) or PlateLocSide > (20/foot)):
            return 4

        # heart: 22in..38in vertical, -6.7in..6.7in horizontal (zone * 67%)
        if (22/foot) < PlateLocHeight < (38/foot) and (-6.7/foot) < PlateLocSide < (6.7/foot):
            return 0

        # rest of the strike zone: 18in..42in vertical, -10in..10in horizontal
        if (18/foot) < PlateLocHeight < (42/foot) and (-10/foot) < PlateLocSide < (10/foot):
            return 1

        # shadow outside the zone: 14in..46in vertical, -13.3in..13.3in
        # horizontal (zone * 133%)
        if (14/foot) < PlateLocHeight < (46/foot) and (-13.3/foot) < PlateLocSide < (13.3/foot):
            return 2

        # chase: everything else that is still inside the waste boundary
        return 3

    def PitchCount(self, balls, strikes):
        """Encode the ball/strike count before the pitch as a single number.

            Strikes: 0  1   2
            Balls v|----------
                  0| 0  1   2
                  1| 3  4   5
                  2| 6  7   8
                  3| 9  10  11

        Any strike value other than 0 or 1 is placed in the last column.

        Returns:
            int in [0, 11], or None when ``balls`` is not one of 0, 1, 2, 3.
        """
        if balls not in (0, 1, 2, 3):
            return None
        column = int(strikes) if strikes in (0, 1) else 2
        return 3 * int(balls) + column

    # these values will definitely need to be adjusted and should probably be
    # hyperparameters
    def GenerateGroundTruthLabels(self, pitchCall):
        """Map a 'PitchCall' value to the ground-truth hitability label.

        Returns:
            -1 for 'BallCalled', 0 for 'BallIntentional' / 'HitByPitch',
            1 for 'StrikeSwinging' / 'StrikeCalled', 2 for anything else.
        """
        if pitchCall == 'BallCalled':
            return -1
        if pitchCall in ('BallIntentional', 'HitByPitch'):
            return 0
        if pitchCall in ('StrikeSwinging', 'StrikeCalled'):
            return 1
        return 2

    def __init__(self, csv_file, root_dir, train, validation, seed=0):
        """
        Args:
            csv_file (string): Name of the csv file, relative to root_dir.
            root_dir (string): Directory with data (trailing slash optional).
            train (bool): Keep the training slice. Takes precedence over
                ``validation``.
            validation (bool): Keep the validation slice when ``train`` is
                false. When both are false the testing slice is kept.
            seed (int or None): Seed for the row shuffle. Use the same value
                for every split of one experiment. None draws a fresh
                permutation per instance, in which case separately built
                splits can overlap.
        """
        # Local import: only this method needs it.
        import os

        # import the columns we will need for training and preprocessing
        self.root_dir = root_dir
        # join instead of string concatenation so "./data" and "./data/" both work
        csv_path = os.path.join(self.root_dir, csv_file)
        frame = pd.read_csv(csv_path, usecols=['Pitcher', 'PitcherThrows', 'Batter', 'BatterSide', 'PitchCall', 'KorBB', 'PlayResult', 'RunsScored', 'VertBreak', 'HorzBreak', 'ZoneSpeed', 'VertApprAngle', 'HorzApprAngle', 'ZoneTime', 'PlateLocHeight', 'PlateLocSide', 'Balls', 'Strikes', 'TaggedPitchType'])

        # run preprocessing functions
        frame['Zone'] = [self.PlateZone(height, side)
                         for height, side in zip(frame['PlateLocHeight'], frame['PlateLocSide'])]
        frame['BallStrikeNum'] = [self.PitchCount(balls, strikes)
                                  for balls, strikes in zip(frame['Balls'], frame['Strikes'])]
        frame['GroundTruth'] = frame['PitchCall'].map(self.GenerateGroundTruthLabels)

        # drop all features we no longer need (ones only used for preprocessing);
        # drop() returns a new frame, so the result has to be kept
        frame = frame.drop(columns=['PlateLocHeight', 'PlateLocSide', 'Balls', 'Strikes'])

        categorical_columns = ['Pitcher', 'Batter', 'PitchCall', 'KorBB', 'PlayResult', 'Zone', 'BallStrikeNum']
        for category in categorical_columns:
            frame[category] = frame[category].astype('category')

        self.bball_data = frame

        # print the labels of the features we will use to train our network
        print(self.bball_data.keys())

        # splitting data into training, validation, testing
        total_samples = len(self.bball_data.index)
        training_samples = math.floor(0.6*total_samples)
        validation_samples = math.ceil(0.2*total_samples)
        # testing gets whatever is left, which is what the slice below yields
        testing_samples = total_samples - training_samples - validation_samples

        sample_sum = training_samples + validation_samples + testing_samples

        print("total samples:", total_samples,
              "\ntraining samples:", training_samples,
              "\nvalidation samples:", validation_samples,
              "\ntesting samples:", testing_samples,
              "\nsum of training, validation, and test samples:", sample_sum)

        # makes shuffled version of the data; drop=True keeps the old row
        # numbers from being added as an extra 'index' column
        indices = np.random.RandomState(seed).permutation(total_samples)
        shuffled_bball_data = self.bball_data.iloc[indices].reset_index(drop=True)

        # keep the slice that belongs to the requested split
        validation_end = training_samples + validation_samples
        if train:
            start, stop, split_name = 0, training_samples, "training"
        elif validation:
            start, stop, split_name = training_samples, validation_end, "validation"
        else:
            start, stop, split_name = validation_end, total_samples, "testing"

        self.bball_data = shuffled_bball_data.iloc[start:stop]
        print(split_name)

    def __len__(self):
        """Return the number of rows in this split."""
        return len(self.bball_data)

    def __getitem__(self, idx):
        """Return the row at position ``idx`` of this split.

        Returns:
            dict with 'predictive_data' (one-row DataFrame without the
            'GroundTruth' column) and 'ground_truth' (one-element Series).

        NOTE(review): the values are pandas objects, not tensors, and
        SoftDecisionTree.train_ unpacks (data, target) pairs from its loader,
        so this return shape probably needs converting before it can be fed
        through a DataLoader. Confirm the intended format; it is left
        unchanged here to keep the interface stable.
        """
        row = self.bball_data.iloc[[idx]]
        ground_truth = row['GroundTruth']
        predictive_data = row.drop(columns=['GroundTruth'])
        return {'predictive_data': predictive_data, 'ground_truth': ground_truth}

    def getAllWithFeature(self, feature, key):
        """Return all rows of this split whose ``feature`` column equals ``key``.

        Returns:
            DataFrame of matching rows, or None when ``feature`` is not one of
            'Pitcher', 'PitcherThrows', 'Batter', 'BatterSide',
            'TaggedPitchType'.
        """
        possible_features = ['Pitcher', 'PitcherThrows', 'Batter', 'BatterSide', 'TaggedPitchType']

        if feature not in possible_features:
            return None

        return self.bball_data.loc[self.bball_data[feature] == key]

In [5]:
# Build the testing split and the training split from the same CSV file.
test_data = PitchingDataset(csv_file=bball_data_2_file, root_dir=data_folder,
                            train=False, validation=False)
train_data = PitchingDataset(csv_file=bball_data_2_file, root_dir=data_folder,
                             train=True, validation=False)

# Smoke test: count the testing rows whose 'BatterSide' is 'Right'.
# (The variable name is historical; it does not hold called balls.)
ballsCalled = test_data.getAllWithFeature(feature='BatterSide', key='Right')

print(len(ballsCalled))

FileNotFoundError: [Errno 2] No such file or directory: './dataBaseball Data-2.csv'

In [None]:
#This is an implementation of the soft decision tree model
#https://github.com/kimhc6028/soft-decision-tree/blob/master/model.py
import os
import time

import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


class InnerNode():
    """Routing node of the soft decision tree.

    Each node owns a one-output linear layer ``fc`` and a scalar ``beta``;
    ``sigmoid(beta * fc(x))`` is the probability of taking the right child.
    Constructing a node recursively constructs its whole subtree: children
    are InnerNodes while ``depth < args.max_depth`` and LeafNodes otherwise.

    Reads from ``args``: input_dim, cuda, lmbda, max_depth.
    """

    def __init__(self, depth, args):
        self.args = args
        self.fc = nn.Linear(self.args.input_dim, 1)
        beta = torch.randn(1)
        #beta = beta.expand((self.args.batch_size, 1))
        if self.args.cuda:
            beta = beta.cuda()
        self.beta = nn.Parameter(beta)
        self.leaf = False
        self.prob = None
        self.leaf_accumulator = []
        # the penalty weight halves with every level of depth
        self.lmbda = self.args.lmbda * 2 ** (-depth)
        self.build_child(depth)
        self.penalties = []

    def reset(self):
        """Clear the lists filled by cal_prob/get_penalty, for the whole subtree."""
        self.leaf_accumulator = []
        self.penalties = []
        self.left.reset()
        self.right.reset()

    def build_child(self, depth):
        """Create both children: inner nodes above max_depth, leaves at it."""
        if depth < self.args.max_depth:
            self.left = InnerNode(depth+1, self.args)
            self.right = InnerNode(depth+1, self.args)
        else:
            self.left = LeafNode(self.args)
            self.right = LeafNode(self.args)

    def forward(self, x):
        """Return the probability of routing ``x`` to the right child."""
        # torch.sigmoid replaces the deprecated F.sigmoid; same function
        return torch.sigmoid(self.beta * self.fc(x))

    def select_next(self, x):
        """Hard routing: return (child, prob), going left when prob < 0.5.

        NOTE(review): the ``prob < 0.5`` test is used as a Python condition,
        so this only works when ``prob`` holds a single element.
        """
        prob = self.forward(x)
        if prob < 0.5:
            return (self.left, prob)
        return (self.right, prob)

    def cal_prob(self, x, path_prob):
        """Propagate path probabilities down the subtree.

        Args:
            x: input handed to every node's ``forward``.
            path_prob: probability of reaching this node.

        Returns:
            This node's ``leaf_accumulator`` extended with the entries
            returned by the left subtree followed by the right subtree.
            The list keeps growing until ``reset`` is called.
        """
        self.prob = self.forward(x)  # probability of selecting right node
        self.path_prob = path_prob
        left_leaf_accumulator = self.left.cal_prob(x, path_prob * (1-self.prob))
        right_leaf_accumulator = self.right.cal_prob(x, path_prob * self.prob)
        self.leaf_accumulator.extend(left_leaf_accumulator)
        self.leaf_accumulator.extend(right_leaf_accumulator)
        return self.leaf_accumulator

    def get_penalty(self):
        """Collect (penalty, lmbda) pairs for this subtree.

        The penalty is the path-probability-weighted mean of ``self.prob``.
        Must be called after ``cal_prob``, which sets ``prob``/``path_prob``.

        NOTE(review): a node whose children are leaves computes its penalty
        but never appends it, so the deepest inner level contributes nothing.
        Kept as is because changing it would change the training loss.
        """
        penalty = (torch.sum(self.prob * self.path_prob) / torch.sum(self.path_prob), self.lmbda)
        if not self.left.leaf:
            left_penalty = self.left.get_penalty()
            right_penalty = self.right.get_penalty()
            self.penalties.append(penalty)
            self.penalties.extend(left_penalty)
            self.penalties.extend(right_penalty)
        return self.penalties


class LeafNode():
    """Leaf of the soft decision tree: a learned distribution over outputs.

    Holds one parameter vector of length ``args.output_dim``; its softmax is
    the class distribution this leaf predicts, independent of the input.

    Reads from ``args``: output_dim, cuda.
    """

    def __init__(self, args):
        self.args = args
        self.param = torch.randn(self.args.output_dim)
        if self.args.cuda:
            self.param = self.param.cuda()
        self.param = nn.Parameter(self.param)
        self.leaf = True
        # forward() feeds a (1, output_dim) tensor, so normalise over dim 1.
        # Stating it explicitly avoids the deprecated implicit-dim behaviour.
        self.softmax = nn.Softmax(dim=1)

    def forward(self):
        """Return the leaf distribution as a (1, output_dim) tensor."""
        return self.softmax(self.param.view(1, -1))

    def reset(self):
        """Nothing to clear on a leaf; present so InnerNode.reset can recurse."""
        pass

    def cal_prob(self, x, path_prob):
        """Return ``[[path_prob, Q]]`` for this leaf.

        ``Q`` is the leaf distribution expanded to one row per row of
        ``path_prob``. ``x`` is not used.
        """
        Q = self.forward()
        #Q = Q.expand((self.args.batch_size, self.args.output_dim))
        Q = Q.expand((path_prob.size()[0], self.args.output_dim))
        return [[path_prob, Q]]


class SoftDecisionTree(nn.Module):
    """Soft decision tree with its own optimizer and train/test loops.

    The tree is built from ``InnerNode(1, args)``; the nodes are plain Python
    objects, so ``collect_parameters`` registers their tensors on this module
    to make them visible to ``self.parameters()``.

    Reads from ``args``: input_dim, output_dim, max_depth, lmbda, cuda, lr,
    momentum, batch_size, log_interval.

    No ``forward`` is defined; predictions are the second value returned by
    ``cal_loss`` (the distribution of the most probable leaf per sample).
    """

    def __init__(self, args):
        super(SoftDecisionTree, self).__init__()
        self.args = args
        self.root = InnerNode(1, self.args)
        self.collect_parameters() ##collect parameters and modules under root node
        self.optimizer = optim.SGD(self.parameters(), lr=self.args.lr, momentum=self.args.momentum)
        self.test_acc = []
        self.define_extras(self.args.batch_size)
        self.best_accuracy = 0.0

    def define_extras(self, batch_size):
        """(Re)allocate the per-batch buffers for the given batch size.

        Creates ``target_onehot`` (batch_size x output_dim, zeros) and
        ``path_prob_init`` (batch_size x 1, ones), on the GPU if args.cuda.
        """
        self.target_onehot = torch.zeros(batch_size, self.args.output_dim, dtype=torch.float)
        self.path_prob_init = torch.ones(batch_size, 1)
        if self.args.cuda:
            self.target_onehot = self.target_onehot.cuda()
            self.path_prob_init = self.path_prob_init.cuda()

    def cal_loss(self, x, y):
        """Compute the training loss and the hard prediction for one batch.

        Args:
            x: batch of inputs, one row per sample.
            y: one-hot targets, shape (batch, output_dim).

        Returns:
            (loss, output): ``loss`` is the negated mean over samples of
            sum_leaves path_prob * <y, log Q>, plus the penalty term C;
            ``output`` stacks, per sample, the distribution Q of the leaf
            with the highest path probability (first leaf wins ties).

        Requires ``path_prob_init`` to match the batch size of ``x``.
        """
        batch_size = y.size()[0]
        leaf_accumulator = self.root.cal_prob(x, self.path_prob_init)
        loss = 0.
        best_prob = [-1. for _ in range(batch_size)]
        best_Q = [torch.zeros(self.args.output_dim) for _ in range(batch_size)]
        for (path_prob, Q) in leaf_accumulator:
            # per-sample dot product between the one-hot target and log Q
            TQ = torch.bmm(y.view(batch_size, 1, self.args.output_dim), torch.log(Q).view(batch_size, self.args.output_dim, 1)).view(-1,1)
            loss += path_prob * TQ
            path_prob_numpy = path_prob.cpu().data.numpy().reshape(-1)
            for i in range(batch_size):
                if best_prob[i] < path_prob_numpy[i]:
                    best_prob[i] = path_prob_numpy[i]
                    best_Q[i] = Q[i]
        loss = loss.mean()
        penalties = self.root.get_penalty()
        C = 0.
        for (penalty, lmbda) in penalties:
            C -= lmbda * 0.5 *(torch.log(penalty) + torch.log(1-penalty))
        output = torch.stack(best_Q)
        self.root.reset() ##reset all stacked calculation
        # -log(loss) would be NaN because loss is always below zero, so the
        # plain negation is used (upstream author suspects a mistake in the paper)
        return (-loss + C, output)

    def collect_parameters(self):
        """Register every node's tensors on this module (breadth-first).

        Leaf ``param`` and inner ``beta`` go into ``param_list``; each inner
        ``fc`` layer goes into ``module_list``.
        """
        nodes = [self.root]
        self.module_list = nn.ModuleList()
        self.param_list = nn.ParameterList()
        while nodes:
            node = nodes.pop(0)
            if node.leaf:
                self.param_list.append(node.param)
            else:
                nodes.append(node.right)
                nodes.append(node.left)
                self.param_list.append(node.beta)
                self.module_list.append(node.fc)

    def _prepare_batch(self, data, target):
        """Move a batch to the device, flatten it and fill ``target_onehot``.

        Returns the flattened data (batch, -1) and the (device-moved) target.
        The buffers are reallocated whenever their size differs from the
        batch, so a short batch followed by a full one is handled too.
        """
        if self.args.cuda:
            data, target = data.cuda(), target.cuda()
        target_ = target.view(-1, 1)
        batch_size = target_.size()[0]
        data = data.view(batch_size, -1)
        if batch_size != self.target_onehot.size(0):
            self.define_extras(batch_size)
        ##convert int target to one-hot vector
        self.target_onehot.zero_()
        self.target_onehot.scatter_(1, target_, 1.)
        return data, target

    def train_(self, train_loader, epoch):
        """Run one optimisation pass over ``train_loader``.

        The loader must yield (data, target) pairs with integer class
        targets. Progress is printed every ``args.log_interval`` batches;
        ``epoch`` is only used in that message.
        """
        self.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = self._prepare_batch(data, target)
            self.optimizer.zero_grad()
            loss, output = self.cal_loss(data, self.target_onehot)
            loss.backward()
            self.optimizer.step()
            pred = output.data.max(1)[1] # index of the most probable class
            correct = pred.eq(target.data).cpu().sum()
            accuracy = 100. * correct / len(data)

            if batch_idx % self.args.log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, Accuracy: {}/{} ({:.4f}%)'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item(),
                    correct, len(data),
                    accuracy))

    def test_(self, test_loader, epoch):
        """Evaluate on ``test_loader`` and checkpoint on a new best accuracy.

        Appends the accuracy (percent) to ``self.test_acc``; when it beats
        ``self.best_accuracy`` the model is pickled to ./result.
        ``epoch`` is unused.
        """
        self.eval()
        correct = 0
        # evaluation needs no gradients; skip building the autograd graph
        with torch.no_grad():
            for data, target in test_loader:
                data, target = self._prepare_batch(data, target)
                _, output = self.cal_loss(data, self.target_onehot)
                pred = output.data.max(1)[1] # index of the most probable class
                correct += pred.eq(target.data).cpu().sum()
        accuracy = 100. * correct / len(test_loader.dataset)
        print('\nTest set: Accuracy: {}/{} ({:.4f}%)\n'.format(
            correct, len(test_loader.dataset),
            accuracy))
        self.test_acc.append(accuracy)

        if accuracy > self.best_accuracy:
            self.save_best('./result')
            self.best_accuracy = accuracy

    def save_best(self, path):
        """Pickle this model to ``<path>/best_model.pkl``.

        The directory is created if missing (the given ``path``, not a fixed
        './result'). Only load such pickles from trusted sources.
        """
        os.makedirs(path, exist_ok=True)

        with open(os.path.join(path, 'best_model.pkl'), 'wb') as output_file:
            pickle.dump(self, output_file)

In [None]:
#This is driver code used to train the soft decision tree model
#https://github.com/kimhc6028/soft-decision-tree/blob/master/main.py
from __future__ import print_function
import sys,os
import argparse
import pickle
import torch
from torchvision import datasets, transforms
# The notebook kernel passes its own command-line arguments; clear them so
# parse_args() below falls back to the defaults declared here.
sys.argv = ['']

# Build the datasets before the parser so the model's input width can be
# derived from the data (the undefined `training_data` raised a NameError).
train_dataset = PitchingDataset(bball_data_2_file,data_folder,train=True,validation=False)
test_dataset = PitchingDataset(bball_data_2_file,data_folder,train=False,validation=False)
training_data = train_dataset.bball_data
# every column except the 'GroundTruth' label is returned as predictive data
num_input_features = len(training_data.columns) - 1

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                    help='input batch size for training (default: 64)')
parser.add_argument('--input-dim', type=int, default=num_input_features, metavar='N', #set to number of columns we are using
                    help='input dimension size (default: number of feature columns)')
parser.add_argument('--output-dim', type=int, default=10, metavar='N',
                    help='output dimension size(default: 10)')
parser.add_argument('--max-depth', type=int, default=8, metavar='N',
                    help='maximum depth of tree(default: 8)')
parser.add_argument('--epochs', type=int, default=5, metavar='N',
                    help='number of epochs to train (default: 5)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--lmbda', type=float, default=0.1, metavar='LR',
                    help='temperature rate (default: 0.1)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')



args = parser.parse_args()
# use the GPU only when it is both available and not disabled by --no-cuda
args.cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# try:
#     os.makedirs('./data')
# except:
#     print('directory ./data already exists')

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size, shuffle=True, **kwargs)

# validation_loader = torch.utils.data.DataLoader(
#     PitchingDataset(bball_data_2_file,data_folder,train=False,validation=True),
#     batch_size=args.batch_size, shuffle=True, **kwargs)

test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=args.batch_size, shuffle=True, **kwargs)

# train_loader = torch.utils.data.DataLoader(
#     datasets.MNIST('./data', train=True, download=True,
#                    transform=transforms.Compose([
#                        transforms.ToTensor(),
#                        transforms.Normalize((0.1307,), (0.3081,))
#                    ])),
#     batch_size=args.batch_size, shuffle=True, **kwargs)
# test_loader = torch.utils.data.DataLoader(
#     datasets.MNIST('./data', train=False, transform=transforms.Compose([
#         transforms.ToTensor(),
#         transforms.Normalize((0.1307,), (0.3081,))
#     ])),
#     batch_size=args.batch_size, shuffle=True, **kwargs)

def save_result(acc):
    """Pickle ``acc`` to ./result/bp.pickle, creating ./result if needed.

    Args:
        acc: any picklable object (the driver code passes the trained model).
    """
    # exist_ok replaces the bare try/except, which also hid unrelated errors
    os.makedirs('./result', exist_ok=True)
    filename = os.path.join('./result/', 'bp.pickle')
    # the with-block closes the file even when pickling fails
    with open(filename, 'wb') as f:
        pickle.dump(acc, f)

# Build the tree from the parsed settings; move it to the GPU when enabled.
model = SoftDecisionTree(args)
if args.cuda:
    model.cuda()

# One training pass followed by one evaluation pass per epoch (1-based).
last_epoch = args.epochs
for epoch in range(1, last_epoch + 1):
    model.train_(train_loader, epoch)
    model.test_(test_loader, epoch)

# Persist the trained model object to ./result/bp.pickle.
save_result(model)