In [1]:
# Package imports
from pandas import DataFrame, read_csv
from numpy import ceil
from keras import models, layers
from keras.utils import to_categorical

Using TensorFlow backend.


In [55]:
# Global variables
DIR_DATA = r"C:\Users\liaml\Dropbox\Projects\TestMatch\Data Analysis\data"
TRAIN_PROP = 0.7

# Load data
raw_data = DataFrame()
filenames = ['04092019-AUSENG-1AUS', '03122019-NZLENG-1NZL']
for f in filenames:
    raw_data = raw_data.append(read_csv(DIR_DATA + "/" + f + ".csv"))

    
# Determine all outcomes present in data
OUTCOME_INT = {}
ind = 1
for o in raw_data['outcome']:
    if o not in OUTCOME_INT.keys() and o != 'W':
        OUTCOME_INT.update({o: ind})
        ind += 1

# Ensure wicket is first index
OUTCOME_INT.update({'W': 0})

# Remove all rows with "-"
raw_data = raw_data[raw_data.bowl_avg != "-"]
raw_data = raw_data[raw_data.bat_avg != "-"]

# Shuffle data and split into training, testing
shuffled_data = raw_data.sample(frac=1)
break_ind = ceil(TRAIN_PROP*len(shuffled_data.index)).astype(int)
train_data = shuffled_data.iloc[:break_ind]
test_data = shuffled_data.iloc[break_ind:]

print("No. outcomes in dataset: %i" % ind)
print("No. training samples: %i" % len(train_data))
print("No. test samples: %i" % len(test_data))

# Prepare data
train_outcome = to_categorical([OUTCOME_INT[o] for o in train_data['outcome'].values])
train_input = train_data.drop('outcome', axis=1).values
test_outcome = to_categorical([OUTCOME_INT[o] for o in train_data['outcome'].values])
test_input = train_data.drop('outcome', axis=1).values
print(train_input)

# Delete data to save space
del raw_data, shuffled_data, break_ind, train_data, test_data

No. outcomes in dataset: 16
No. training samples: 1052
No. test samples: 450
[[1 7 424 ... 3 '28.68' 2]
 [1 5 319 ... 3 '28.68' 2]
 [1 7 394 ... 2 '44.77' 1]
 ...
 [1 2 84 ... 0 '44.77' 1]
 [1 3 160 ... 1 '44.77' 1]
 [1 5 248 ... 1 '31.37' 7]]


## Data Description
Each individual delivery in a test match is encapsulated by the following variables:

| Name | Description |
| --- | --- |
| ```outcome``` | Outcome of delivery |
| ```dismiss``` | Mode of dimissial, 'no' indicates not out |
| ```innings``` | Innings in match |
| ```inns_balls``` | Balls bowled in innings *before* the delivery is bowled |
| ```team_wkts``` | Wickets fallen in innings before the delivery is bowled |
| ```team_score``` | Score of batting team before the delivery is bowled |
| ```team_lead``` | Lead of batting team before the delivery is bowled. Negative values indicate a deficit |
| ```bat_score``` | Score of batter facing the delivery before it is bowled |
| ```bat_balls``` | Number of balls faced by the batter facing the delivery *before* it is bowled |
| ```bat_avg``` | Batting average of batter facing the delivery before the start of the match |
| ```bat_arm``` | The hand used by the batter facing the delivery |
| ```bowl_balls``` | The number of balls bowled (including extras) in the innings by the bowler before the delivery is bowled |
| ```bowl_runs``` | The number of runs conceded in the innings by the bowler before the delivery is bowled |
| ```bowl_wkts``` | The number of wickets taken in the innings by the bowler before the delivery is bowled |


The categorical values with non-numeric levels are encoded as follows:
* Outcome of delivery

## Possible Loss Functions
* **Brier score:** For $N$ samples and $R$ possible classes,
    $$
        BS = \frac1N\sum_{t=1}^N{\sum_{i=1}^N\left(f_{i} - o_{ti} \right)},
    $$
    where $f_{ti}$ is the predicted probability that sample $t$ is in class $i$, and 
    $$
    o_{ti} = \begin{cases}
        1, & \text{sample } t \text{ is actually in class } i, \\
        0, & \text{otherwise}.
    \end{cases}
    $$


In [58]:
# Setup model
network = models.Sequential()
network.add(layers.Dense(416, activation='relu', input_shape=(13,)))
network.add(layers.Dense(ind, activation='softmax'))

# Compile
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

In [61]:
# Train model
network.fit(train_input, train_outcome, epochs=25, batch_size=128)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.callbacks.History at 0x294c909fc88>