In [1]:
import numpy as np
import time

In [2]:
class Agent:
    def __init__(self, num_balls, num_matches):
        self.total_wickets = 4
        self.num_balls = num_balls
        self.num_matches = num_matches
        self.total_batsman = self.total_wickets

        self.q_values = np.ndarray((self.num_balls, self.total_wickets, self.total_batsman, 6), dtype=np.float32)
        self.q_values.fill(0.0)

        self.epsilon = 0.1
        self.alpha = 0.1
        self.gamma = 0.9

        self.balls = 0
        self.wickets = 0
        self.batsman_order = np.array([0, 1, 2, 3])
        self.match_count = 0
        self.last_action = None
        self.next_action = None
        self.possible_orders = None

        self.generate_possible_orders()

    def get_batsman(self, index: int = None):
        if index is None:
            return self.batsman_order[self.wickets]

        return self.batsman_order[index]

    def policy(self):
        p = np.random.rand()
        if p < self.epsilon:
            return np.random.randint(0, 6)
        else:
            return np.argmax(self.q_values[self.balls][self.wickets][self.get_batsman()])

    def generate_possible_orders(self):
        possible_orders = []
        for i in range(4):
            for j in range(4):
                if i == j:
                    continue
                for k in range(4):
                    if i == k or j == k:
                        continue
                    for x in range(4):
                        if i == x or j == x or k == x:
                            continue
                        possible_orders.append([i, j, k, x])
        self.possible_orders = possible_orders

    def get_batting_order(self):
        while self.match_count < 10 * len(self.possible_orders):
            self.batsman_order = self.possible_orders[self.match_count % 24]
            self.match_count += 1
            return self.batsman_order

        best_order = None
        best_score = -1

        for order in self.possible_orders:
            score = 0
            for i, batsman in enumerate(order):
                q_values = self.q_values[:, i, batsman, :]
                score += np.sum(np.max(q_values, axis=1))
            if score > best_score:
                best_score = score
                best_order = order

        p = np.random.rand()
        if p < self.epsilon:
            best_order = self.possible_orders[np.random.randint(0, 24)]
        else:
            best_order = best_order

        self.batsman_order = best_order
        return self.batsman_order
     
    def get_action(self, wicket, runs_scored):
        if self.balls == 0:
            self.last_action = self.policy()
            self.balls += 1
            return self.last_action

        self.wickets += wicket
        self.balls += 1

        if self.balls == self.num_balls or self.wickets == self.total_wickets:
            self.balls = 0
            self.wickets = 0
            self.last_action = None
            self.next_action = None

            self.last_action = self.policy()
            self.balls += 1
            return self.last_action

        q_values_old = self.q_values[self.balls - 1][self.wickets - wicket][
            self.get_batsman(self.wickets - wicket)
        ]
        q_values_new = self.q_values[self.balls][self.wickets][self.get_batsman()]

        new_q_value = 0
        self.next_action = self.policy()
        new_q_value = (
            runs_scored
            - 0.5
            + self.gamma * q_values_new[self.next_action]
            - q_values_old[self.last_action]
        )

        q_values_old[self.last_action] += self.alpha * new_q_value

        self.last_action = self.next_action
        return self.last_action

In [3]:
wickets = 4

p_out = np.array([0.001, 0.01, 0.02, 0.03, 0.1, 0.3])
p_out2 = np.array([0.003, 0.03, 0.04, 0.04, 0.2, 0.35])
p_out3 = np.array([0.007, 0.05, 0.06, 0.08, 0.3, 0.35])
p_out4 = np.array([0.01, 0.08, 0.08, 0.1, 0.2, 0.4])

p_run = np.array([1, 0.9, 0.85, 0.8, 0.75, 0.7])
p_run2 = np.array([0.95, 0.85, 0.8, 0.75, 0.7, 0.65])
p_run3 = np.array([0.9, 0.8, 0.75, 0.7, 0.65, 0.4])
p_run4 = np.array([0.85, 0.7, 0.6, 0.55, 0.55, 0.3])


class Environment:
    def __init__(self, num_balls, agent):
        self.num_balls = num_balls
        self.agent = agent
        self.__run_time = 0
        self.__total_runs = 0
        self.__total_wickets = 0
        self.__runs_scored = 0
        self.__start_time = 0
        self.__end_time = 0
        self.__p_out = np.array([p_out, p_out2, p_out3, p_out4])
        self.__p_run = np.array([p_run, p_run2, p_run3, p_run4])
        self.__action_runs_map = np.array([0, 1, 2, 3, 4, 6])
        self.__wickets_left = wickets
        self.__wicket = 0
        self.__runs_scored = 0
        self.__start_time = 0
        self.__end_time = 0
        self.__batting_order = np.array([0, 1, 2, 3])
        self.__current_batter = self.__batting_order[self.__wickets_left - 1]

    def __get_action(self):
        self.__start_time = time.time()
        action = self.agent.get_action(self.__wicket, self.__runs_scored)
        self.__end_time = time.time()
        self.__run_time = self.__run_time + self.__end_time - self.__start_time
        return action

    def __get_outcome(self, action):
        self.__current_batter = self.__batting_order[self.__wickets_left - 1]
        pout = self.__p_out[self.__current_batter][action]
        prun = self.__p_run[self.__current_batter][action]
        wicket = np.random.choice(2, 1, p=[1 - pout, pout])[0]
        runs = 0
        if wicket == 0:
            runs = (
                self.__action_runs_map[action]
                * np.random.choice(2, 1, p=[1 - prun, prun])[0]
            )
        return wicket, runs

    def innings(self):
        self.__wickets_left = wickets
        self.__runs_scored = 0
        self.__total_runs = 0
        self.__total_wickets = 0
        self.__run_time = 0
        self.__start_time = 0
        self.__end_time = 0

        self.__batting_order = self.agent.get_batting_order()

        for ball in range(self.num_balls):
            if self.__wickets_left > 0:
                action = self.__get_action()
                self.__wicket, self.__runs_scored = self.__get_outcome(action)
                self.__total_runs = self.__total_runs + self.__runs_scored
                if self.__wicket > 0:
                    self.__wickets_left = self.__wickets_left - 1
                self.__total_wickets = self.__total_wickets + self.__wicket
                if self.__wickets_left == 0:
                    self.__get_action()
        return self.__total_runs, self.__total_wickets, self.__run_time

In [4]:
num_matches = 1000
num_balls = 60
agent = Agent(num_balls, num_matches)
environment = Environment(num_balls, agent)
score = np.zeros((num_matches, 1))
run_time = np.zeros((num_matches, 1))
wicket = np.zeros((num_matches, 1))

last_100_avgs = []
for i in range(num_matches):
    score[i], wicket[i], run_time[i] = environment.innings()
    last_100_avgs.append(score[i])
    if (i + 1) % 100 == 0:
#         print(agent.batsman_order)
#         print("Score: ", score[i], "Wickets: ", wicket[i])
        print("Average of last 100 matches: ", np.mean(last_100_avgs))
        last_100_avgs = []
        

Average of last 100 matches:  52.51
Average of last 100 matches:  62.22
Average of last 100 matches:  60.1
Average of last 100 matches:  60.73
Average of last 100 matches:  56.49
Average of last 100 matches:  60.54
Average of last 100 matches:  60.17
Average of last 100 matches:  59.3
Average of last 100 matches:  64.81
Average of last 100 matches:  62.31
