<a href="https://colab.research.google.com/github/inforeqd512/QLearning/blob/main/Tribe_Q_learning_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [431]:
#import libraries
import numpy as np
import math
import copy #for deepcopy()
from enum import Enum
!python -V

Python 3.7.11


In [432]:
class Business_Rules:
  """Static configuration for the chapter re-structuring problem.

  Every instance carries the same values:
    target_dr_per_lead: number of direct reports (DRs) per leader to aim for.
    actions: names of the actions available to the agent.
    ceil_total_leaders_leading: upper bound used when simulating the number
      of leaders for a random episode.
    ceil_total_people_in_grade: upper bound used when simulating the head
      count of each grade for a random episode.
    grades: grade labels; they are the keys of the people_gradewise dicts.
  """

  def __init__(self):
    # What the agent may do, and the ratio it is trying to reach.
    self.actions = [
        'reassign',
        'promote_dr',
        'hire',
    ]
    self.target_dr_per_lead = 5

    # Grade labels and the bounds used for simulated episodes.
    self.grades = [
        "3", "4.1", "4.2", "4.3",
        "4.4", "5.1", "5.2", "MS",
    ]
    self.ceil_total_leaders_leading = 10
    self.ceil_total_people_in_grade = 50


In [433]:
class Chapter_Stats:
  """Head-count summary of one chapter.

  Args:
    chapter: chapter name (only stored and printed).
    total_leaders_leading: number of people currently leading others.
    people_gradewise: dict mapping grade label -> head count. When omitted,
      a fresh all-zero dict is created for this instance (a shared mutable
      default would leak changes from one instance into the next). A dict
      passed explicitly is stored by reference, as before.

  Attributes:
    chapter, total_leaders_leading, people_gradewise: the arguments above.
    total_DRs: total head count minus total_leaders_leading.
    total_leader_grades: head count in grades "3", "4.1", "4.2" and "4.3".
  """

  # Grades whose members are counted in total_leader_grades.
  _LEADER_GRADES = ("3", "4.1", "4.2", "4.3")

  def __init__(self, chapter, total_leaders_leading, people_gradewise=None):
    if people_gradewise is None:
      # Build the default per call so instances never share one dict.
      people_gradewise = {"3":0, "4.1":0, "4.2":0, "4.3":0, "4.4":0, "5.1":0, "5.2":0, "MS":0}

    self.chapter = chapter
    self.total_leaders_leading = total_leaders_leading
    self.people_gradewise = people_gradewise

    total_people = sum(people_gradewise.values())
    self.total_leader_grades = sum(
        count for grade, count in people_gradewise.items()
        if grade in self._LEADER_GRADES)
    self.total_DRs = total_people - total_leaders_leading

  def print_info(self):
    """Print the chapter name and all derived counts on one line."""
    print("Chapter_Stats")
    print(self.chapter, self.total_leaders_leading, self.total_DRs, self.people_gradewise, self.total_leader_grades)

In [434]:
class State:
  """Snapshot of a chapter's leadership structure, as seen by the agent.

  Built from a Chapter_Stats-like object (total_leaders_leading, total_DRs,
  total_leader_grades) and a Business_Rules-like object (target_dr_per_lead).
  """

  def __init__(self, chapter_stats, business_rules):
    leaders = chapter_stats.total_leaders_leading
    self.total_leaders_leading = leaders

    # The raw DRs-per-leader ratio is echoed before being rounded up.
    raw_ratio = chapter_stats.total_DRs / leaders
    print(raw_ratio)
    self.drs_per_lead = math.ceil(raw_ratio)

    # People in leader grades who are not currently leading anyone.
    self.total_leader_grades_unutilised = chapter_stats.total_leader_grades - leaders
    self.target_dr_per_lead = business_rules.target_dr_per_lead

    over_target = self.drs_per_lead > self.target_dr_per_lead
    self.dr_per_lead_gt_target_and_leaders_unutilised = over_target and (self.total_leader_grades_unutilised > 0)

  def print_info(self):
    """Print every field of the state, one per line, after two blank lines."""
    print(
        "\n\ntotal_leaders_leading", self.total_leaders_leading,
        "\ndrs_per_lead", self.drs_per_lead,
        "\ntotal_leader_grades_unutilised", self.total_leader_grades_unutilised,
        "\ntarget_dr_per_lead", self.target_dr_per_lead,
        "\ndr_per_lead_gt_target_and_leaders_unutilised", self.dr_per_lead_gt_target_and_leaders_unutilised,
    )

  def state_hash(self):
    """Return the hash of the tuple of all five fields (used as the Q-table state key)."""
    fields = (
        self.total_leaders_leading,
        self.drs_per_lead,
        self.total_leader_grades_unutilised,
        self.target_dr_per_lead,
        self.dr_per_lead_gt_target_and_leaders_unutilised,
    )
    return hash(fields)

  def state_copy(self):
    """Return an independent deep copy of this state."""
    return copy.deepcopy(self)

In [435]:
class ActionList:
  """Parallel lists of Action objects and their hashes, built from action names."""

  def __init__(self, action_strings):
    # One Action per name, in the order given.
    self.actions = [Action(name) for name in action_strings]
    # Same order as self.actions, so the two lists can be zipped.
    self.list_action_hash = [entry.action_hash for entry in self.actions]

In [436]:
class Action:
  """A named action the agent can take, plus the hash used as its Q-table key.

  Attributes:
    action: the action name ("reassign", "promote_dr" or "hire").
    action_hash: hash(action). Python salts str hashes per process unless
      PYTHONHASHSEED is fixed, so this value differs between kernel sessions.
  """

  def __init__(self, action_string):
    self.action = action_string
    self.action_hash = hash(action_string)

  def perform_action(self, state):
    """Apply this action to `state` and return the resulting state.

    Raises:
      ValueError: if the action name is not one of the known actions.
        Previously this fell through and returned None, which only surfaced
        later as an unrelated AttributeError in the caller.
    """
    if self.action == "reassign":
      return Reassign.run(state)
    elif self.action == "promote_dr":
      return Promote_DR.run(state)
    elif self.action == "hire":
      return Hire.run(state)
    raise ValueError("Unknown action: %r" % (self.action,))

  def print_info(self):
    """Print the action name followed by its hash."""
    print(self.action, self.action_hash)

In [437]:
class Reassign:
  """'reassign' action: put one unutilised leader-grade person in charge of DRs."""

  def __init__(self):
    # Nothing to set up; the class is only used through its classmethods.
    pass

  @classmethod
  def run(cls, state):
    """Apply the reassignment to `state` and return the resulting state."""
    return cls.reassigned_leader(state)

  @classmethod
  def reassigned_leader(cls, state):
    """Return the state after one more leader starts leading.

    When the state's dr_per_lead_gt_target_and_leaders_unutilised flag is
    false, the very same state object is returned. Otherwise a copy is
    returned with one more leader, one fewer unutilised leader, one fewer
    DR, and the ratio and flag recomputed.
    """
    if not state.dr_per_lead_gt_target_and_leaders_unutilised:
      return state

    successor = state.state_copy()
    successor.total_leaders_leading += 1
    successor.total_leader_grades_unutilised -= 1

    # The DR total is rebuilt from the stored ratio; the new leader is no longer a DR.
    remaining_drs = state.total_leaders_leading * state.drs_per_lead
    remaining_drs -= 1
    successor.drs_per_lead = remaining_drs / successor.total_leaders_leading

    still_over_target = successor.drs_per_lead > successor.target_dr_per_lead
    successor.dr_per_lead_gt_target_and_leaders_unutilised = still_over_target and (successor.total_leader_grades_unutilised > 0)
    return successor

In [438]:
class Promote_DR:
  """'promote_dr' action. Placeholder: it currently leaves the state untouched."""

  def __init__(self):
    pass

  @classmethod
  def run(cls, state):
    """Return `state` itself; promotion is not modelled yet."""
    return state

In [439]:
class Hire:
  """'hire' action. Placeholder: it currently leaves the state untouched."""

  def __init__(self):
    pass

  @classmethod
  def run(cls, state):
    """Return `state` itself; hiring is not modelled yet."""
    return state

In [440]:
class Q_table:
  """
  Tabular Q-function.

  dict[key(state hash, action hash)] = q-values

  The key is the string "<state_hash> <action_hash>". Reading a pair that has
  not been seen yet inserts it with value 0.
  """

  def __init__(self):
    self.Q = {}

  def getStateActionHashPairKey(self, state_hash, action_hash):
    """ Returns state-pair hash key, requires separate state and action hash keys first """
    return str(state_hash)+" "+str(action_hash)

  def getActionHashListForNewState(self, actions):
    """ Returns the action_hash of every action in `actions`, in order """
    return [action.action_hash for action in actions]

  def getValueQ(self, state_hash, action_hash):
    """ Get expected reward given an action in a given state,
        returns 0 if the state-action pair has not been seen before
        (and records that 0 in the table).
        Input is state and action hash key                          """
    state_action_key = self.getStateActionHashPairKey(state_hash, action_hash)
    return self.Q.setdefault(state_action_key, 0)

  def setValueQ(self, state_hash, action_hash, value):
    """ Set value in Q """
    state_action_key = self.getStateActionHashPairKey(state_hash, action_hash)
    self.Q[state_action_key] = value

  def getBestAction(self, state_hash, list_action_hash, list_actions):
    """ Get best action given a set of possible actions in a given state.

        Returns the first action with the largest Q-value. The running
        maximum starts from the first candidate rather than from 0, so the
        result is also correct when every Q-value is negative.
        Raises IndexError when list_actions is empty.                    """
    best_action = list_actions[0]
    best_q = None
    for a_hash, action in zip(list_action_hash, list_actions):
      candidate_q = self.getValueQ(state_hash, a_hash)
      # Strict comparison keeps the earliest action on ties.
      if best_q is None or candidate_q > best_q:
        best_q = candidate_q
        best_action = action

    return best_action

  def getMaxQ(self, state_hash, list_action_hash):
    """ Returns the maximum Q value given a state and list of actions (input is hash keys).

        Returns 0 when the list is empty. A true maximum is taken, so
        all-negative Q-values are no longer reported as 0.             """
    return max((self.getValueQ(state_hash, a_hash) for a_hash in list_action_hash), default=0)

  def updateQ(self, old_state, action_to_perform, reward, new_state, next_possible_actions, discount_factor, learning_rate):
    """ Implements Q-learning iterative algorithm:

        Q(s, a) <- Q(s, a) + learning_rate * (reward + discount_factor * max_a' Q(s', a') - Q(s, a))

        old_state / new_state must provide state_hash(); action_to_perform and
        every entry of next_possible_actions must provide action_hash.        """
    state_hash = old_state.state_hash()
    action_hash = action_to_perform.action_hash

    # Get current Q Value
    old_q_value = self.getValueQ(state_hash, action_hash)

    # Find max Q value given the possible set of actions in the next state
    list_action_hash = self.getActionHashListForNewState(next_possible_actions)
    max_nextQ = self.getMaxQ(new_state.state_hash(), list_action_hash)

    temporal_difference = reward + (discount_factor * max_nextQ) - old_q_value

    #update the Q-value for the previous state and action pair
    new_q_value = old_q_value + (learning_rate * temporal_difference)
    self.setValueQ(state_hash, action_hash, new_q_value)

  def print_info(self):
    """ Print the whole table """
    print("\n\nQ_table")
    print(self.Q)

In [441]:
class Reward:
  """Scores a (state, action) pair for the Q-learning loop."""

  def __init__(self):
    pass

  def reward_function(self, old_state, action_to_perform):
    """
    Score the transition produced by applying `action_to_perform` to `old_state`.

    Returns
      -100 when the resulting state has no unutilised leader grades left
           (possible promotion scenario) or has zero DRs per leader,
       100 when the resulting DRs-per-leader is above 0 and at most the target,
      otherwise the drop in DRs-per-leader (old minus new): positive when the
      action moves towards the target, negative when it moves away.
    """
    resulting = action_to_perform.perform_action(old_state)

    # Running out of spare leader grades is checked before anything else.
    if resulting.total_leader_grades_unutilised == 0:
      return -100

    ratio = resulting.drs_per_lead
    if ratio == 0:
      return -100
    if ratio > 0 and ratio <= resulting.target_dr_per_lead:
      return 100
    return old_state.drs_per_lead - ratio

In [442]:
class Environment:
  """Bundles the business rules, the available actions and the reward function."""

  def __init__(self):
    rules = Business_Rules()
    self.business_rules = rules
    self.action_list = ActionList(rules.actions)
    self.reward = Reward()

  def possible_actions_in_state(self, state):
    """Return the actions available in `state`; currently the full list for every state."""
    return self.action_list.actions

  def hasReachedTerminalState(self, reward):
    """Return True when `reward` is one of the two terminal values, 100 or -100."""
    return reward in (100, -100)

In [443]:
class Episode:
  """Starting state for one training episode.

  Episode numbers below len(preconditioned_episodes) start from the
  hand-written chapter at that index; any higher number starts from a
  randomly simulated chapter drawn with np.random.
  """

  # Hand-written chapters; people_gradewise maps grade label -> head count.
  preconditioned_episodes = [
      {"chapter": "Analyst", "total_leaders_leading": 2,
       "people_gradewise": {"3":0, "4.1":1, "4.2":6, "4.3":6, "4.4":1, "5.1":0, "5.2":2, "MS":7}},
      {"chapter": "ios", "total_leaders_leading": 1,
       "people_gradewise": {"3":0, "4.1":0, "4.2":5, "4.3":6, "4.4":2, "5.1":0, "5.2":0, "MS":14}},
      {"chapter": "android", "total_leaders_leading": 1,
       "people_gradewise": {"3":0, "4.1":0, "4.2":4, "4.3":4, "4.4":4, "5.1":0, "5.2":0, "MS":9}},
      {"chapter": "be engg", "total_leaders_leading": 2.5,
       "people_gradewise": {"3":2.5, "4.1":3, "4.2":4, "4.3":6, "4.4":5, "5.1":1, "5.2":2, "MS":27}},
      {"chapter": "delivery leads", "total_leaders_leading": 1,
       "people_gradewise": {"3":0, "4.1":1, "4.2":3.5, "4.3":0.5, "4.4":0.8, "5.1":0, "5.2":0, "MS":3}},
      {"chapter": "environment", "total_leaders_leading": 1,
       "people_gradewise": {"3":0, "4.1":0, "4.2":2, "4.3":1, "4.4":1, "5.1":0, "5.2":0, "MS":2}},
      {"chapter": "sre", "total_leaders_leading": 1,
       "people_gradewise": {"3":1, "4.1":1, "4.2":1, "4.3":7, "4.4":4, "5.1":0, "5.2":0, "MS":3}},
      {"chapter": "testing", "total_leaders_leading": 1,
       "people_gradewise": {"3":0, "4.1":0, "4.2":1.5, "4.3":3, "4.4":2, "5.1":1, "5.2":0, "MS":18}},
      {"chapter": "reassign wont work", "total_leaders_leading": 2.5,
       "people_gradewise": {"3":2.5, "4.1":3, "4.2":0, "4.3":0, "4.4":5, "5.1":1, "5.2":2, "MS":27}},
  ]

  def __init__(self, episode_number):
    self.episode_number = episode_number
    self.state = None
    business_rules = Business_Rules()

    if self.episode_number >= len(self.preconditioned_episodes):
      # Simulated chapter: leader count first, then one draw per grade in grade order.
      simulated_leaders = np.random.randint(1, business_rules.ceil_total_leaders_leading)
      simulated_people = {
          grade: np.random.randint(business_rules.ceil_total_people_in_grade)
          for grade in business_rules.grades
      }
      chapter_stats = Chapter_Stats("chapter", simulated_leaders, simulated_people)
      print("simulated episode data :")
      chapter_stats.print_info()
    else:
      preset = self.preconditioned_episodes[self.episode_number]
      print("preconditioned_episode :\n", preset)
      chapter_stats = Chapter_Stats(preset["chapter"], preset["total_leaders_leading"], preset["people_gradewise"])

    self.state = State(chapter_stats, business_rules)

In [453]:
class Trainer:
  """Epsilon-greedy tabular Q-learning over the chapter re-structuring environment.

  Attributes:
    q_table: the Q_table being learned.
    environment: the Environment providing actions, rewards and the terminal test.
    epsilon, discount_factor, learning_rate, num_episodes: training parameters.
  """

  def __init__(self):
    self.q_table = Q_table()
    self.environment = Environment()

    #define training parameters
    self.epsilon = 0.9 #the percentage of time when we should take the best action (instead of a random action)
    self.discount_factor = 0.9 #discount factor for future rewards
    self.learning_rate = 0.9 #the rate at which the AI agent should learn
    self.num_episodes = 15 #1000

    # Echo every available action with its hash.
    for action in self.environment.action_list.actions:
      action.print_info()

  #define an epsilon greedy algorithm that will choose which action to take next
  def get_next_action(self, state_hash, list_action_hash, list_actions, epsilon):
    """Pick the Q-table's best action with probability `epsilon`, else a uniformly random one."""
    #if a randomly chosen value between 0 and 1 is less than epsilon,
    #then choose the most promising value from the Q-table for this state.
    if np.random.random() < epsilon:
      best_action = self.q_table.getBestAction(state_hash, list_action_hash, list_actions)
      return best_action
    else: #choose a random action
      random_idx = np.random.choice(len(list_actions))
      random_action = list_actions[random_idx]
      return random_action

  def train(self, should_print):
    """Run self.num_episodes training episodes, updating self.q_table.

    Args:
      should_print: when true, print the state, action, new state and reward
        of every step.
    """
    #run through self.num_episodes training episodes
    for episode_number in range(self.num_episodes):
      #get the starting state for this episode
      episode = Episode(episode_number)
      print("episode_number : ", episode.episode_number)

      #continue taking actions  until we reach a terminal state
      #(i.e., until we reach the final team structure or hit a bad structure)
      while True:

        #choose which action to take
        action_to_perform = self.get_next_action(episode.state.state_hash(), \
                                                 self.environment.action_list.list_action_hash, \
                                                 self.environment.action_list.actions, \
                                                 self.epsilon)

        #perform the chosen action, and transition to the next state
        old_state = episode.state
        new_state = action_to_perform.perform_action(old_state)

        #receive the reward for moving to the new state
        reward = self.environment.reward.reward_function(old_state, action_to_perform)

        if should_print:
          print("episode state : ")
          episode.state.print_info()
          print("\n action_to_perform :")
          action_to_perform.print_info()
          print("\n new_state :")
          new_state.print_info()
          print("\nreward:", reward)

        # NOTE(review): the loop breaks before updateQ, so the terminal
        # rewards (100 / -100) never reach the Q-table - confirm this is intended.
        if self.environment.hasReachedTerminalState(reward):
          episode.state = new_state
          print("breaking... terminal state \n")
          break

        next_possible_actions = self.environment.possible_actions_in_state(new_state)
        self.q_table.updateQ(old_state, action_to_perform, reward, new_state, next_possible_actions, self.discount_factor, self.learning_rate)

        episode.state = new_state

  #Define a function that will get the shortest path between the starting out team structure and the most favorable team structure
  def get_shortest_path(self, old_state):
    """Follow the greedy (always-best) policy from `old_state` until a terminal reward.

    Prints each step's reward as it goes, then every state on the path
    (starting state included). Returns nothing.

    Only the `old_state` argument is used: the method no longer assigns to a
    notebook-level `episode` variable, which raised NameError whenever no such
    global existed and silently modified it when one did.
    """
    shortest_path = [old_state]
    while True:
      action_to_perform = self.get_next_action(old_state.state_hash(), \
                                               self.environment.action_list.list_action_hash, \
                                               self.environment.action_list.actions,
                                               1.) #always the best action

      new_state = action_to_perform.perform_action(old_state)
      shortest_path.append(new_state)

      reward = self.environment.reward.reward_function(old_state, action_to_perform)
      print("reward :", reward)
      if self.environment.hasReachedTerminalState(reward):
        break

      old_state = new_state

    for state in shortest_path:
      state.print_info()

# TESTING

In [454]:
# Build a trainer (its constructor prints every action with its hash) and run
# the training loop with the per-step trace switched off (should_print=False).
trainer = Trainer()

trainer.train(False)

reassign 6845558906778704510
promote_dr -5681847432539490348
hire -8938540315845285804
preconditioned_episode :
 {'chapter': 'Analyst', 'total_leaders_leading': 2, 'people_gradewise': {'3': 0, '4.1': 1, '4.2': 6, '4.3': 6, '4.4': 1, '5.1': 0, '5.2': 2, 'MS': 7}}
10.5
episode_number :  0
breaking... terminal state 

preconditioned_episode :
 {'chapter': 'ios', 'total_leaders_leading': 1, 'people_gradewise': {'3': 0, '4.1': 0, '4.2': 5, '4.3': 6, '4.4': 2, '5.1': 0, '5.2': 0, 'MS': 14}}
26.0
episode_number :  1
breaking... terminal state 

preconditioned_episode :
 {'chapter': 'android', 'total_leaders_leading': 1, 'people_gradewise': {'3': 0, '4.1': 0, '4.2': 4, '4.3': 4, '4.4': 4, '5.1': 0, '5.2': 0, 'MS': 9}}
20.0
episode_number :  2
breaking... terminal state 

preconditioned_episode :
 {'chapter': 'be engg', 'total_leaders_leading': 2.5, 'people_gradewise': {'3': 2.5, '4.1': 3, '4.2': 4, '4.3': 6, '4.4': 5, '5.1': 1, '5.2': 2, 'MS': 27}}
19.2
episode_number :  3
breaking... terminal

In [455]:
# Dump the learned Q-table; keys are "<state_hash> <action_hash>" strings.
trainer.q_table.print_info()



Q_table
{'5285223899660918271 6845558906778704510': 3.6, '5285223899660918271 -5681847432539490348': 0, '5285223899660918271 -8938540315845285804': 0, '-4678231873890659421 6845558906778704510': 0, '-4678231873890659421 -5681847432539490348': 0, '-4678231873890659421 -8938540315845285804': 0, '-7636548818605238522 6845558906778704510': 12.15, '-7636548818605238522 -5681847432539490348': 0, '-7636548818605238522 -8938540315845285804': 0, '-2340752886812058080 6845558906778704510': 4.05, '-2340752886812058080 -5681847432539490348': 0, '-2340752886812058080 -8938540315845285804': 0, '-2746232271926472364 6845558906778704510': 2.025, '-2746232271926472364 -5681847432539490348': 0, '-2746232271926472364 -8938540315845285804': 0, '-621711864479958001 6845558906778704510': 0, '-621711864479958001 -5681847432539490348': 0, '-621711864479958001 -8938540315845285804': 0, '-391274547413878359 6845558906778704510': 9.450000000000001, '-391274547413878359 -5681847432539490348': 0, '-3912745474138

In [456]:
# Start from preconditioned episode 8 ("reassign wont work") and follow the
# greedy policy learned by `trainer` (created in an earlier cell).
episode = Episode(8)
trainer.get_shortest_path(episode.state)

preconditioned_episode :
 {'chapter': 'reassign wont work', 'total_leaders_leading': 2.5, 'people_gradewise': {'3': 2.5, '4.1': 3, '4.2': 0, '4.3': 0, '4.4': 5, '5.1': 1, '5.2': 2, 'MS': 27}}
15.2
reward : 4.857142857142858
reward : 2.6984126984126977
reward : -100


total_leaders_leading 2.5 
drs_per_lead 16 
total_leader_grades_unutilised 3.0 
target_dr_per_lead 5 
dr_per_lead_gt_target_and_leaders_unutilised True


total_leaders_leading 3.5 
drs_per_lead 11.142857142857142 
total_leader_grades_unutilised 2.0 
target_dr_per_lead 5 
dr_per_lead_gt_target_and_leaders_unutilised True


total_leaders_leading 4.5 
drs_per_lead 8.444444444444445 
total_leader_grades_unutilised 1.0 
target_dr_per_lead 5 
dr_per_lead_gt_target_and_leaders_unutilised True


total_leaders_leading 5.5 
drs_per_lead 6.7272727272727275 
total_leader_grades_unutilised 0.0 
target_dr_per_lead 5 
dr_per_lead_gt_target_and_leaders_unutilised False


In [448]:
#Test Chapter_Stats
# "Analyst" chapter with 2 leaders: 23 people in total, so 21 DRs,
# and 13 people in the leader grades ("3" to "4.3").
cs = Chapter_Stats("Analyst",2,{"3":0, "4.1":1, "4.2":6, "4.3":6, "4.4":1, "5.1":0, "5.2":2, "MS":7})
cs.print_info()

Chapter_Stats
Analyst 2 21 {'3': 0, '4.1': 1, '4.2': 6, '4.3': 6, '4.4': 1, '5.1': 0, '5.2': 2, 'MS': 7} 13


In [449]:
#Test State
# Builds a State from `cs` (defined in the Chapter_Stats test cell); the
# constructor itself prints the raw DRs-per-leader ratio.
st=State(cs,Business_Rules())
st.print_info()

# Built-in round() rounds halves to the nearest even integer: both print 2.
print(round(1.5))
print(round(2.5))

# State defines no __repr__, so this prints the default object representation.
print(st)
# One reassignment step applied directly, then the resulting state is shown.
st2 = Reassign.reassigned_leader(st)
st2.print_info()

10.5


total_leaders_leading 2 
drs_per_lead 11 
total_leader_grades_unutilised 11 
target_dr_per_lead 5 
dr_per_lead_gt_target_and_leaders_unutilised True
2
2
<__main__.State object at 0x7f96ae2d2ad0>


total_leaders_leading 3 
drs_per_lead 7.0 
total_leader_grades_unutilised 10 
target_dr_per_lead 5 
dr_per_lead_gt_target_and_leaders_unutilised True


In [450]:
# Two separate State objects forced to identical field values, to compare
# the default object hash with State.state_hash(). Requires `cs` from the
# Chapter_Stats test cell.
st1=State(cs,Business_Rules())
st1.total_leaders_leading = 100
st1.drs_per_lead = 100
st1.total_leader_grades_unutilised = 100


st2=State(cs,Business_Rules())
st2.total_leaders_leading = 100
st2.drs_per_lead = 100
st2.total_leader_grades_unutilised = 100

# 1 and True compare and hash equal, so these two tuples hash the same.
print(hash((100,100,100,4,1)))

print(hash((100,100,100,4,True)))

# A tuple with different field values, for comparison.
print(hash((50,200,100,4,True)))

# Default object hashes: distinct objects print different values.
print(hash(st1))
print(hash(st2))

# state_hash() depends only on the field values, so both print the same number.
print(st1.state_hash())
print(st2.state_hash())


10.5
10.5
-7260956807247127806
-7260956807247127806
-8208932399249847992
8767821500933
8767821500985
-7260955147838557169
-7260955147838557169


In [451]:
# Build the action list from the configured action names and show the Action
# objects next to their hashes (the two lists are in the same order).
al = ActionList(Business_Rules().actions)
print(al.actions)
print(al.list_action_hash)

[<__main__.Action object at 0x7f96ae341950>, <__main__.Action object at 0x7f96b252f150>, <__main__.Action object at 0x7f96b252f810>]
[6845558906778704510, -5681847432539490348, -8938540315845285804]


In [452]:
# Walk the "reassign" action forward from `st` (built in the State test cell),
# printing the reward of each step followed by the state it leads to.
rw = Reward()
act = Action("reassign")

st.print_info()

current_state = st
for step_label in (2, 3, 4):
  value = rw.reward_function(current_state, act)
  print("next", step_label)
  print(value)
  new_state = act.perform_action(current_state)
  new_state.print_info()
  current_state = new_state

# Last step: only the reward is shown; the action is not applied again.
value = rw.reward_function(new_state, act)
print("next", 5)
print(value)



total_leaders_leading 2 
drs_per_lead 11 
total_leader_grades_unutilised 11 
target_dr_per_lead 5 
dr_per_lead_gt_target_and_leaders_unutilised True
next 2
4.0


total_leaders_leading 3 
drs_per_lead 7.0 
total_leader_grades_unutilised 10 
target_dr_per_lead 5 
dr_per_lead_gt_target_and_leaders_unutilised True
next 3
100


total_leaders_leading 4 
drs_per_lead 5.0 
total_leader_grades_unutilised 9 
target_dr_per_lead 5 
dr_per_lead_gt_target_and_leaders_unutilised False
next 4
100


total_leaders_leading 4 
drs_per_lead 5.0 
total_leader_grades_unutilised 9 
target_dr_per_lead 5 
dr_per_lead_gt_target_and_leaders_unutilised False
next 5
100
