# Bellman for TicTacToe

In [3]:
import numpy as np
from typing import List
   
def bellman_v(v:np.ndarray, policy:List, Rsa:List, Psas:np.ndarray, 
        s:int=0, forgetting_factor:float=1.0) -> float:
    Gs = 0
    for a in range(len(policy[s])):
        Gs += policy[s][a] * bellman_q_by_v(v, Rsa, Psas, 
                s=s, a=a, forgetting_factor=forgetting_factor)
    return Gs

def bellman_q_by_v(v:np.ndarray, Rsa:List, Psas:np.ndarray, 
        s:int=0, a:int=0, forgetting_factor:float=1.0) -> float:
    reward = Rsa[s][a]
    v_next = 0
    for next_s in range(len(Psas[s,a])):
        if Psas[s,a,next_s] and next_s < len(v) - 1:
            v_next += Psas[s,a,next_s] * v[next_s]     
    Gs = reward + forgetting_factor * v_next
    return Gs

def bellman_q(q:List, policy:List, Rsa:List, Psas:np.ndarray, 
        s:int=0, a:int=0, forgetting_factor:float=1.0) -> float:
    reward = Rsa[s][a]
    v_next = 0
    for next_s in range(len(Psas[s,a])):
        if Psas[s,a,next_s] and next_s < len(q) - 1:
            v = 0
            for next_a in range(len(policy[next_s])):
                v += policy[next_s][next_a] * q[next_s][next_a]
            v_next += Psas[s,a,next_s] * v     
    Gs = reward + forgetting_factor * v_next
    return Gs

class MDP:
    def __init__(self):
        self.init_policy_Rsa()
        
    def init_policy_Rsa(self):
        # policy[state][action] = 0.9 (<-- prob)
        # None state is the last state        
        self.policy_actions_table = [['Facebook', 'Quit'], ['Facebook', 'Study'], 
            ['Sleep', 'Study'], ['Pub', 'Study'], None]
        self.Rsa = [[-1,0], [-1,-2], [0, -2], [1,10]]
        self.N_states = len(self.policy_actions_table)
        self.policy = []
        for actions in self.policy_actions_table:
            if actions:
                N_actions = len(actions)
                self.policy.append(np.ones(N_actions)/N_actions)
                
        # policy가 고려되지 않은 관계임. Policy에 따른 가중에 별도로 고려되어야 함.
        self.Psas = np.zeros([self.N_states, 2, self.N_states]) # Probability
        self.Psas[0,0,0], self.Psas[0,1,1] = 1.0, 1.0
        self.Psas[1,0,0], self.Psas[1,1,2] = 1.0, 1.0
        self.Psas[2,0,4], self.Psas[2,1,3] = 1.0, 1.0
        self.Psas[3,0,1], self.Psas[3,0,2], \
            self.Psas[3,0,3], self.Psas[3,1,4] = 0.2, 0.4, 0.4, 1.0 
        
    def init_v(self):
        self.v = np.zeros(self.N_states)
        
    def init_q(self):
        self.q = []
        for s in range(self.N_states - 1):
            self.q.append(np.zeros(len(self.policy[s])))
        self.q.append(0)
        
    def calc_bellman_v(self, s:int) -> float:
        return bellman_v(self.v, self.policy, self.Rsa, self.Psas, s=s)    
    
    def calc_bellman_q(self, s:int, a:int) -> float:
        #return 0
        return bellman_q(self.q, self.policy, self.Rsa, self.Psas, s=s, a=a)    
    
    def get_v(self, N_iter:int=10) -> np.ndarray:
        self.init_v()
        for n in range(N_iter):
            for s in range(self.N_states-1):
                self.v[s] = (self.v[s] * n + self.calc_bellman_v(s))/(n+1)        
        
        for s in range(self.N_states):
            print(f'v[{s}]={self.v[s]}')
        return self.v
    
    def get_q(self, N_iter:int=10) -> List:
        self.init_q()
        
        for n in range(N_iter):
            for s in range(self.N_states-1):
                for a in range(len(self.policy[s])):
                    #print(f'[?]s,a={s,a} --> {self.q[s][a]}')
                    self.q[s][a] = (self.q[s][a] * n + 
                        self.calc_bellman_q(s,a))/(n+1)  
                    #self.q[s][a] = (self.q[s][a] * n)/(n+1) 
        
        for s in range(self.N_states-1):
            for a in range(len(self.policy[s])):
                print(f'q[{s}][{a}]={self.q[s][a]}')
        return self.q      
    
    def test(self, N_Iter:int=100):
        print(f'Policy: {self.policy}')
        self.get_v(N_Iter)
        self.get_q(N_Iter)
             
MDP().test()

Policy: [array([0.5, 0.5]), array([0.5, 0.5]), array([0.5, 0.5]), array([0.5, 0.5])]
v[0]=-2.5057415905523253
v[1]=-1.6060272426565467
v[2]=2.4316473139847745
v[3]=7.135350237940245
v[4]=0.0
q[0][0]=-3.3266059231766705
q[0][1]=-1.6848772579279794
q[1][0]=-3.3516633390821924
q[1][1]=0.13960885376910126
q[2][0]=0.0
q[2][1]=4.863294627969549
q[3][0]=4.270700475880492
q[3][1]=10.0


In [4]:
import pandas as pd

In [5]:
S_df = pd.DataFrame({'S':[0,1,2,3,4]})
policy_df = pd.DataFrame({'s':[0,0,0,1,2,3], 'a':[0,1,2,0,0,0], 'pi':[1/3, 1/3, 1/3, 1, 1, 1]})
R_df = pd.DataFrame({'s':[0,0,0,1,2,3], 'a':[0,1,2,0,0,0], 'R':[1,0,-1/2,0,0,0]})
P_df = pd.DataFrame({'s':[0,0,0,0,0,1,2,3], 'a':[0,1,1,2,2,0,0,0], 'next_s':[4,1,2,3,4,4,4,4], 'P':[1,1/2,1/2,1/2,1/2,0,0,0]})

In [6]:
policy_df

Unnamed: 0,s,a,pi
0,0,0,0.333333
1,0,1,0.333333
2,0,2,0.333333
3,1,0,1.0
4,2,0,1.0
5,3,0,1.0


In [7]:
R_df

Unnamed: 0,s,a,R
0,0,0,1.0
1,0,1,0.0
2,0,2,-0.5
3,1,0,0.0
4,2,0,0.0
5,3,0,0.0


In [8]:
P_df

Unnamed: 0,s,a,next_s,P
0,0,0,4,1.0
1,0,1,1,0.5
2,0,1,2,0.5
3,0,2,3,0.5
4,0,2,4,0.5
5,1,0,4,0.0
6,2,0,4,0.0
7,3,0,4,0.0


In [9]:
S_df

Unnamed: 0,S
0,0
1,1
2,2
3,3
4,4


In [10]:
for a in set(P_df[P_df.s == 0].a):
    print(a)

0
1
2


## Pandas based Bellman

In [105]:
"""
두 클라스를 공통인 MDP에서 상속하게 만듬. 
L은 list를 이용하는 방식이고 PD는 pandas를 이용하는 방식임.
두 방식에 충돌이 나는 경우, 기반이 되는 MDP에는 공통 요소만 남기고 나머지는 MDP_L에 옮겨둘 예정임.
"""
import numpy as np
from typing import List
import pandas as pd

class MDP:
    def __init__(self):
        self.init_policy_Rsa_Psas()
        
    def init_policy_Rsa_Psas(self):
        # policy[state][action] = 0.9 (<-- prob)
        # None state is the last state        
        self.policy_actions_table = [['Facebook', 'Quit'], ['Facebook', 'Study'], 
            ['Sleep', 'Study'], ['Pub', 'Study'], None]
        self.Rsa = [[-1,0], [-1,-2], [0, -2], [1,10]]
        self.N_states = len(self.policy_actions_table)
        self.N_actions_in_s = []
        self.policy = []
        for actions in self.policy_actions_table:
            if actions:
                N_actions = len(actions)
                self.N_actions_in_s.append(N_actions)
                self.policy.append(np.ones(N_actions)/N_actions)
            else:
                self.N_actions_in_s.append(0)
                
        # policy가 고려되지 않은 관계임. Policy에 따른 가중에 별도로 고려되어야 함.
        self.Psas = np.zeros([self.N_states, 2, self.N_states]) # Probability
        self.Psas[0,0,0], self.Psas[0,1,1] = 1.0, 1.0
        self.Psas[1,0,0], self.Psas[1,1,2] = 1.0, 1.0
        self.Psas[2,0,4], self.Psas[2,1,3] = 1.0, 1.0
        self.Psas[3,0,1], self.Psas[3,0,2], \
            self.Psas[3,0,3], self.Psas[3,1,4] = 0.2, 0.4, 0.4, 1.0 
        
    def init_v(self):
        self.v = np.zeros(self.N_states)
        
    def init_q(self):
        self.q = []
        for s in range(self.N_states-1):
            self.q.append(np.zeros(self.N_actions_in_s[s]))
        self.q.append(0)
        print('q=', self.q)
    
    def get_v(self, N_iter:int=10) -> np.ndarray:
        self.init_v()
        for n in range(N_iter):
            for s in range(self.N_states-1):
                self.v[s] = (self.v[s] * n + self.bellman_v(s))/(n+1)        
        
        for s in range(self.N_states):
            print(f'v[{s}]={self.v[s]}')
        return self.v
    
    def get_q(self, N_iter:int=10) -> List:
        self.init_q()
        
        for n in range(N_iter):
            for s in range(self.N_states-1):
                for a in range(self.N_actions_in_s[s]): 
                    #print(f'[?]s,a={s,a} --> {self.q[s][a]}')
                    self.q[s][a] = (self.q[s][a] * n + 
                        self.bellman_q(s,a))/(n+1)  
                    #self.q[s][a] = (self.q[s][a] * n)/(n+1) 
        
        for s in range(self.N_states-1):
            for a in range(self.N_actions_in_s[s]):
                print(f'q[{s}][{a}]={self.q[s][a]}')
        return self.q      
    
    def bellman_v(self, s:int=0, forgetting_factor:float=1.0) -> float:
        Gs = 0
        for a in range(len(self.policy[s])):
            Gs += self.policy[s][a] * self.bellman_q_by_v(
                s=s, a=a, forgetting_factor=forgetting_factor)
        return Gs
    
    def bellman_q_by_v(self, s:int=0, a:int=0, forgetting_factor:float=1.0) -> float:
        reward = self.Rsa[s][a]
        v_next = 0
        for next_s in range(len(self.Psas[s,a])):
            if self.Psas[s,a,next_s] and next_s < len(self.v) - 1:
                v_next += self.Psas[s,a,next_s] * self.v[next_s]     
        Gs = reward + forgetting_factor * v_next
        return Gs

    def bellman_q(self, s:int=0, a:int=0, forgetting_factor:float=1.0) -> float:
        reward = self.Rsa[s][a]
        v_next = 0
        for next_s in range(len(self.Psas[s,a])):
            if self.Psas[s,a,next_s] and next_s < len(self.q) - 1:
                v = 0
                for next_a in range(len(self.policy[next_s])):
                    v += self.policy[next_s][next_a] * self.q[next_s][next_a]
                v_next += self.Psas[s,a,next_s] * v     
        Gs = reward + forgetting_factor * v_next
        return Gs
    
    def test(self, N_Iter:int=100):
        print(f'Policy: {self.policy}')
        self.get_v(N_Iter)
        self.get_q(N_Iter)

class MDP_L(MDP):
    def __init__(self):
        super().__init__()

class MDP_PD(MDP):
    def __init__(self):
        super().__init__()
        
    def init_policy_Rsa_Psas(self):
        S_df = pd.DataFrame({'S':[0,1,2,3,4]})
        self.policy = pd.DataFrame({'s':[0,0,0,1,2,3], 'a':[0,1,2,0,0,0], 'pi':[1/3, 1/3, 1/3, 1, 1, 1]})
        self.Rsa = pd.DataFrame({'s':[0,0,0,1,2,3], 'a':[0,1,2,0,0,0], 'R':[1,0,-1/2,0,0,0]})
        self.Psas = pd.DataFrame({'s':[0,0,0,0,0,1,2,3], 'a':[0,1,1,2,2,0,0,0], 'next_s':[4,1,2,3,4,4,4,4], 'P':[1,1/2,1/2,1/2,1/2,0,0,0]})
        self.N_states = len(S_df)
        self.N_actions_in_s = []
        for s in range(self.N_states):
            self.N_actions_in_s.append(len(self.policy.a[self.policy.s==s]))
        
    def bellman_v(self, s:int=0, forgetting_factor:float=1.0) -> float:
        Gs = 0
        policy_s = self.policy[self.policy.s == s]
        for a in set(policy_s.a):
            policy = policy_s.pi[policy_s.a == a].to_numpy()[0]
            Gs += policy * self.bellman_q_by_v(
                s=s, a=a, forgetting_factor=forgetting_factor)
        # print('Gs =', Gs)
        # print('policy =', policy)
        return Gs
    
    def bellman_q_by_v(self, s:int=0, a:int=0, forgetting_factor:float=1.0) -> float:
        Rsa = self.Rsa
        Psas = self.Psas
        reward = Rsa.R[(Rsa['s']==0) & (Rsa['a']==0)][0]
        v_next = 0
        Psas_sa = Psas[(Psas.s==s) & (Psas.a==a)]
        for next_s in set(Psas_sa.next_s):
            if len(Psas_sa.P[Psas_sa.next_s==next_s]) and next_s < self.N_states - 1:
                # print(Psas_sa.P[Psas_sa.next_s==next_s],Psas_sa.P[Psas_sa.next_s==next_s].to_numpy()[0])
                v_next += Psas_sa.P[Psas_sa.next_s==next_s].to_numpy()[0] * self.v[next_s]     
        Gs = reward + forgetting_factor * v_next
        return Gs

    def bellman_q(self, s:int=0, a:int=0, forgetting_factor:float=1.0) -> float:
        Rsa = self.Rsa
        Psas = self.Psas
        reward = Rsa.R[(Rsa['s']==0) & (Rsa['a']==0)][0]
        v_next = 0
        Psas_sa = Psas[(Psas.s==s) & (Psas.a==a)]
        for next_s in set(Psas_sa.next_s):
            if len(Psas_sa.P[Psas_sa.next_s==next_s]) and next_s < self.N_states - 1:
                v = 0
                policy_s = self.policy[self.policy.s == next_s]
                for next_a in set(policy_s.a):
                    policy = policy_s.pi[policy_s.a == next_a].to_numpy()[0]
                    v += policy * self.q[next_s][next_a]
                v_next += Psas_sa.P[Psas_sa.next_s==next_s].to_numpy()[0] * v     
        Gs = reward + forgetting_factor * v_next
        return Gs
    
    """
    동일한 루틴이 사용 가능한 경우임. 
    def init_v(self):
        self.v = np.zeros(self.N_states)
        
    def get_v(self, N_iter:int=10) -> np.ndarray:
        self.init_v()
        for n in range(N_iter):
            for s in range(self.N_states-1):
                self.v[s] = (self.v[s] * n + self.calc_bellman_v(s))/(n+1)        
        
        for s in range(self.N_states):
            print(f'v[{s}]={self.v[s]}')
        return self.v
        
    def calc_bellman_v(self, s:int) -> float:
        return bellman_v(self.v, self.policy, self.Rsa, self.Psas, s=s)    
    """

MDP_L().test()

MDP_PD().test()

Policy: [array([0.5, 0.5]), array([0.5, 0.5]), array([0.5, 0.5]), array([0.5, 0.5])]
v[0]=-2.5057415905523253
v[1]=-1.6060272426565467
v[2]=2.4316473139847745
v[3]=7.135350237940245
v[4]=0.0
q= [array([0., 0.]), array([0., 0.]), array([0., 0.]), array([0., 0.]), 0]
q[0][0]=-3.3266059231766705
q[0][1]=-1.6848772579279794
q[1][0]=-3.3516633390821924
q[1][1]=0.13960885376910126
q[2][0]=0.0
q[2][1]=4.863294627969549
q[3][0]=4.270700475880492
q[3][1]=10.0
Policy:    s  a        pi
0  0  0  0.333333
1  0  1  0.333333
2  0  2  0.333333
3  1  0  1.000000
4  2  0  1.000000
5  3  0  1.000000
v[0]=1.495
v[1]=1.0
v[2]=1.0
v[3]=1.0
v[4]=0.0
q= [array([0., 0., 0.]), array([0.]), array([0.]), array([0.]), 0]
q[0][0]=1.0
q[0][1]=1.99
q[0][2]=1.495
q[1][0]=1.0
q[2][0]=1.0
q[3][0]=1.0


In [12]:
S_df = pd.DataFrame({'S':[0,1,2,3,4]})
policy_df = pd.DataFrame({'s':[0,0,0,1,2,3], 'a':[0,1,2,0,0,0], 'pi':[1/3, 1/3, 1/3, 1, 1, 1]})
R_df = pd.DataFrame({'s':[0,0,0,1,2,3], 'a':[0,1,2,0,0,0], 'R':[1,0,-1/2,0,0,0]})
P_df = pd.DataFrame({'s':[0,0,0,0,0,1,2,3], 'a':[0,1,1,2,2,0,0,0], 'next_s':[4,1,2,3,4,4,4,4], 'P':[1,1/2,1/2,1/2,1/2,0,0,0]})

In [13]:
len(S_df)

5

In [30]:
(R_df[R_df.s == 0])[R_df.a == 0]

Unnamed: 0,s,a,R
0,0,0,1.0
1,0,1,0.0
2,0,2,-0.5


In [25]:
R_df.s == 0

0     True
1     True
2     True
3    False
4    False
5    False
Name: s, dtype: bool

In [26]:
R_df.a == 0

0     True
1    False
2    False
3     True
4     True
5     True
Name: a, dtype: bool

In [32]:
R_df.loc[0]

s    0.0
a    0.0
R    1.0
Name: 0, dtype: float64

In [35]:
R_df['s']

0    0
1    0
2    0
3    1
4    2
5    3
Name: s, dtype: int64

In [36]:
stock_df = pd.DataFrame({'symbol': ['AAPL', 'AAPL', 'F', 'F', 'F'], 
                         'date': ['2016-1-1', '2016-1-2', '2016-1-1', '2016-1-2', '2016-1-3'], 
                         'price': [100., 101, 50, 47.5, 49]}).set_index(['symbol', 'date'])

In [37]:
stock_df

Unnamed: 0_level_0,Unnamed: 1_level_0,price
symbol,date,Unnamed: 2_level_1
AAPL,2016-1-1,100.0
AAPL,2016-1-2,101.0
F,2016-1-1,50.0
F,2016-1-2,47.5
F,2016-1-3,49.0


In [39]:
stock_df.loc['AAPL','2016-1-1']

price    100.0
Name: (AAPL, 2016-1-1), dtype: float64

In [40]:
R_df

Unnamed: 0,s,a,R
0,0,0,1.0
1,0,1,0.0
2,0,2,-0.5
3,1,0,0.0
4,2,0,0.0
5,3,0,0.0


In [62]:
R_df.R[(R_df['s']==0) & (R_df['a']==0)].to_numpy()[0]

1.0

In [58]:
x = R_df.R[(R_df['s']==0) & (R_df['a']==0)]
x

0    1.0
Name: R, dtype: float64

In [61]:
x.to_numpy()

array([1.])

In [64]:
R_df.R[(R_df.s==0) & (R_df.a==0)].to_numpy()[0]

1.0

In [83]:
S_df = pd.DataFrame({'S':[0,1,2,3,4]})
policy = pd.DataFrame({'s':[0,0,0,1,2,3], 'a':[0,1,2,0,0,0], 'pi':[1/3, 1/3, 1/3, 1, 1, 1]})
Rsa = pd.DataFrame({'s':[0,0,0,1,2,3], 'a':[0,1,2,0,0,0], 'R':[1,0,-1/2,0,0,0]})
Psas = pd.DataFrame({'s':[0,0,0,0,0,1,2,3], 'a':[0,1,1,2,2,0,0,0], 'next_s':[4,1,2,3,4,4,4,4], 'P':[1,1/2,1/2,1/2,1/2,0,0,0]})

s, a, next_s = 0, 0, 4
Psas_sa = Psas[(Psas.s==s) & (Psas.a==a)]
if len(Psas_sa.P[Psas_sa.next_s==next_s]):
    print('Hi')

Hi


In [81]:
len(Psas_sa.P[Psas_sa.next_s==next_s])

0