<a href="https://colab.research.google.com/github/iirusia/Reinforcement-learning/blob/main/finalproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from google.colab import drive

class PricingEnv:
    """Toy single-step pricing environment.

    The state is the list ``[min_wage, weather, population]`` and an action
    is a price taken from ``action_space`` (every integer from ``min_price``
    to ``max_price`` inclusive). Every call to ``step`` ends the episode.
    """

    def __init__(self):
        self.min_price, self.max_price = 400, 1000
        self.nearby_price = 1000
        # One candidate price per integer in [min_price, max_price].
        self.action_space = np.arange(self.min_price, self.max_price + 1)
        self.state = self.reset()

    def reset(self):
        """Draw a fresh random state, store it on the instance and return it."""
        wage = np.random.uniform(8, 15)
        weather = np.random.choice([0, 1])
        population = np.random.randint(100, 10000)
        self.state = [wage, weather, population]
        return self.state

    def step(self, action):
        """Apply the price ``action`` and return ``(state, reward, done, info)``.

        The reward is noisy sales minus a wage-based cost, with an extra
        penalty when the price falls below half of ``nearby_price``.
        ``done`` is always ``True`` and ``info`` is always an empty dict.
        """
        min_wage, weather, population = self.state
        price_floor = self.nearby_price * 0.5

        # Cloudy weather (weather == 1) reduces the paying population by 40%.
        effective_population = population * 0.6 if weather == 1 else population

        demand_noise = np.random.uniform(0.5, 1.5)
        reward = action * effective_population * demand_noise - min_wage * population * 0.1

        # Undercutting half of the nearby price is penalised in proportion
        # to the shortfall and the (unadjusted) population.
        if action < price_floor:
            reward -= (price_floor - action) * population

        # Drift the wage and population for the next observation; weather
        # is carried over unchanged.
        next_wage = min_wage + np.random.uniform(-0.5, 0.5)
        # The population never drops below 100.
        next_population = max(100, population + np.random.randint(-1000, 1000))

        self.state = [next_wage, weather, next_population]
        return self.state, reward, True, {}

def build_model(state_shape, action_shape):
    """Build the policy network.

    The network takes inputs of shape ``state_shape``, passes them through
    two 48-unit ReLU layers and ends in a softmax over ``action_shape``
    outputs. The model is returned uncompiled.
    """
    network = tf.keras.Sequential()
    network.add(layers.InputLayer(input_shape=state_shape))
    # Two identical hidden layers.
    for _ in range(2):
        network.add(layers.Dense(48, activation='relu'))
    # Softmax head: one probability per action.
    network.add(layers.Dense(action_shape, activation='softmax'))
    return network

class PolicyGradientAgent:
    """Policy-gradient agent backed by the softmax network from ``build_model``."""

    def __init__(self, state_shape, action_shape, learning_rate=0.0005):
        self.state_shape = state_shape
        self.action_shape = action_shape
        # Action indices 0 .. action_shape - 1, sampled from in choose_action.
        self.action_space = np.arange(self.action_shape)
        self.model = build_model(state_shape, action_shape)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate)

    def choose_action(self, state):
        """Sample an action index from the policy's distribution for ``state``.

        Returns the index as a plain Python scalar.
        """
        batch = np.reshape(state, [1, self.state_shape[0]])
        probs = self.model(batch).numpy().flatten()
        # Replace NaN outputs with a tiny value, then renormalise so the
        # vector is a valid distribution for np.random.choice.
        probs = np.nan_to_num(probs, nan=1e-10)
        probs = probs / probs.sum()
        return np.random.choice(self.action_space, p=probs).item()

    def train(self, states, actions, rewards):
        """Run one gradient step on the reward-weighted log-probability loss.

        ``states``, ``actions`` and ``rewards`` are parallel sequences for
        the steps being trained on.
        """
        with tf.GradientTape() as tape:
            probs = self.model(np.array(states))
            # One-hot mask that selects the probability of each taken action.
            taken = tf.one_hot(np.array(actions), self.action_shape)
            # The 1e-10 offset keeps log() finite for zero probabilities.
            log_probs = tf.reduce_sum(taken * tf.math.log(probs + 1e-10), axis=1)
            loss = -tf.reduce_mean(log_probs * rewards)
        variables = self.model.trainable_variables
        grads = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(grads, variables))

# Shared environment instance; predict_price also uses it to map action
# indices to prices.
env = PricingEnv()

# A state has three components; there is one discrete action per entry in
# the environment's action space.
state_shape = (3,)
action_shape = len(env.action_space)

# Experiment settings.
num_models, episodes_per_model = 10, 100
gamma = 0.99  # discount factor applied when accumulating returns

# Results collected once per model.
performance_metrics, price_predictions = [], []

def predict_price(agent, min_wage, weather, population):
    """Return the price the agent picks for the given conditions.

    The state ``[min_wage, weather, population]`` is passed to
    ``agent.choose_action`` and the resulting index is looked up in the
    module-level ``env.action_space``.
    """
    chosen_index = agent.choose_action([min_wage, weather, population])
    return env.action_space[chosen_index]

# Train num_models agents for episodes_per_model episodes each, recording
# the average episode reward and one sampled price per agent, then print a
# summary of both.
for model_index in range(num_models):
    # NOTE(review): a brand-new agent is created on every outer iteration,
    # so each "model" is trained for episodes_per_model episodes only, while
    # the messages below count episodes cumulatively -- confirm the intent.
    agent = PolicyGradientAgent(state_shape, action_shape)
    total_rewards = []
    # Every discounted return this agent has produced so far; used as the
    # normalisation baseline below.
    return_history = []

    for episode in range(episodes_per_model):
        state = env.reset()
        states = []
        actions = []
        rewards = []
        done = False

        # Roll out one episode, recording the trajectory.
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(env.action_space[action])
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            state = next_state

        # Discounted returns, accumulated back-to-front and reversed once
        # (avoids the quadratic cost of repeated insert(0, ...)).
        discounted_rewards = []
        cumulative_reward = 0
        for reward in reversed(rewards):
            cumulative_reward = reward + gamma * cumulative_reward
            discounted_rewards.append(cumulative_reward)
        discounted_rewards.reverse()
        discounted_rewards = np.array(discounted_rewards)

        # Standardise against all returns seen so far by this agent rather
        # than against the current episode alone. PricingEnv.step always
        # reports done=True, so an episode holds a single return; its
        # per-episode mean equals itself and the standardised value would
        # always be 0, which makes every gradient update a no-op.
        return_history.extend(discounted_rewards)
        discounted_rewards = (discounted_rewards - np.mean(return_history)) / (np.std(return_history) + 1e-10)

        agent.train(states, actions, discounted_rewards)
        total_rewards.append(np.sum(rewards))

    avg_reward = np.mean(total_rewards)
    performance_metrics.append(avg_reward)
    print(f"Model {model_index + 1}: Reward = {np.sum(total_rewards)}, Average Reward = {avg_reward}")

    # Price prediction.
    # NOTE(review): these inputs lie far outside the ranges PricingEnv.reset
    # samples (min_wage in [8, 15], population in [100, 10000)), so the
    # network is queried on states unlike any it was trained on -- confirm
    # the intended units.
    min_wage = 12000
    weather = 0
    population = 500000
    predicted_price = predict_price(agent, min_wage, weather, population)
    price_predictions.append(predicted_price)
    print(f"Predicted price for products after {episodes_per_model * (model_index + 1)} episodes: {predicted_price}원")

# Print the evaluation metrics.
print(f"Performance metrics per {episodes_per_model} episodes:")
for i, metric in enumerate(performance_metrics):
    print(f"{episodes_per_model} episodes from {i*episodes_per_model + 1} to {(i+1)*episodes_per_model}: Average Reward = {metric}")

# Print the predicted prices.
print(f"Predicted prices per {episodes_per_model} episodes:")
for i, price in enumerate(price_predictions):
    print(f"After {i*episodes_per_model + episodes_per_model} episodes: Predicted price = {price}원")


Model 1: Reward = 342419523.32117474, Average Reward = 3424195.2332117474
Predicted price for products after 100 episodes: 891원
Model 2: Reward = 301014868.4667084, Average Reward = 3010148.6846670844
Predicted price for products after 200 episodes: 759원
Model 3: Reward = 243066889.54307655, Average Reward = 2430668.8954307656
Predicted price for products after 300 episodes: 589원
Model 4: Reward = 191661650.07839215, Average Reward = 1916616.5007839215
Predicted price for products after 400 episodes: 515원
Model 5: Reward = 285389086.21666557, Average Reward = 2853890.8621666557
Predicted price for products after 500 episodes: 461원
Model 6: Reward = 250852140.15206975, Average Reward = 2508521.4015206974
Predicted price for products after 600 episodes: 566원
Model 7: Reward = 357649197.36136913, Average Reward = 3576491.9736136915
Predicted price for products after 700 episodes: 923원
Model 8: Reward = 298371321.1289268, Average Reward = 2983713.211289268
Predicted price for products afte