# We will study the epsilon-greedy algorithm on the synthetic data

In [14]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import random


# Load the synthetic data

In [15]:
# Read the synthetic ratings table from the CSV in the working directory.
synthetic_data = pd.read_csv(filepath_or_buffer='synthetic_ratings.csv')

In [16]:
# Show the first rows followed by the total row count, one per line.
print(synthetic_data.head(), len(synthetic_data), sep='\n')


   Unnamed: 0  UserID  MovieID  rating  binary_rating
0           0       0        0       5              1
1           1       0        1       1              0
2           2       0        2       4              1
3           3       0        3       5              1
4           4       0        4       1              0
5000000


In [None]:

class EpsilonGreedyRecommender:
    """Epsilon-greedy bandit over movies, simulated from a table of ratings.

    Every distinct movie id is one arm. Pulling an arm draws one rating
    uniformly at random from the ratings recorded for that movie. The
    recommender keeps a running mean rating per movie and, at each step,
    either explores (uniformly random movie, with probability ``epsilon``)
    or exploits (movie with the highest running mean).
    """

    def __init__(self, df, epsilon=1.0, decay=True, *,
                 movie_col='movie_id', rating_col='rating', seed=None):
        """
        df: DataFrame holding one rating per row; only the movie and rating
            columns are read
        epsilon: initial exploration rate
        decay: if True, apply decay schedule epsilon ~ t^(-1/3)
        movie_col: name of the column holding the movie identifier
        rating_col: name of the column holding the rating (the reward)
        seed: optional seed for a private random generator; when None the
            module-level ``random`` generator is used

        Raises KeyError if a required column is missing and ValueError if
        ``df`` has no rows.
        """
        missing = [col for col in (movie_col, rating_col)
                   if col not in df.columns]
        if missing:
            raise KeyError(
                f"DataFrame is missing required column(s): {missing}")
        if len(df) == 0:
            # Fail here rather than later inside step(), where an empty arm
            # list would surface as an opaque IndexError from random.choice.
            raise ValueError("DataFrame must contain at least one rating")

        self.df = df
        self.movie_ids = df[movie_col].unique().tolist()
        self.K = len(self.movie_ids)

        self.epsilon_0 = epsilon
        self.epsilon = epsilon
        self.decay = decay
        self.t = 0

        # A private generator keeps seeded runs reproducible without
        # touching global random state; unseeded runs use the module itself.
        self._rng = random if seed is None else random.Random(seed)

        # Estimated mean rewards (ratings) and pull counts per movie
        self.q_values = {m_id: 0.0 for m_id in self.movie_ids}
        self.attempts = {m_id: 0 for m_id in self.movie_ids}

        ratings_by_movie = df.groupby(movie_col)[rating_col]

        # All recorded ratings per movie, used as the reward distribution
        self.ratings_dict = ratings_by_movie.apply(list).to_dict()

        # For regret tracking: the best achievable mean rating
        self.optimal_mean = ratings_by_movie.mean().max()
        self.regret_list = []

    def select_movie(self):
        """Epsilon-greedy selection.

        Explores with probability ``self.epsilon``; otherwise returns the
        movie with the highest estimated mean (first one wins on ties).
        """
        if self._rng.random() < self.epsilon:
            return self._rng.choice(self.movie_ids)
        return max(self.q_values, key=self.q_values.get)

    def get_reward(self, movie_id):
        """Sample a rating uniformly from the movie's recorded ratings."""
        return self._rng.choice(self.ratings_dict[movie_id])

    def update_scores(self, movie_id, reward):
        """Incremental update of the estimated mean rating for one movie."""
        self.attempts[movie_id] += 1
        n = self.attempts[movie_id]
        old_q = self.q_values[movie_id]
        # Running mean: new = old + (x - old) / n
        self.q_values[movie_id] = old_q + (reward - old_q) / n

    def step(self):
        """Perform one iteration of epsilon-greedy.

        Returns the tuple ``(movie, reward)`` and appends the updated
        cumulative regret to ``self.regret_list``.
        """
        self.t += 1
        if self.decay:
            # t >= 1 here, so the power is always defined.
            self.epsilon = self.epsilon_0 * (self.t ** (-1/3))

        movie = self.select_movie()
        reward = self.get_reward(movie)
        self.update_scores(movie, reward)

        # Regret is measured against the sampled reward, not the chosen
        # movie's mean, so a single step's regret is noisy and may be
        # negative.
        instant_regret = self.optimal_mean - reward
        previous_total = self.regret_list[-1] if self.regret_list else 0
        self.regret_list.append(previous_total + instant_regret)

        return movie, reward

# --- Example usage ---

# Work on a random subset of the loaded ratings to keep the simulation fast.
# The CSV names its columns 'UserID'/'MovieID', while the recommender looks
# up 'movie_id', so rename them before handing the frame over.
sample_size = min(500_000, len(synthetic_data))  # never ask for more rows than exist
df = (
    synthetic_data
    .sample(n=sample_size, random_state=42)
    .rename(columns={'UserID': 'user_id', 'MovieID': 'movie_id'})
)

recommender = EpsilonGreedyRecommender(df, epsilon=1.0, decay=True)

T = 50_000  # number of iterations
for _ in range(T):
    recommender.step()

# Plot cumulative regret
plt.figure(figsize=(8, 5))
plt.plot(recommender.regret_list)
plt.xlabel("Iterations")
plt.ylabel("Cumulative Regret")
plt.title("Epsilon-Greedy: Cumulative Regret over Iterations")
plt.grid(True)
plt.show()
