In [1]:
import os
import numpy as np
import pandas as pd

# Root directory of the extracted MovieLens 25M dataset; it must contain
# ratings.csv and movies.csv. Set the MOVIELENS_25M_PATH environment variable to
# point at another copy. The original author's local path stays as the fallback
# so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    'MOVIELENS_25M_PATH',
    '/Users/kevinkurek/Desktop/GitHub/data-science/workspace/kevinkurek/RL/data/ml-25m',
)

def read_movielens_25m(data_path=None):
    """
    Load the MovieLens ratings and attach each movie's title and genre flags.

    Args:
        data_path: directory containing ratings.csv and movies.csv. Defaults to
            the module-level DATA_PATH when omitted.

    Returns:
        DataFrame with one row per rating (all columns of ratings.csv) plus the
        movie's title and one boolean column per genre. Ratings whose movieId
        is missing from movies.csv keep NaN in the movie columns (left join).
    """
    if data_path is None:
        data_path = DATA_PATH
    # The default C parser reads these plain CSVs far faster than engine='python'.
    ratings = pd.read_csv(os.path.join(data_path, 'ratings.csv'))
    movies = pd.read_csv(os.path.join(data_path, 'movies.csv'))
    # 'genres' is a pipe-separated string; expand it into one boolean column per genre.
    genre_flags = movies['genres'].str.get_dummies().astype(bool)
    movies = movies.drop(columns='genres').join(genre_flags)
    # DataFrame.join(on=...) matches the caller's column against the *index* of
    # the other frame, so movies must be indexed by movieId. Joining against the
    # default RangeIndex would attach the wrong movie's metadata to every rating.
    movies = movies.set_index('movieId')
    return ratings.join(movies, on='movieId', how='left', rsuffix='_movie')

def preprocess_movielens_25m(df, min_number_of_reviews=20000, random_state=None):
    """
    Filter, shuffle and label the ratings so they can be replayed as a bandit log.

    Args:
        df: ratings dataframe with at least 'movieId' and 'rating' columns.
        min_number_of_reviews: movies with fewer ratings than this are dropped.
        random_state: optional seed forwarded to DataFrame.sample so the shuffle
            is reproducible; None keeps the original non-deterministic shuffle.

    Returns:
        A new dataframe (the input is not modified) with a 't' column that is
        also used as the index, and a binary 'liked' column.
    """
    # remove ratings of movies with < N ratings. too few ratings will cause the recsys to get stuck in offline evaluation
    # Filter on the counts Series directly: the name of the value_counts column
    # changed in pandas 2.0, so looking it up by name is not portable.
    review_counts = df['movieId'].value_counts()
    movies_to_keep = review_counts[review_counts >= min_number_of_reviews].index
    df = df.loc[df['movieId'].isin(movies_to_keep)]
    # shuffle rows to debias order of user ids
    df = df.sample(frac=1, random_state=random_state)
    # create a 't' column to represent time steps for the bandit to simulate a live learning scenario
    df['t'] = np.arange(len(df))
    df.index = df['t']
    # rating >= 4.5 stars is a 'like' (1), anything below 4.5 stars is a 'dislike' (0)
    df['liked'] = (df['rating'] >= 4.5).astype('int64')
    return df

def get_ratings_25m(min_number_of_reviews=20000):
    """
    Read the MovieLens 25M data and return it preprocessed for the bandit replay.

    Args:
        min_number_of_reviews: minimum number of ratings a movie needs to be
            kept; forwarded to preprocess_movielens_25m.

    Returns:
        The dataframe produced by preprocess_movielens_25m.
    """
    df = read_movielens_25m()
    # Forward the caller's threshold; it was previously ignored in favour of a hard-coded 20000.
    df = preprocess_movielens_25m(df, min_number_of_reviews=min_number_of_reviews)
    return df

# Build the replay log and keep only what the simulation reads: the original
# event timestamp, the arm (movieId), the user, and the binary reward (liked).
df = get_ratings_25m()[
    ['timestamp', 'movieId', 'userId', 'liked']
]
df.head()

KeyboardInterrupt: 

In [2]:
def replay_score(history, df, t, batch_size, recs):
    """
    Replay evaluation step (https://arxiv.org/pdf/1003.5956.pdf).

    A logged event only counts when the movie the user actually rated is in the
    recommended slate; every other event in the batch is ignored.
    I.E. If MAB recommended 5 movies & historical viewer data showed they watched 3 of the 5 then we'd
    only pull the rewards for the 3 movies they played; [1, 0, 1] = liked, disliked, liked for example.

    Args:
        history: dataframe of events the policy has been allowed to learn from so far.
        df: full logged dataset, with 'movieId' and 'liked' columns.
        t: position of the first row of the current batch in df.
        batch_size: number of consecutive logged rows to replay.
        recs: iterable of recommended movieIds (the slate).

    Returns:
        (history, action_liked): history extended with the matching events, each
        tagged with scoring_round=t, and the ['movieId', 'liked'] columns of
        those matching events. Neither input dataframe is modified.
    """
    # Rows t .. t+batch_size-1 by position: the logged events replayed in this step.
    batch = df.iloc[t:t + batch_size]
    # Core of "Replay": keep only events whose movie is in the recommended slate.
    # assign() builds a new frame, so no column is written onto a slice of df.
    actions = batch.loc[batch['movieId'].isin(recs)].assign(scoring_round=t)
    action_liked = actions[['movieId', 'liked']]
    if actions.empty:
        # Nothing matched the slate: there is nothing to learn from this batch.
        return history, action_liked
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    history = pd.concat([history, actions])  # cumulatively grows as algo steps through time
    return history, action_liked

def ucb1_policy(df, t, ucb_scale=2.0, slate_size=5, bayes=True):
    '''
    Applies an upper-confidence-bound policy to generate movie recommendations
    Args:
        df: dataframe. Observed events with 'movieId' and 'liked' columns.
        t: int. represents the current time step (only used when bayes=False).
        ucb_scale: float. Multiplier on the standard-error term of the Bayesian
            bound (only used when bayes=True). Most implementations use 2.0.
        slate_size: int. number of movies to recommend.
        bayes: bool. True scores with mean + ucb_scale * std / sqrt(count);
            False scores with classic UCB1, mean + sqrt(2 * ln(t) / count).
    Returns:
        numpy array of at most slate_size movieIds, highest bound first.
    '''
    scores = df.groupby('movieId')['liked'].agg(['mean', 'count', 'std'])
    if bayes:
        # Bayes UCB. std is NaN for a movie with a single observation, so such a
        # movie gets a NaN bound and sorts last.
        scores['ucb'] = scores['mean'] + (ucb_scale * scores['std'] / np.sqrt(scores['count']))
    else:
        # Regular UCB1 uses the natural logarithm. Clamp t to 1 so the first
        # step (t == 0) gives a zero bonus instead of sqrt(-inf) = NaN.
        log_t = np.log(max(t, 1))
        scores['ucb'] = scores['mean'] + np.sqrt(2 * log_t / scores['count'])
    scores = scores.sort_values('ucb', ascending=False)
    # The group index holds the movieIds, already ordered by descending bound.
    recs = scores.index[:slate_size].values
    return recs

# simulation params: slate size, batch size (number of events per training iteration)
slate_size = 5 # ACTION: number of recommendations to show at once
batch_size = 100 # STEP: number of values to step through at a time (note that 1 step at a time isn't efficient)
log_every = 100 # print one progress line every `log_every` batches rather than dumping state on every step

# df = get_ratings_25m(min_number_of_reviews=1500)

# initialize empty history for random policy
# (the algorithm should be able to see all events and outcomes prior to the current timestep, but no current or future outcomes)
# history = pd.DataFrame(data=None, columns=df.columns)
# history = history.astype({'movieId': 'int32', 'liked': 'float'})

# initialize history for UCB Policy
# seed every movie with 5 synthetic ratings at t=0: 3 likes and 2 dislikes (60% like rate)
# this avoids stddev errors and prioritizes exploration of new posts in early iterations
history = df.groupby('movieId').first()
history['movieId'] = history.index
history['t'] = 0
history.index = history['t']
history['liked'] = 1
history = history[df.columns] # reorder columns to match logged data
history2 = history.copy()
history2['liked'] = 0
# DataFrame.append was removed in pandas 2.0; a single concat stacks the same
# five copies in the same order (like, like, dislike, dislike, like).
history = pd.concat([history, history, history2, history2, history])
history['scoring_round'] = 0
display(history.head())

# to speed this up, retrain the bandit every batch_size time steps
# this lets us measure batch_size actions against a slate of recommendations rather than generating
#      recs at each time step. this becomes necessary to reach a useful sample size with replay evaluation
ucb_history = pd.DataFrame(data=None, columns = ['mean', 'count', 'std', 'ucb', 'movieId', 'iter']) # for post-analysis of ucbs over iterations
max_time = df.shape[0] # total number of ratings to evaluate using the bandit
print('Running algorithm')

# initialize empty list for storing scores from each step
rewards = []
total_reward = 0 # running sum of `rewards`, kept so progress lines stay cheap

for step in range(max_time//batch_size):
    t = step * batch_size # position in df of the first logged event of this batch
    # POLICY: generate recommendations from a random policy
    # recs = np.random.choice(df.movieId.unique(), size=(slate_size), replace=False)

    # POLICY: Bayesian UCB, fit only on events recorded up to time t
    recs = ucb1_policy(df=history.loc[history.index<=t,], t=t/batch_size, ucb_scale=2.0, slate_size=slate_size, bayes=True)
    # send recommendations and dataset to a scoring function so the model can learn & adjust its policy in the next iteration
    history, action_score = replay_score(history, df, t, batch_size, recs)

    # REWARD: 1 (liked) or 0 (disliked) for each logged event that matched the slate
    batch_rewards = action_score.liked.tolist()
    rewards.extend(batch_rewards)
    total_reward += sum(batch_rewards)

    if step % log_every == 0:
        print(f"t = {t}: recs = {recs}, matched events = {len(batch_rewards)}, "
              f"cumulative reward = {total_reward} over {len(rewards)} scored events")
        

NameError: name 'df' is not defined

In [None]:
def ucb1_policy(df, t, ucb_scale=2.0, bayes=True, slate_size=5):
    '''
    Applies an upper-confidence-bound policy to generate movie recommendations
    Args:
        df: dataframe. Observed events with 'movieId' and 'liked' columns.
        t: int. represents the current time step (only used when bayes=False).
        ucb_scale: float. Multiplier on the standard-error term of the Bayesian
            bound (only used when bayes=True). Most implementations use 2.0.
        bayes: bool. True scores with mean + ucb_scale * std / sqrt(count);
            False scores with classic UCB1, mean + sqrt(2 * ln(t) / count).
        slate_size: int. number of movies to recommend. This replaces the
            lookup of an undefined global `args.n`.
    Returns:
        numpy array of at most slate_size movieIds, highest bound first.
    '''
    scores = df.groupby('movieId')['liked'].agg(['mean', 'count', 'std'])
    if bayes:
        # std is NaN for a movie with a single observation, so such a movie
        # gets a NaN bound and sorts last.
        scores['ucb'] = scores['mean'] + (ucb_scale * scores['std'] / np.sqrt(scores['count']))
    else:
        # UCB1 uses the natural logarithm. Clamp t to 1 so the first step
        # (t == 0) gives a zero bonus instead of sqrt(-inf) = NaN.
        log_t = np.log(max(t, 1))
        scores['ucb'] = scores['mean'] + np.sqrt(2 * log_t / scores['count'])
    scores = scores.sort_values('ucb', ascending=False)
    # The group index holds the movieIds, already ordered by descending bound.
    recs = scores.index[:slate_size].values
    return recs

# Single policy/scoring step, using the names the simulation cell defines:
# - history keeps time in its index (it has no 't' column), so filter on the index;
# - every argument after df= is passed by keyword (a positional argument after a
#   keyword argument is a SyntaxError);
# - the scoring function in this notebook is replay_score, and batch_size is a
#   notebook-level variable (there is no `args` namespace here).
recs = ucb1_policy(df=history.loc[history.index<=t,], t=t, ucb_scale=2.0)
history, action_score = replay_score(history, df, t, batch_size, recs)