In [64]:
import numpy as np
import pandas as pd
import math

In [66]:
def get_data(df):
    """Reshape long-format keyword records into a date-by-keyword table.

    Rows of ``df`` sharing the same ``Date`` and ``Keyword`` have their
    ``Metric`` values summed, ``Date`` is then parsed to datetimes, and the
    totals are pivoted so that each keyword becomes a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date``, ``Keyword`` and ``Metric``.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` in ascending order, one column per keyword.
        Date/keyword pairs absent from ``df`` are left as NaN by the pivot.
    """
    return (
        df.groupby(['Date', 'Keyword'])['Metric']
        .sum()
        .reset_index()
        # Parse after aggregating, exactly as the step-by-step version did.
        .assign(Date=lambda totals: pd.to_datetime(totals['Date']))
        .pivot(index='Date', columns='Keyword', values='Metric')
        .sort_values(by='Date')
    )

In [68]:
class UCBAlgorithm:
    """Upper-Confidence-Bound (UCB1-style) replay over historical keyword data.

    Each column of ``data`` is one option (a keyword) and each row is one
    round (a day). On every round the option with the highest upper
    confidence bound is selected, and the value stored in ``data`` at that
    row and column is taken as its reward.

    Parameters
    ----------
    N : int
        Number of leading rows (days) of ``data`` to replay. A shorter window
        reaches a decision sooner but is more likely to pick the wrong keyword.
    data : pandas.DataFrame
        One row per round and one column per keyword.
    normalize : bool, default True
        If True, rewards are binarised before the replay (see
        :meth:`normalize`). This lets a winner emerge faster but raises the
        risk of declaring one too early.

    Attributes
    ----------
    d : int
        Number of options (columns of ``data``).
    keywords_selected : list of int
        Zero-based column position chosen on each round; empty until
        :meth:`run` has been called.
    total_reward
        Sum of the rewards collected by the most recent :meth:`run`.
    """

    def __init__(self, N, data, normalize=True):
        self.N = N
        self.data = data
        self.d = data.shape[1]  # number of options (keywords)
        self.normalize_data = normalize
        # Start empty (not 0) so that get_proportions() before run() reports
        # no selections instead of a spurious "keyword 0 chosen every time".
        self.keywords_selected = []
        self.total_reward = 0

    def normalize(self):
        """Return ``data`` binarised against each row's mean.

        A cell becomes 1 when its value is strictly greater than the mean of
        its row (taken across all keywords) and 0 otherwise. Missing values
        never compare as greater, so they become 0.
        """
        row_mean = self.data.mean(axis=1)  # computed once, not once per column
        return self.data.gt(row_mean, axis=0) * 1

    def _best_arm(self, n, counts, sums):
        """Return the first arm with the highest upper confidence bound on round ``n``."""
        best_arm = 0
        # -inf rather than 0: with negative rewards every bound can be below
        # zero, and a 0 floor would then always fall back to arm 0.
        best_bound = -math.inf
        for i in range(self.d):
            if counts[i] > 0:
                average_reward = sums[i] / counts[i]
                delta_i = math.sqrt(2 * math.log(n + 1) / counts[i])
                bound = average_reward + delta_i
            else:
                # An arm that has never been played is always tried first.
                bound = math.inf
            # Strict comparison keeps the lowest-indexed arm on ties.
            if bound > best_bound:
                best_bound = bound
                best_arm = i
        return best_arm

    def run(self):
        """Replay the first ``N`` rows and record the arm chosen on each one.

        Results are stored on the instance: ``keywords_selected`` holds the
        chosen column position per round and ``total_reward`` the summed
        reward. Nothing is returned.

        Raises
        ------
        IndexError
            If ``N`` is larger than the number of rows in ``data``.

        Notes
        -----
        A NaN reward makes that arm's running sum, and therefore its bound,
        NaN; because NaN never compares as greater, the arm is not selected
        again. Fill missing values beforehand if that is not intended.
        """
        data = self.normalize() if self.normalize_data else self.data
        rewards = data.values  # fetch the array once instead of on every round

        n_rows = rewards.shape[0]
        if self.N > n_rows:
            # Fail before doing any work rather than part-way through the loop.
            raise IndexError(
                f"N={self.N} exceeds the {n_rows} rows available in data"
            )

        numbers_of_selections = [0] * self.d
        sums_of_reward = [0] * self.d
        keywords_selected = []
        total_reward = 0

        for n in range(self.N):
            kw = self._best_arm(n, numbers_of_selections, sums_of_reward)
            keywords_selected.append(kw)
            numbers_of_selections[kw] += 1
            reward = rewards[n, kw]
            sums_of_reward[kw] += reward
            total_reward += reward

        self.keywords_selected = keywords_selected
        self.total_reward = total_reward

    def get_proportions(self):
        """Return the share of rounds on which each keyword was selected.

        Returns
        -------
        pandas.Series
            Indexed by the zero-based column position of the keyword and
            valued by the fraction of rounds it was chosen, largest first.
            Empty if :meth:`run` has not been called yet.
        """
        # Explicit dtype keeps the empty (not-yet-run) case well defined.
        selections = pd.Series(self.keywords_selected, dtype='int64')
        return selections.value_counts(normalize=True)


In [72]:
# Load the raw keyword records. The sample data can be downloaded here:
# https://docs.google.com/spreadsheets/d/1rokrBuH9UD_9xSKmKwJsygBUQ__2brWIl61KqUrOuDM/edit?gid=0#gid=0
df = pd.read_csv('Fiore Sample Data.csv')
# Reshape to one row per date and one column per keyword.
res = get_data(df)

In [73]:
# Set up a replay of the first 7 rows (days) of `res`; `normalize` keeps its default of True.
x = UCBAlgorithm(N=7, data=res)

In [74]:
# Replay the days and record which keyword column was chosen on each one.
x.run()

In [75]:
# Fraction of the replayed days on which each keyword (by column position) was selected.
x.get_proportions()

Unnamed: 0,proportion
1,0.714286
0,0.142857
2,0.142857


Using the first 7 days of the sample dataset, the algorithm tells us to give more preference to keyword1 (index 1 in the table above, selected on about 71% of the days).



In [56]:
# Same 7-day replay, but with normalize=False so the values in `res` are used as rewards directly.
z = UCBAlgorithm(7, res, False)

In [57]:
# Replay the days without normalization and record the keyword column chosen on each one.
z.run()

In [58]:
# Fraction of the replayed days on which each keyword (by column position) was selected.
z.get_proportions()

Unnamed: 0,proportion
1,0.571429
2,0.285714
0,0.142857


Using the first 7 days of the sample dataset without normalization, we again see that we should give more preference to keyword1, but less strongly: it is selected on about 57% of the days instead of 71%. Since we are not normalizing the dataset, our estimate is more conservative. Not normalizing is a better option if you want to wait longer and ensure you're truly selecting the better keyword in the long run.