In [97]:
import pandas as pd
import seaborn as sns
from datetime import datetime
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
from sklearn.model_selection import train_test_split
seed = 17
np.random.seed(seed)

# read data
data = pd.read_csv('data.csv')

# rename and remove
data = data.rename(index=str, columns={"activity id": "activity_id"}).drop(['activity_name'], axis=1)

# convert to datetime
data['date_booking'] = data['date_booking'].apply(lambda x: datetime.strptime(x,'%d/%m/%y'))

data_old = data

In [98]:
# clean data
data = data_old
keep_act = sorted(data.activity_id.value_counts().index[:50].tolist())
data['keep'] = data['activity_id'].apply(lambda x: x in keep_act)

data = data[data['keep']==True]

# should split data by country to improve efficiency of model tranining.
# a seperate cross-country net can be trained

In [99]:
data.describe()

Unnamed: 0,order_id,guest_id,activity_id
count,23687.0,23687.0,23687.0
mean,98649.553764,121904.46916,5356.660995
std,18023.692658,23234.439161,3353.577164
min,42650.0,29.0,405.0
25%,85118.5,106392.0,1625.0
50%,100061.0,125118.0,4654.0
75%,113932.5,141031.5,8667.0
max,127257.0,155513.0,10369.0


## Create dataset

In [100]:
# create a dictionary of guests with values equal to their ordered ticket purchases
test_guests = data.groupby(['guest_id'])['order_id'].count()
test_guests = test_guests[test_guests > 1].index.tolist()
# build activity lists
guest_activities = dict(zip(test_guests, [[] for i in range(0, len(test_guests))]))
data_sorted = data.sort_values(['date_booking'], axis=0)
data_sorted = data_sorted[['guest_id', 'activity_id']]
for index, row in data_sorted.iterrows():
    if row['guest_id'] in test_guests:
        guest_activities[row['guest_id']] += [row['activity_id']]


In [101]:
def list_to_cols(l, cols):
    return [l.count(i) for i in cols]

# build list of test cases which contain a tuple with 1-many activities as list input and one activity as output
test_cases = []
for key in guest_activities.keys():
    for i in range(0, len(guest_activities[key])-1):
        test_cases += [(guest_activities[key][:i+1], guest_activities[key][i+1])]

cols = sorted(list(data.activity_id.unique()))
new = pd.DataFrame(columns=cols)
new_out = pd.DataFrame(columns=cols)

num = len(test_cases)

for i in range(0, num): #len(test_cases)):
    # convert list of inputs to columns
    t = list_to_cols(test_cases[i][0], cols)
    new = new.append(pd.DataFrame([t],columns=cols))
    # for predictions, just set all to 0 then update specific cell to 1
    t = [0 for i in range(len(cols))]
    new_out = new_out.append(pd.DataFrame([t],columns=cols))
    new_out.iloc[i][test_cases[i][1]] = 1
    if (i % 200 == 0):
        print(i/num)

print(test_cases[:20])

X = new.values
Y = new_out.values

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
[([7245], 4654), ([7245, 4654], 7092), ([4654], 9392), ([1205], 10277), ([1625], 1625), ([2703], 2703), ([2703, 2703], 2703), ([10277], 10277), ([1205], 10277), ([1757], 10277), ([2703], 2703), ([2703, 2703], 2703), ([1119], 7496), ([6977], 9392), ([6977, 9392], 9461), ([634], 4250), ([6588], 1205), ([7014], 7092), ([2635], 2635), ([10277], 1757)]


In [102]:
# split data into training and testing 67/33
(X_train, X_test, Y_train, Y_test) = train_test_split(X, Y, test_size=0.33, random_state=seed)

X_train.shape

(1688, 50)

## Define Model

In [103]:
model = Sequential()
model.add(Dense(64, input_dim=50, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(50, activation='softmax')) # sigmoid

In [104]:
from keras.metrics import top_k_categorical_accuracy

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=[top_k_categorical_accuracy])

model.fit(X_train, Y_train, epochs=150, batch_size=10)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150


Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150


Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<keras.callbacks.History at 0x241acf97f60>

In [105]:
# evaluate the model
scores = model.evaluate(X_test, Y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
# calculate predictions
predictions = model.predict(X_test)
# round predictions
predictions
rounded = [round(x[0]) for x in predictions]
print(predictions[0])
for i in range(10):
    print(keep_act[X_test[i].tolist().index(1)])
    print(keep_act[Y_test[i].tolist().index(1)])
    print([keep_act[i[1]] for i in sorted(zip(predictions[i], range(len(predictions[i]))), reverse=True)[:5]])

 32/832 [>.............................] - ETA: 3s
top_k_categorical_accuracy: 69.95%
[  1.02499534e-05   7.35742958e-07   1.24410326e-06   4.36422852e-05
   3.39740480e-04   6.71843372e-05   5.60596697e-02   2.65394688e-01
   1.43591515e-05   1.09788800e-06   1.43961927e-06   3.98209295e-07
   1.69897430e-05   1.22278436e-08   1.59725005e-06   2.15300433e-02
   1.47665082e-03   5.66214435e-02   7.72743590e-07   1.77784439e-03
   1.53476522e-05   7.17384364e-06   1.41341744e-07   1.56825911e-06
   6.48491323e-07   4.05740830e-05   1.51013537e-05   9.54930410e-02
   1.58540365e-06   7.33249326e-05   3.40921861e-06   1.14859522e-05
   1.09005969e-05   3.11444805e-04   8.57950151e-02   1.59801051e-01
   1.67885914e-01   2.65924409e-05   8.31785798e-03   1.07136900e-02
   3.30807234e-05   2.47860182e-04   1.99703407e-03   5.09628677e-04
   2.76107487e-04   1.47449697e-04   4.42055836e-02   3.85957719e-05
   4.09657322e-03   1.65623985e-02]
4654
7092
[1625, 7092, 7051, 4654, 7014]
10277
102