In [1]:
import gzip
import itertools
import collections
import numpy as np
import item2vec

In [2]:
# Seed NumPy's global random number generator so that any NumPy-based
# randomness executed after this cell is repeatable across kernel restarts.
# NOTE(review): whether item2vec draws from NumPy's global RNG is not visible
# in this notebook -- confirm before relying on this for full reproducibility.
np.random.seed(101)

In [3]:
# Load the viewing sessions: one session per line, whitespace-separated
# position-opening ids. Ids repeated within a line are collapsed via set(),
# so each session lists every id at most once (in arbitrary set order).
with gzip.open('data/view_position_opening_sessions.txt.gz', 'rb') as sessions_file:
    list_of_sessions_with_position_opening_ids = [
        list(set(raw_line.strip().split()))
        for raw_line in sessions_file
    ]

In [4]:
# Build a lookup from position-opening id to location name.
# Every line of the file must split into exactly three comma-separated
# fields (id, <ignored middle field>, location name); any other field count
# raises ValueError during unpacking. If an id occurs more than once, the
# last occurrence wins.
with gzip.open('data/postion_opening_categorical_variables.csv.gz', 'rb') as categorical_file:
    split_rows = (raw_line.strip().split(",") for raw_line in categorical_file)
    postion_opening_id_2_location_name = {
        opening_id: location
        for opening_id, _, location in split_rows
    }

In [5]:
# Translate every session from position-opening ids to the set of distinct
# location names viewed in that session. An id that is missing from the
# lookup raises KeyError.
list_of_sessions_with_location_names_unfiltered = [
    {postion_opening_id_2_location_name[opening_id] for opening_id in session_ids}
    for session_ids in list_of_sessions_with_position_opening_ids
]

In [6]:
# Flatten the per-session sets of location names into one sequence.
# The result is materialised as a list rather than left as a one-shot
# itertools.chain iterator: a bare iterator is exhausted by its first
# consumer, so re-running a later cell that reads this variable (without
# re-running this cell first) would silently see an empty iterable.
list_of_sessions_with_location_names_flattened = list(
    itertools.chain.from_iterable(list_of_sessions_with_location_names_unfiltered)
)

In [7]:
# Keep only the location names that occur strictly more often than the
# threshold in the flattened sequence. The resulting order follows the
# Counter's own iteration order.
min_occurrences_exclusive = 200
location_occurrences = collections.Counter(list_of_sessions_with_location_names_flattened)
frequently_viewed_locations = []
for name, occurrences in location_occurrences.items():
    if occurrences > min_occurrences_exclusive:
        frequently_viewed_locations.append(name)

In [8]:
# Assign each frequently viewed location a dense integer index equal to its
# position in frequently_viewed_locations.
location_name_to_index = {
    name: position
    for position, name in enumerate(frequently_viewed_locations)
}

In [9]:
# Re-encode each session as a list of integer location indices.
# Locations that are not frequently viewed are dropped, and a session is
# kept only if at least two frequently viewed locations remain in it.
list_of_sessions_with_location_indices = []
for session in list_of_sessions_with_location_names_unfiltered:
    only_frequently_viewed_in_session = session.intersection(frequently_viewed_locations)
    if len(only_frequently_viewed_in_session) > 1:
        # Build a concrete list. map() only returns a list on Python 2; on
        # Python 3 it is a lazy, single-use iterator with no len(), which
        # would leave every stored session unusable after one pass.
        list_of_sessions_with_location_indices.append(
            [location_name_to_index[location_name]
             for location_name in only_frequently_viewed_in_session]
        )

In [10]:
# Train the item2vec model on the index-encoded sessions.
# NOTE(review): item2vec is a project-local module whose signatures are not
# visible in this notebook; the positional arguments are passed through
# unchanged and the remarks below are assumptions to confirm against it.
batch_size = 128
next_batch = item2vec.create_batch_generator(
    list_of_sessions_with_location_indices,
    batch_size,
)
x = item2vec.run(
    750001,  # presumably the number of training steps (the log reports a loss per step) -- TODO confirm
    next_batch,
    0.5,  # meaning not visible here -- see item2vec.run
    batch_size,
    100,  # meaning not visible here -- see item2vec.run
    len(frequently_viewed_locations),  # number of distinct location indices
    15,  # meaning not visible here -- see item2vec.run
)

2016-09-09 16:03:41.279313 Average loss at step 0: 191.926345825

2016-09-09 16:06:02.716560 Average loss at step 20000: 4.12253415415

2016-09-09 16:08:22.346069 Average loss at step 40000: 3.55853750117

2016-09-09 16:10:47.872898 Average loss at step 60000: 3.54658460959

2016-09-09 16:13:13.622302 Average loss at step 80000: 3.53928644782

2016-09-09 16:15:38.641889 Average loss at step 100000: 3.53617465142

2016-09-09 16:18:04.482675 Average loss at step 120000: 3.53224128134

2016-09-09 16:20:30.840844 Average loss at step 140000: 3.53035159198

2016-09-09 16:22:55.308304 Average loss at step 160000: 3.528927601

2016-09-09 16:25:14.637997 Average loss at step 180000: 3.52931641765

2016-09-09 16:27:37.995939 Average loss at step 200000: 3.52876839567

2016-09-09 16:29:59.223364 Average loss at step 220000: 3.52790833496

2016-09-09 16:32:19.822820 Average loss at step 240000: 3.52827636063

2016-09-09 16:34:39.572953 Average loss at step 260000: 3.52589493797

2016-09-09 16:36:

In [11]:
# Export the result as CSV: one line per row of x, holding the location name
# at the same position in frequently_viewed_locations followed by that row's
# components, all comma-separated.
with open("work/location_vectors.csv","w") as vectors_file:
    for row_index in range(x.shape[0]):
        label = str(frequently_viewed_locations[row_index])
        components = ",".join(str(value) for value in x[row_index])
        vectors_file.write(label + "," + components + "\n")