In [12]:
import numpy as np
import pandas as pd
import subprocess
import argparse

# Preprocessing RecSys 2017

For the RecSys 2017 dataset we first need to artificially create sessions out of the user internactions

In [13]:
def make_sessions(data, 
                  session_th=30 * 60, 
                  is_ordered=False, 
                  user_key='user_id', 
                  item_key='item_id', 
                  time_key='ts'):
    """Assigns session ids to the events in data without grouping keys"""
    if not is_ordered:
        # sort data by user and time
        data.sort_values(by=[user_key, time_key], ascending=True, inplace=True)
    # compute the time difference between queries
    tdiff = np.diff(data[time_key].values)
    # check which of them are bigger then session_th
    split_session = tdiff > session_th
    split_session = np.r_[True, split_session]
    # check when the user chenges is data
    new_user = data['user_id'].values[1:] != data['user_id'].values[:-1]
    new_user = np.r_[True, new_user]
    # a new sessions stars when at least one of the two conditions is verified
    new_session = np.logical_or(new_user, split_session)
    # compute the session ids
    session_ids = np.cumsum(new_session)
    data['session_id'] = session_ids
    return data

# Test set

A test set can be either created by (1) adding the last session of every user to be tested or, (2) making a time-based split.

In [14]:
def last_session_out_split(data,
                           user_key='user_id',
                           item_key='item_id',
                           session_key='session_id',
                           time_key='ts',
                           clean_test=True,
                           min_session_length=2):
    """
    last-session-out split
    assign the last session of every user to the test set and the remaining ones to the training set
    """
    sessions = data.sort_values(by=[user_key, time_key]).groupby(user_key)[session_key]
    last_session = sessions.last()
    train = data[~data.session_id.isin(last_session.values)].copy()
    test = data[data.session_id.isin(last_session.values)].copy()
    if clean_test:
        train_items = train[item_key].unique()
        test = test[test[item_key].isin(train_items)]
        #  remove sessions in test shorter than min_session_length
        slen = test[session_key].value_counts()
        good_sessions = slen[slen >= min_session_length].index
        test = test[test[session_key].isin(good_sessions)].copy()
    return train, test

In [15]:
def last_n_days_out_split(data, n=1,
                          user_key='user_id',
                          item_key='item_id',
                          session_key='session_id',
                          time_key='ts',
                          clean_test=True,
                          min_session_length=2):
    """
    last n-days out split
    assign the sessions in the last n days to the test set and remaining to the training one
    """
    DAY = 24 * 60 * 60
    data.sort_values(by=[user_key, time_key], inplace=True)
    # start times of all sessions
    #sessions_start = data.groupby(session_key)[time_key].agg('min')
    # extract test start and end time
    end_time = data[time_key].max()
    test_start = end_time - n * DAY
    
    # get train and test indicies
    session_max_times = data.groupby(session_key)[time_key].max()
    session_train = session_max_times[session_max_times < test_start].index
    session_test = session_max_times[session_max_times >= test_start].index
    
    # in1d: Returns a boolean array the same length as ar1 that is True where 
    # an element of ar1 is in ar2 and False otherwise.
    train = data[
        np.in1d(
            data[session_key], 
            session_train
        )
    ].copy()
    test = data[
        np.in1d(
            data[session_key], 
            session_test
        )
    ].copy()

    #train = data[data.session_id.isin(sessions_start[sessions_start < test_start].index)].copy()
    #test = data[data.session_id.isin(sessions_start[sessions_start >= test_start].index)].copy()
    if clean_test:
        before_items = len(test[item_key].unique())
        # remove items which do not occur in the test set
        test = test[np.in1d(test[item_key], train[item_key])]
        after_items = len(test[item_key].unique())
        print("Before item count: " + str(before_items))
        print("After item count: " + str(after_items))
        
        #  remove sessions in test shorter than min_session_length
        
        tslength = test.groupby(session_key).size()
        test = test[
           np.in1d(
                test[session_key], 
                tslength[tslength >= min_session_length].index
            )
        ].copy()
    

    return train, test

#  1. RecSys17 processing

In [16]:
path =  "../../data/"
dataset = "recsys17/"

raw_path = path + dataset + "raw/" 
interim_path = path + dataset + "interim/"
processed_path = path + dataset + "processed/"

For the RecSys17 dataset, we:
* Remove the **delete recommendation** and **recruiter interest** interactions as these are not relevant in our setting
* **Discard** the **impression interaction** as these denote that XING showed
the corresponding job to a user. As stated by Bianchi, et al., 2017, the **presence of an impression does not imply** that the **user interacted with the job** and would thus **introduce bias** and possibly lead to learning a model that mimics XINGs recommender engine

Sessions are partitioned by a **30-minute** idle time

Keep all sessions: users with >= 2 sessions and also overly active ones (< 200,000 sessions)

In [21]:
interactions = pd.read_csv(raw_path + "interactions.csv", header=0, sep='\t')
print("Start Time: {}".format(pd.to_datetime(interactions["created_at"].min(), unit="s")))
print("Start Time: {}".format(pd.to_datetime(interactions["created_at"].max(), unit="s")))

# remove NaN values (should have only 1)
interactions = interactions[np.isfinite(interactions['created_at'])]
# convert back to long from float
interactions['created_at'] = interactions['created_at'].astype(np.int64)

# remove impressions
interactions = interactions[interactions.interaction_type >= 1].copy()
# remove delete and headhunter event types
interactions = interactions[interactions.interaction_type < 4].copy()


interactions['interaction_type'] = interactions['interaction_type'].fillna(0).astype('int')


print('Building sessions')
# partition interactions into sessions with 30-minutes idle time
interactions = make_sessions(interactions, session_th=30 * 60, time_key='created_at', is_ordered=False)


print(interactions.head(3))
# drop 189 duplicate interactions
interactions = interactions.drop_duplicates(['session_id','created_at'])

print('Original data:')
print('Num items: {}'.format(interactions.item_id.nunique()))
print('Num users: {}'.format(interactions.user_id.nunique()))
print('Num sessions: {}'.format(interactions.session_id.nunique()))

print('Filtering data')
# drop duplicate interactions within the same session
interactions.drop_duplicates(subset=['item_id', 'session_id', 'interaction_type'], keep='first', inplace=True)

# keep items with >=1 interactions
item_pop = interactions.item_id.value_counts()
good_items = item_pop[item_pop >= 1].index
inter_dense = interactions[interactions.item_id.isin(good_items)]

# remove sessions with length < 2
session_length = inter_dense.session_id.value_counts()
good_sessions = session_length[session_length >= 3].index
inter_dense = inter_dense[inter_dense.session_id.isin(good_sessions)]

# let's keep only returning users (with >= 5 sessions) and remove overly active ones (>=200 sessions)
sess_per_user = inter_dense.groupby('user_id')['session_id'].nunique()
good_users = sess_per_user[(sess_per_user >= 1) & (sess_per_user < 200000)].index
inter_dense = inter_dense[inter_dense.user_id.isin(good_users)]
print('Filtered data:')
print('Num items: {}'.format(inter_dense.item_id.nunique()))
print('Num users: {}'.format(inter_dense.user_id.nunique()))
print('Num sessions: {}'.format(inter_dense.session_id.nunique()))

inter_dense.to_csv(interim_path + "interactions.csv", sep='\t')

Start Time: 2016-11-06 09:19:02
Start Time: 2017-02-07 20:29:23
Building sessions
        user_id  item_id  interaction_type  created_at  session_id
25654        13   118310                 1  1484211749           1
630641       13  1875610                 1  1486027147           2
630640       13  1875610                 2  1486042146           3
Original data:
Num items: 51147
Num users: 249987
Num sessions: 456661
Filtering data
Filtered data:
Num items: 20135
Num users: 13369
Num sessions: 18284


# 2. Create train and test set by doing a time-based (2 weeks) split

In [22]:
print('Partitioning data')
# last-session-out partitioning
train_full_sessions, test_sessions = last_n_days_out_split(inter_dense, n=14,
                                                            user_key='user_id',
                                                            item_key='item_id',
                                                            session_key='session_id',
                                                            time_key='created_at',
                                                            clean_test=True)
train_valid_sessions, valid_sessions = last_n_days_out_split(train_full_sessions, n=14,
                                                              user_key='user_id',
                                                              item_key='item_id',
                                                              session_key='session_id',
                                                              time_key='created_at',
                                                              clean_test=True)

Partitioning data
Before item count: 6789
After item count: 2340
Before item count: 7605
After item count: 1758


In [23]:
# print statistics

train_len = len(train_full_sessions.session_id.unique())
train_item_len = len(train_full_sessions.item_id.unique())

test_len = len(test_sessions.session_id.unique())
test_item_len = len(test_sessions.item_id.unique())

merged_items = train_full_sessions.append(test_sessions, ignore_index=True)
merged_item_len = len(merged_items.item_id.unique())

print("Training - Sessions: " + str(train_len))
print("Testing - Sessions: " + str(test_len))
print("Train + Test - Sessions: " + str(train_len + test_len))

print("Training - Items: " + str(train_item_len))
print("Testing - Items: " + str(test_len))
print("Train + Test - Items: " + str(merged_item_len))


print("Train Validating - Sessions: " + str(len(train_valid_sessions.session_id.unique())))
print("Test Validating - Sessions: " + str(len(valid_sessions.session_id.unique())))

Training - Sessions: 12712
Testing - Sessions: 3610
Train + Test - Sessions: 16322
Training - Items: 15686
Testing - Items: 3610
Train + Test - Items: 15686
Train Validating - Sessions: 8001
Test Validating - Sessions: 2046


# 3. Store train and test sets

In [24]:
train_full_sessions.to_csv(processed_path + "train_14d.csv", sep='\t')
test_sessions.to_csv(processed_path + "test_14d.csv", sep='\t')
train_valid_sessions.to_csv(processed_path + "valid_train_14d.csv", sep='\t')
valid_sessions.to_csv(processed_path + "valid_test_14d.csv", sep='\t')

# 4. Create train and test session vectors

In [None]:
# Create vocabulary from train set
unqiue_train_items = train_full_sessions.item_id.unique()
# store (or load)
unqiue_train_items_df = pd.DataFrame(unqiue_train_items, columns=["item_id"])
print(len(unqiue_train_items_df))
unqiue_train_items_df.to_csv(interim_path + 'vocabulary.csv', header=True)
unqiue_train_items_df = pd.read_csv(interim_path + 'vocabulary.csv', index_col=0)


unqiue_train_items_dict = unqiue_train_items_df.to_dict('dict')["item_id"]
# inverse that item_id is key and index is value
unqiue_train_items_dict_inv = {v: k for k, v in unqiue_train_items_dict.items()}
print(unqiue_train_items_dict_inv[864950])

# session_vectors = []
session_vectors_np = []
session_groups = train_full_sessions.groupby("session_id")

print(str(len(session_groups)) + " sessions to encode.")
s_counter = 0      
for session_id, session_group in session_groups:
    # vector length = len(unqiue_train_items)
    session_vector = np.zeros((len(unqiue_train_items),), dtype=int)
    # fill 1s for session items
    for index, row in session_group.iterrows():
        item_index = unqiue_train_items_dict_inv[row["item_id"]]
        #item_index = unqiue_train_items.index(row["item_id"])
        # 1-hot encode
        session_vector[item_index] = 1
        #break
    # append session vector
#     session_vectors.append(session_vector)
    session_vectors_np.append(np.insert(session_vector, 0, s_counter))
    s_counter += 1
    if (s_counter % 10000 == 0):
        print(str(len(session_groups) - s_counter) + " sessions remaining to encode.")

# session_vector_df = pd.DataFrame(session_vectors)
# session_vector_df.head()

15686
0
12712 sessions to encode.


In [None]:
# session_vector_df.to_csv(interim_path + 'train_session_interaction_vector.csv', header=True)
a = np.vstack(session_vectors_np)
header = ",".join(map(str, range(len(unqiue_train_items))))
np.savetxt(interim_path + 'train_session_interaction_vector.csv', a, header=header, delimiter=",", fmt="%d", comments=",")
a

# Statistics

In [12]:
import matplotlib.pyplot as plt
interactions.interaction_type.value_counts().plot(kind='bar')
plt.show()

<Figure size 640x480 with 1 Axes>

In [13]:
print('Train Num items: {}'.format(train_full_sessions.item_id.nunique()))
print('Train Num sessions: {}'.format(train_full_sessions.session_id.nunique()))
print('Train Num events: {}'.format(len(train_full_sessions)))
print('Test Num items: {}'.format(test_sessions.item_id.nunique()))
print('Test Num sessions: {}'.format(test_sessions.session_id.nunique()))
print('Test Num events: {}'.format(len(test_sessions)))


Train Num items: 26722
Train Num sessions: 51698
Train Num events: 125302
Test Num items: 4401
Test Num sessions: 19711
Test Num events: 43122


In [14]:
interactions = pd.read_csv("../../data/recsys17/raw/interactions.csv", header=0, sep='\t')

# remove NaN values (should have only 1)
interactions = interactions[np.isfinite(interactions['created_at'])]
# convert back to long from float
interactions['created_at'] = interactions['created_at'].astype(np.int64)

# remove impressions
interactions = interactions[interactions.interaction_type >= 1].copy()
# remove delete and headhunter event types
interactions = interactions[interactions.interaction_type < 4].copy()


interactions['interaction_type'] = interactions['interaction_type'].fillna(0).astype('int')



print('Building sessions')
# partition interactions into sessions with 30-minutes idle time
interactions = make_sessions(interactions, session_th=30 * 60, time_key='created_at', is_ordered=False)


print(interactions.head(3))
# drop 189 duplicate interactions
interactions = interactions.drop_duplicates(['item_id','session_id','created_at'])

print('Original data:')
print('Num items: {}'.format(interactions.item_id.nunique()))
print('Num users: {}'.format(interactions.user_id.nunique()))
print('Num sessions: {}'.format(interactions.session_id.nunique()))

print('Filtering data')
# keep items with >=20 interactions
item_pop = interactions.item_id.value_counts()
good_items = item_pop[item_pop >= 1].index
inter_dense = interactions[interactions.item_id.isin(good_items)]
# remove sessions with length < 3
session_length = inter_dense.session_id.value_counts()
good_sessions = session_length[session_length >= 3].index
inter_dense = inter_dense[inter_dense.session_id.isin(good_sessions)]
# let's keep only returning users (with >= 5 sessions) and remove overly active ones (>=200 sessions)
sess_per_user = inter_dense.groupby('user_id')['session_id'].nunique()
good_users = sess_per_user[(sess_per_user >= 1) & (sess_per_user < 200000)].index
inter_dense = inter_dense[inter_dense.user_id.isin(good_users)]
print('Filtered data:')
print('Num items: {}'.format(inter_dense.item_id.nunique()))
print('Num users: {}'.format(inter_dense.user_id.nunique()))
print('Num sessions: {}'.format(inter_dense.session_id.nunique()))

store_path = "../../data/recsys17/"
inter_dense.to_csv(store_path + "filtered.csv", sep='\t')

print('Partitioning data')
# last-session-out partitioning
train_full_sessions, test_sessions = last_n_days_out_split(inter_dense, n=14,
                                                            user_key='user_id',
                                                            item_key='item_id',
                                                            session_key='session_id',
                                                            time_key='created_at',
                                                            clean_test=True)
train_valid_sessions, valid_sessions = last_n_days_out_split(train_full_sessions, n=14,
                                                              user_key='user_id',
                                                              item_key='item_id',
                                                              session_key='session_id',
                                                              time_key='created_at',
                                                              clean_test=True)

print("Data - Sessions: " + str(len(inter_dense.session_id.unique())))
print("Training - Sessions: " + str(len(train_full_sessions.session_id.unique())))
print("Testing - Sessions: " + str(len(test_sessions.session_id.unique())))
print("Train Validating - Sessions: " + str(len(train_valid_sessions.session_id.unique())))
print("Test Validating - Sessions: " + str(len(valid_sessions.session_id.unique())))

train_full_sessions.to_csv(store_path + "train_d14.csv", sep='\t')
test_sessions.to_csv(store_path + "test_d14.csv", sep='\t')
train_valid_sessions.to_csv(store_path + "valid_train_d14.csv", sep='\t')
valid_sessions.to_csv(store_path + "valid_test_d14.csv", sep='\t')

print('Train Num items: {}'.format(train_full_sessions.item_id.nunique()))
print('Train Num sessions: {}'.format(train_full_sessions.session_id.nunique()))
print('Train Num events: {}'.format(len(train_full_sessions)))
print('Test Num items: {}'.format(test_sessions.item_id.nunique()))
print('Test Num sessions: {}'.format(test_sessions.session_id.nunique()))
print('Test Num events: {}'.format(len(test_sessions)))

Building sessions
        user_id  item_id  interaction_type  created_at  session_id
25654        13   118310                 1  1484211749           1
630641       13  1875610                 1  1486027147           2
630640       13  1875610                 2  1486042146           3
Original data:
Num items: 51181
Num users: 249987
Num sessions: 456661
Filtering data
Filtered data:
Num items: 23323
Num users: 21362
Num sessions: 28753
Partitioning data
Before item count: 8141
After item count: 3048
Before item count: 8990
After item count: 2316
Data - Sessions: 28753
Training - Sessions: 20075
Testing - Sessions: 5257
Train Validating - Sessions: 11994
Test Validating - Sessions: 3860
Train Num items: 18230
Train Num sessions: 20075
Train Num events: 78666
Test Num items: 2755
Test Num sessions: 5257
Test Num events: 14817
