In [40]:
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## 1. Load data

In [41]:
# Raw event log. Later cells read its event, event_value, event_timestamp,
# session_id and user_id_hash columns.
events = pd.read_csv('events.csv')

In [None]:
# Raw session log. Later cells read its start_timestamp, country, city,
# previous_sessions_duration and user_id_hash columns.
sessions = pd.read_csv('sessions.csv')

### 1.1 Split data into train and test

In [None]:
# convert a millisecond epoch timestamp into a UTC date-time string
def timeconvert(ts):
    """Format a millisecond Unix timestamp as a UTC 'YYYY-MM-DD HH:MM:SS' string.

    Parameters
    ----------
    ts : int, float or str
        Milliseconds since the Unix epoch; anything accepted by ``int()``.

    Returns
    -------
    str
        The UTC wall-clock time, truncated to whole seconds by the format.

    Raises
    ------
    ValueError, TypeError
        If ``ts`` cannot be converted with ``int()``.
    """
    seconds = int(ts) / 1000
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware
    # UTC datetime formats to exactly the same string.
    moment = datetime.fromtimestamp(seconds, tz=timezone.utc)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

# Parse each event's millisecond timestamp once, then store both the parsed
# datetime and its (1-based) day of the year on the events frame.
event_times = pd.to_datetime(events['event_timestamp'].apply(timeconvert))
events['time'] = event_times
events['day_current'] = event_times.dt.dayofyear

In [None]:
# Same conversion for sessions, keyed on the session start timestamp.
session_times = pd.to_datetime(sessions['start_timestamp'].apply(timeconvert))
sessions['time'] = session_times
sessions['day_current'] = session_times.dt.dayofyear

In [None]:
# Training window: rows whose day of year is at most 334 (30 November in a
# non-leap year, i.e. "before December"). The unfiltered frames serve as the
# test-time data.
# NOTE(review): day-of-year ignores the calendar year -- assumes all rows fall
# within a single year; confirm.
events_train = events[events['day_current'] <= 334]
sessions_train = sessions[sessions['day_current'] <= 334]

## 2. Feature Engineering
### 2.1 Events

In [None]:
# Number of purchase events (event code "8") per user in the trailing window.
# NOTE(review): the comparison is against the string "8"; if the `event`
# column is parsed as integers nothing will match -- confirm the dtype.

# train set: days 315..334, i.e. the 20 days before the training cut-off
recent_train_purchases = events_train[
    (events_train['event'] == "8") & (events_train['day_current'] > 314)
]
purchase_before_train = (
    recent_train_purchases
    .groupby("user_id_hash")['event']
    .count()
    .to_frame()
)

# test set: every purchase after day 328 in the full event log
# NOTE(review): presumably the last 20 days of the data set -- confirm the
# final day of year in events.
recent_purchases = events[(events['event'] == "8") & (events['day_current'] > 328)]
purchase_before = (
    recent_purchases
    .groupby("user_id_hash")['event']
    .count()
    .to_frame()
)

In [None]:
# Lifetime purchase value per user: sum of event_value over purchase events
# (event code "8"), for the training window and for the full event log.
purchase_events_train = events_train[events_train['event'] == "8"]
value_purchase_train = (
    purchase_events_train
    .groupby('user_id_hash')['event_value']
    .sum()
    .to_frame()
)

purchase_events = events[events['event'] == "8"]
value_purchase = (
    purchase_events
    .groupby('user_id_hash')['event_value']
    .sum()
    .to_frame()
)

### 2.2 Sessions

In [None]:
# Load the per-user labels from label.csv (generate it by running
# label_extract.py first). The frame is joined to other tables on
# user_id_hash and is the base table of the training features.
# NOTE(review): presumably provides user_purchase_binary_7_days and
# user_purchase_binary_14_days -- confirm against label_extract.py.
label = pd.read_csv('label.csv')

In [None]:
# Attach each user's label columns to every one of their sessions.
# Label columns left over from a previous execution of this cell are dropped
# first; otherwise a re-run would produce *_x / *_y duplicates and the later
# lookups of user_purchase_binary_14_days would fail.
sessions = pd.merge(
    sessions.drop(columns=label.columns.difference(['user_id_hash']), errors='ignore'),
    label,
    on='user_id_hash',
    how="left",
)

In [None]:
# Left-join the labels onto the training events (one row per event is kept).
# In the cells shown here only user_id_hash and session_id of the result are
# read afterwards.
events_train = events_train.merge(label, on='user_id_hash', how="left")

In [None]:
# Purchase rate of each country = users flagged as purchasers / all users seen
# in that country, then collapsed to one value per user.
# NOTE(review): the rate is derived from the label column itself; if that
# label is also the training target this is un-held-out target encoding --
# confirm this is intended.
user_per_country = sessions.groupby("country")['user_id_hash'].nunique()
purchase_per_country = (
    sessions.loc[sessions.user_purchase_binary_14_days == 1]
    .groupby("country")['user_id_hash']
    .nunique()
)
# Countries without any purchasing user are missing from the numerator, so the
# division gives NaN there; treat those as a 0 rate.
country_rate = (
    (purchase_per_country / user_per_country)
    .fillna(0)
    .rename('pct_country')
    .to_frame()
)
# Drop a pct_country column left by a previous execution of this cell so the
# merge never creates pct_country_x / pct_country_y (which would make the
# group-by below raise KeyError on a re-run).
sessions = pd.merge(
    sessions.drop(columns=['pct_country'], errors='ignore'),
    country_rate,
    on='country',
    how="left",
)

# A user seen in several countries gets the highest of their country rates.
pct_country = sessions.groupby("user_id_hash")['pct_country'].max().to_frame()
pct_country = pct_country.reset_index()
pct_country.columns = ['user_id_hash','pct_country']

In [None]:
# Purchase rate of each city = users flagged as purchasers / all users seen in
# that city, then collapsed to one value per user.
# NOTE(review): the rate is derived from the label column itself; if that
# label is also the training target this is un-held-out target encoding --
# confirm this is intended.
user_per_city = sessions.groupby("city")['user_id_hash'].nunique()
purchase_per_city = (
    sessions.loc[sessions.user_purchase_binary_14_days == 1]
    .groupby("city")['user_id_hash']
    .nunique()
)
# Cities without any purchasing user are missing from the numerator, so the
# division gives NaN there; treat those as a 0 rate.
city_rate = (
    (purchase_per_city / user_per_city)
    .fillna(0)
    .rename('pct_city')
    .to_frame()
)
# Drop a pct_city column left by a previous execution of this cell so the
# merge never creates pct_city_x / pct_city_y (which would make the group-by
# below raise KeyError on a re-run).
sessions = pd.merge(
    sessions.drop(columns=['pct_city'], errors='ignore'),
    city_rate,
    on='city',
    how="left",
)

# A user seen in several cities gets the highest of their city rates.
pct_city = sessions.groupby("user_id_hash")['pct_city'].max().to_frame()
pct_city = pct_city.reset_index()
pct_city.columns = ['user_id_hash','pct_city']

In [None]:
# Per-user mean of previous_sessions_duration, min-max scaled to [0, 1].
# The training-window and full-data versions are each scaled with their own
# minimum and maximum.
# NOTE: if every user had the same mean, the range would be 0 and the scaled
# values would come out as NaN.

# training window
train_mean_duration = sessions_train.groupby('user_id_hash')['previous_sessions_duration'].mean()
train_lowest = train_mean_duration.min()
train_highest = train_mean_duration.max()
sessions_duration_train = (
    (train_mean_duration - train_lowest) / (train_highest - train_lowest)
).to_frame()

# full data set
mean_duration = sessions.groupby('user_id_hash')['previous_sessions_duration'].mean()
lowest = mean_duration.min()
highest = mean_duration.max()
sessions_duration = ((mean_duration - lowest) / (highest - lowest)).to_frame()

In [None]:
# How many distinct session ids each user has in the event log
# (training window first, then the full data set).
session_unique_train = (
    events_train
    .groupby("user_id_hash")["session_id"]
    .nunique()
    .to_frame()
)
session_unique = (
    events
    .groupby("user_id_hash")["session_id"]
    .nunique()
    .to_frame()
)

### 2.3 Attributes

In [None]:
# Count, for every user, how many rows carry each attribute, and lay the
# counts out as a user x attribute table.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# on_bad_lines='warn' is its documented equivalent (skip malformed rows and
# emit a warning for each).
try:
    attributes = pd.read_csv('attributes.csv', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`; use the legacy flag there.
    attributes = pd.read_csv('attributes.csv', error_bad_lines=False)

attribute = (
    attributes
    .groupby(['user_id_hash', 'attribute'])['session_id']
    .count()  # non-null session_id rows per (user, attribute)
    .to_frame()
    .reset_index()
    # rows: users, columns: attribute codes, cells: counts (NaN when absent)
    .pivot(index='user_id_hash', columns='attribute', values='session_id')
)

In [None]:
# Preview the first rows of the user x attribute count table.
attribute.head()

Counts for most attributes are the same; thus, we only keep attributes 0, 66, and 67.

In [None]:
# Keep only the count columns for attribute codes 0, 66 and 67.
# The labels are integers, matching the values pivoted out of the
# `attribute` column.
attribute = attribute.loc[:, [0, 66, 67]]

## 3. Merge features together with label

In [None]:
# Assemble the training table: start from the labels and left-join every
# training-window feature on user_id_hash. Users lacking a feature get 0.
feature_train = (
    label
    .merge(session_unique_train, on="user_id_hash", how="left")
    .merge(purchase_before_train, on="user_id_hash", how="left")
    .merge(value_purchase_train, on="user_id_hash", how="left")
    .merge(sessions_duration_train, on="user_id_hash", how="left")
    .merge(pct_country, on="user_id_hash", how="left")
    .merge(pct_city, on="user_id_hash", how="left")
    .merge(attribute, on="user_id_hash", how="left")
    .fillna(0)
)
# Columns are renamed by position, so the list must follow the join order
# above (label columns first, then one name per joined feature column).
feature_train.columns = [
    "user_id_hash",
    "user_purchase_binary_7_days",
    "user_purchase_binary_14_days",
    "unique_session",
    "purchase_before",
    "value_purchase",
    "sessions_duration",
    'pct_country',
    'pct_city',
    '0',
    '66',
    '67',
]

In [None]:
# Load the example submission file; it is the base table onto which the test
# features are joined by user_id_hash.
res = pd.read_csv("sample_submission_2.csv")

In [None]:
# Assemble the test (scoring) table: start from the submission template and
# left-join the features computed on the WHOLE data set.
# The previous version joined the *_train frames here, so the test features
# stopped at the training cut-off and the whole-data frames built for the
# test set (session_unique, purchase_before, value_purchase,
# sessions_duration) were never used.
# NOTE(review): sessions_duration is min-max scaled with the full-data range,
# while the training feature uses the training-window range -- confirm this
# is acceptable for the model.
feature_test = pd.merge(res, session_unique, on = "user_id_hash", how = "left")
feature_test = pd.merge(feature_test, purchase_before, on = "user_id_hash", how = "left")
feature_test = pd.merge(feature_test, value_purchase, on = "user_id_hash", how = "left")
feature_test = pd.merge(feature_test, sessions_duration, on = "user_id_hash", how = "left")
feature_test = pd.merge(feature_test, pct_country, on = "user_id_hash", how = "left")
feature_test = pd.merge(feature_test, pct_city, on = "user_id_hash", how = "left")
feature_test = pd.merge(feature_test, attribute, on = "user_id_hash", how = "left")
# Users without a given feature get 0.
feature_test = feature_test.fillna(0)
# Columns are renamed by position, so the list must follow the join order
# above and match the training table's column names.
feature_test.columns = ["user_id_hash", "user_purchase_binary_7_days", "user_purchase_binary_14_days",
                  "unique_session","purchase_before","value_purchase","sessions_duration",'pct_country','pct_city','0','66','67']

In [None]:
# Preview the first rows of the assembled training feature table.
feature_train.head()

In [None]:
# Write both feature tables to disk as CSV files, without the row index.
feature_train.to_csv("feature_train_org.csv",index=False)
feature_test.to_csv("feature_test_org.csv",index=False)