# This notebook is here for preprocessing the data

In [49]:
import pandas as pd
import numpy as np

from numpy import random
from sklearn.preprocessing import OrdinalEncoder

# Seed NumPy's global random generator so any NumPy-based randomness is repeatable.
np.random.seed(246)

In [50]:
# Read the three pre-split CSV files and tag every row with a split marker `s`
# (1 = train, 2 = valid, 3 = test) so the splits can be recovered after the
# joint preprocessing below.
# NOTE(review): the displayed frames contain an 'Unnamed: 0' column, presumably
# a row index written by an earlier to_csv call -- confirm whether it should be kept.
df_train = pd.read_csv("smaller_train.csv").assign(s=1)
df_valid = pd.read_csv("smaller_valid.csv").assign(s=2)
df_test = pd.read_csv("smaller_test.csv").assign(s=3)

# Stack the splits into one frame with a fresh 0..n-1 index.
df = pd.concat([df_train, df_valid, df_test], ignore_index=True)

In [51]:
# Display all rows whose transaction flag equals 1.
# NOTE(review): in the displayed output, flagged rows can lie well before
# timestamp_conversion -- presumably the motivation for the next section; confirm.
df.loc[df['transaction'].eq(1)]

Unnamed: 0,transaction,platform,country_name,journey_id,channel_id,timestamp,timestamp_conversion,s
525,1,desktop,South Korea,130944,2,4236.409495,4239.122778,1
526,1,mobileWeb,Indonesia,130978,3,4302.195794,6452.081111,1
527,1,mobileWeb,United States,130990,3,2941.232592,2947.790278,1
528,1,mobileWeb,Germany,131006,6,2673.734090,2891.494722,1
529,1,mobileWeb,France,131046,6,4990.108078,4990.150000,1
...,...,...,...,...,...,...,...,...
22287,1,mobileWeb,Greece,134690,8,4176.732051,4561.416944,3
22288,1,mobileWeb,Germany,137848,1,5975.486344,6064.763889,3
22289,1,desktop,Germany,133728,3,4500.611033,5562.872778,3
22290,1,mobileWeb,Australia,132798,1,6170.508791,6174.803611,3


## Transform the transaction column so that only the last touchpoint (tp) before conversion has transaction == 1

In [52]:
# Time remaining from each touchpoint until the conversion
# (negative when the touchpoint is timestamped after the conversion).
df['time_diff'] = df['timestamp_conversion'].sub(df['timestamp'])

In [53]:
# Inspect touchpoints timestamped after the conversion (negative time_diff);
# these rows are removed in the following cell.
df.loc[df['time_diff'].lt(0)]

Unnamed: 0,transaction,platform,country_name,journey_id,channel_id,timestamp,timestamp_conversion,s,time_diff
10138,1,android,Poland,137811,11,4029.156188,4029.151944,1,-0.004243


In [54]:
# Remove touchpoints that occur after the transaction (time_diff < 0).
after_conversion_idx = df.index[df['time_diff'] < 0]
df = df.drop(index=after_conversion_idx)

In [55]:
# Order rows by journey and, within each journey, chronologically.
# A single multi-key sort is used on purpose: two successive single-key sorts
# rely on the second sort being stable, but sort_values defaults to the
# non-stable 'quicksort', so the timestamp order inside a journey could be lost.
df = df.sort_values(['journey_id', 'timestamp'])

In [56]:
# This transforms df so that only the last touchpoint before conversion gets transaction = 1. Because this leaves too few
# observations with transaction == 1, we don't consider it for now.

#groups = df.groupby('journey_id').time_diff
#min_val = groups.transform(min) #search minimal time_diff in each group <=> closest tp to conversion

#cond1 = df.time_diff==min_val #define condition when transaction should be 1

#df['transaction'] = np.select([cond1], [1], default = 0) #transform transaction


## Long Journeys

In [57]:
# Number of rows (touchpoints) per journey_id.
journ_len = df['s'].groupby(df['journey_id']).count()
print(journ_len.describe())

# Report two upper quantiles of the journey-length distribution.
p1 = 0.99
p2 = 0.9999
for p in (p1, p2):
    print('The ', p*100, '% quantile is ', np.quantile(journ_len, p))

count    2035.000000
mean       10.953808
std        18.527553
min         1.000000
25%         2.000000
50%         4.000000
75%        11.000000
max       100.000000
Name: s, dtype: float64
The  99.0 % quantile is  98.0
The  99.99 % quantile is  100.0


In [58]:
# Keep only journeys with at most `max_journ_len` touchpoints.
max_journ_len = 16
# Broadcast each journey's row count back onto its rows, then mask.
journey_sizes = df.groupby('journey_id')['journey_id'].transform('size')
df = df[journey_sizes <= max_journ_len]

## How to handle object variables

### Dummy variables for country, platform and channel: a better encoding, but it makes the data set much wider

In [59]:
# One-hot encode the three categorical columns in a single pass. The dummy
# columns are appended after the remaining columns in the order listed here
# (channel_*, country_*, platform_*) and stored as floats.
df = pd.get_dummies(
    df,
    columns=['channel_id', 'country_name', 'platform'],
    prefix=['channel', 'country', 'platform'],
    prefix_sep='_',
    dtype=float,
)

### Ordinal Encoder, not really accurate, but doesn't blow up df

In [60]:
#ordinal_encoder = OrdinalEncoder()
#for column in df.columns:
#    if df[column].dtypes == 'object':
#        df[column] = ordinal_encoder.fit_transform(df[[column]])

## Split train / valid / test

In [61]:
# Recover the original splits from the marker column `s`
# (1 = train, 2 = valid, 3 = test).
df_train, df_valid, df_test = (df[df['s'] == split_id] for split_id in (1, 2, 3))

## Remove irrelevant columns

In [62]:
# Columns that can't be used for prediction: the split marker and the
# conversion-time information (time_diff is derived from timestamp_conversion).
non_feature_cols = ['s', 'timestamp_conversion', 'time_diff']
df_train = df_train.drop(columns=non_feature_cols)
df_test = df_test.drop(columns=non_feature_cols)
df_valid = df_valid.drop(columns=non_feature_cols)

Put timestamp at the last position in the data set

In [63]:
# Move `timestamp` to the last column of each split, in place:
# remove the column first, then re-assign it, which appends it at the end.
for frame in (df_train, df_test, df_valid):
    timestamp_col = frame.pop('timestamp')
    frame['timestamp'] = timestamp_col


## Store as CSV

In [64]:
#df_train.to_csv('train.csv')
#print("training data is preprocessed and stored")
#df_valid.to_csv('valid.csv')
#print("valid data is preprocessed and stored")
#df_test.to_csv('test.csv')
#print("test data is preprocessed and stored")

training data is preprocessed and stored
valid data is preprocessed and stored
test data is preprocessed and stored
