In [5]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

import tensorflow as tf

In [6]:
# Load the raw touchpoint sample; the path is relative to the notebook's working directory.
df = pd.read_csv("data_sample1.csv")

In [7]:
# Temporary: keep only journeys with ids 0..1999 so the notebook runs on a small subset.
# .copy() turns the filtered result into an independent frame, so the column assignments
# and row removals in later cells do not operate on a slice of the loaded frame
# (which would trigger SettingWithCopyWarning).
df = df[df['journey_id'].isin(range(2000))].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1290 entries, 0 to 1676009
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   transaction           1290 non-null   int64  
 1   platform              1290 non-null   object 
 2   country_name          1290 non-null   object 
 3   journey_id            1290 non-null   int64  
 4   channel_id            1290 non-null   int64  
 5   timestamp             1290 non-null   float64
 6   timestamp_conversion  0 non-null      float64
 7   s                     1290 non-null   int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 90.7+ KB


## Transform the transaction column so that only the last touchpoint before conversion has transaction == 1

In [8]:
# Time between each touchpoint and the journey's conversion (conversion minus touchpoint).
df['time_diff'] = df['timestamp_conversion'].sub(df['timestamp'])

In [9]:
# Touchpoints recorded after the conversion (negative time_diff); these rows are ignored.
# NOTE(review): an earlier comment mentioned 34 such rows - the count depends on the loaded subset.
df.loc[df['time_diff'].lt(0)]

Unnamed: 0,transaction,platform,country_name,journey_id,channel_id,timestamp,timestamp_conversion,s,time_diff


In [10]:
# Remove touchpoints that happened after the conversion (time_diff < 0).
# The `< 0` mask is negated (instead of testing `>= 0`) so that rows with a NaN time_diff,
# i.e. rows without a conversion timestamp, are kept exactly as before.
# A boolean mask plus reassignment replaces the in-place, label-based drop: it does not
# mutate a filtered slice and does not depend on the index labels being unique.
df = df.loc[~df['time_diff'].lt(0)].copy()

In [11]:
# Order the rows by journey and, within each journey, chronologically.
# This must be ONE multi-key sort: sorting by 'timestamp' and then by 'journey_id' in two
# separate calls does not work, because the default sort algorithm (quicksort) is not
# stable and the second sort may scramble the timestamp order inside a journey.
df = df.sort_values(['journey_id', 'timestamp'])

In [12]:
# For every journey, find the smallest time_diff, i.e. the touchpoint closest to the conversion.
# The string 'min' selects pandas' own groupby reduction; passing the Python builtin `min`
# raises a FutureWarning in recent pandas versions, which will call the builtin directly.
groups = df.groupby('journey_id').time_diff
min_val = groups.transform('min')

# Only the closest touchpoint is labelled 1. A NaN time_diff never compares equal, so
# journeys without a conversion timestamp get transaction == 0 on every row.
cond1 = df.time_diff == min_val

df['transaction'] = np.select([cond1], [1], default=0)


## Long Journeys

In [13]:
# Number of touchpoints per journey (non-null 's' values counted per journey_id).
journ_len = df.groupby("journey_id")["s"].count()
print(journ_len.describe())

# Upper quantiles of the journey length, used to judge where to cut off long journeys.
p1, p2 = 0.99, 0.9999
for p in (p1, p2):
    print('The ', p*100, '% quantile is ', np.quantile(journ_len, p))

count     99.000000
mean      13.030303
std       48.932159
min        1.000000
25%        1.000000
50%        3.000000
75%        8.500000
max      414.000000
Name: s, dtype: float64
The  99.0 % quantile is  265.0399999999994
The  99.99 % quantile is  412.51040000000023


In [14]:
# Upper bound on touchpoints per journey; journeys with more rows are removed entirely.
max_journ_len = 16
journey_sizes = df.groupby('journey_id')['journey_id'].transform('size')
df = df[journey_sizes <= max_journ_len]

## Remove irrelevant columns

In [15]:
# Drop the helper columns that cannot be used for prediction.
df = df.drop(columns=['s', 'timestamp_conversion', 'time_diff'])

## How to handle object variables

### Dummy (one-hot) variables for country, platform and channel: more accurate, but produces a much wider DataFrame

In [16]:
# One-hot encode the three categorical columns in a single pass. The order of `columns`
# fixes the order of the generated blocks: channel_*, then country_*, then platform_*.
df = pd.get_dummies(
    df,
    columns=['channel_id', 'country_name', 'platform'],
    prefix=['channel', 'country', 'platform'],
    prefix_sep='_',
    dtype=float,
)

### Ordinal encoder: less accurate (imposes an artificial order on the categories), but does not widen the DataFrame

In [17]:
#ordinal_encoder = OrdinalEncoder()
#for column in df.columns:
#    if df[column].dtypes == 'object':
#        df[column] = ordinal_encoder.fit_transform(df[[column]])

In [18]:
# Summary statistics of the encoded frame, as a sanity check before building tensors.
# NOTE(review): in the output stored with this notebook `transaction` is 0 on every row;
# df.info() earlier reports 0 non-null timestamp_conversion values for this subset,
# so no touchpoint can be labelled 1 - confirm on the full data set.
df.describe()

Unnamed: 0,transaction,journey_id,timestamp,channel_1,channel_2,channel_3,channel_4,channel_5,channel_6,channel_7,...,country_Switzerland,country_Thailand,country_Turkey,country_Ukraine,country_United Kingdom,country_United States,country_Vietnam,platform_android,platform_desktop,platform_mobileWeb
count,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,...,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0,394.0
mean,0.0,879.124365,4039.037559,0.553299,0.081218,0.119289,0.045685,0.091371,0.010152,0.038071,...,0.025381,0.005076,0.01269,0.020305,0.06599,0.121827,0.022843,0.053299,0.385787,0.560914
std,0.0,617.352009,1492.090242,0.497783,0.273517,0.324541,0.209067,0.288502,0.100373,0.191611,...,0.157478,0.071156,0.112077,0.141219,0.24858,0.327502,0.149592,0.224916,0.4874,0.496907
min,0.0,7.0,47.649824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,361.5,3039.233737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,797.5,3844.142053,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,1493.0,5375.00331,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,0.0,1965.0,6357.719382,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Next step: transform to tensor

In [49]:
def mta2tensor(df, max_len=None):
    """Convert the touchpoint frame into padded feature and label tensors.

    Every journey (all rows sharing one ``journey_id``) becomes one sequence. Journeys
    appear in order of first occurrence in ``df`` and keep the row order of ``df``.
    Sequences shorter than ``max_len`` are right-padded with all-zero rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'journey_id'`` and ``'transaction'``. Every other
        column is used as a feature.
    max_len : int, optional
        Length every sequence is padded to. Defaults to the notebook-level
        ``max_journ_len``.

    Returns
    -------
    tuple
        ``(x, y)`` as produced by ``tf.convert_to_tensor``: ``x`` is built from
        ``n_journeys`` sequences of ``max_len`` rows with one value per feature column,
        ``y`` from ``n_journeys`` sequences of ``max_len`` single-element label rows.

    Raises
    ------
    ValueError
        If a journey has more than ``max_len`` rows (it could not be padded to a
        rectangular shape).
    """
    if max_len is None:
        max_len = max_journ_len

    # Features exclude the grouping key AND the label. The previous version grouped the
    # full frame, so 'transaction' leaked into x, and padded with a hard-coded width.
    feature_cols = [col for col in df.columns if col not in ('journey_id', 'transaction')]
    n_features = len(feature_cols)

    x = []
    y = []
    # sort=False keeps journeys in order of first appearance.
    for journey_id, journey in df.groupby('journey_id', sort=False):
        n_pad = max_len - len(journey)
        if n_pad < 0:
            raise ValueError(
                f"journey {journey_id} has {len(journey)} touchpoints, "
                f"more than max_len={max_len}"
            )

        x1 = journey[feature_cols].values.tolist()
        y1 = [[label] for label in journey['transaction'].tolist()]

        # Zero rows bring every journey to the common length max_len.
        x1.extend([0] * n_features for _ in range(n_pad))
        y1.extend([0] for _ in range(n_pad))

        x.append(x1)
        y.append(y1)

    return tf.convert_to_tensor(x), tf.convert_to_tensor(y)


In [48]:
# Build the padded feature (x) and label (y) tensors from the prepared frame.
# NOTE(review): the output stored under this cell looks like debug prints from an earlier
# version of mta2tensor (the current definition prints nothing), and this cell's execution
# count (48) precedes the definition's (49) - re-run with Restart Kernel & Run All.
x, y = mta2tensor(df)


[[0]]
[[0]]
[[0], [0]]
[[0], [0], [0]]
[[0], [0], [0], [0]]
[[0]]
[[0], [0]]
[[0]]
[[0], [0]]
[[0], [0], [0]]
[[0], [0], [0], [0]]
[[0]]
[[0], [0]]
[[0], [0], [0]]
[[0], [0], [0], [0]]
[[0], [0], [0], [0], [0]]
[[0]]
[[0], [0]]
[[0], [0], [0]]
[[0], [0], [0], [0]]
[[0], [0], [0], [0], [0]]
[[0], [0], [0], [0], [0], [0]]
[[0], [0], [0], [0], [0], [0], [0]]
[[0], [0], [0], [0], [0], [0], [0], [0]]
[[0], [0], [0], [0], [0], [0], [0], [0], [0]]
[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0]]
[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]]
[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]]
[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]]
[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]]
[[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]]
[[0]]
[[0], [0]]
[[0], [0], [0]]
[[0], [0], [0], [0]]
[[0]]
[[0], [0]]
[[0], [0], [0]]
[[0]]
[[0], [0]]
[[0]]
[[0], [0]]
[[0], [0], [0]]
[[0], [0], [0], [0]]
[[0], [0], [0], 