In [1]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import joblib
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score, plot_roc_curve
from sklearn.preprocessing import MultiLabelBinarizer
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use the original name when it exists and fall back to the renamed one otherwise.
plt.style.use(
    'seaborn-whitegrid'
    if 'seaborn-whitegrid' in plt.style.available
    else 'seaborn-v0_8-whitegrid'
)

In [2]:
# Load the three line-delimited JSON datasets
offers = pd.read_json('data/portfolio.json', orient='records', lines=True)
customers = pd.read_json('data/profile.json', orient='records', lines=True)
events = pd.read_json('data/transcript.json', orient='records', lines=True)

# Rename the columns positionally (this relies on the column order of each file)
offers.columns = [
    'reward', 'channels', 'min_spend', 'expire_days', 'offer_type', 'id',
]
customers.columns = ['gender', 'age', 'id', 'became_member_on', 'income']
events.columns = ['customer_id', 'event', 'value', 'hours_from_test']

# Report how many rows each table has
for label, table in [('offers', offers), ('customers', customers), ('events', events)]:
    print(f'{label} has {len(table)} rows')

offers has 10 rows
customers has 17000 rows
events has 306534 rows


In [3]:
# Normalise the payload key: records whose only key is 'offer id'
# are rewritten (in place) to use 'offer_id' instead
for payload in events['value']:
    if list(payload) == ['offer id']:
        payload['offer_id'] = payload.pop('offer id')

# Expand the payload dicts into their own columns, attach them to the
# events table, then rename the new columns and drop the raw payload
temp = pd.DataFrame(events['value'].to_list())
events = (
    pd.concat([events, temp], axis=1)
    .rename(columns={'reward': 'reward_received', 'amount': 'amount_spent'})
    .drop(columns='value')
)
events.head()

Unnamed: 0,customer_id,event,hours_from_test,offer_id,amount_spent,reward_received
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,,
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,,
2,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,,
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,fafdcd668e3743c1bb461111dcafc2a4,,
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,4d5c57ea9a6940dd891ad53e9dbe8da0,,


### 6 customers did not receive any offers

In [60]:
# Count every event type per customer: one row per customer, one column per
# event type, NaN where a customer has no event of that type
customer_funnel = (
    events.groupby(['customer_id', 'event'])['event']
    .count()
    .unstack('event')
)
customer_funnel.head()

event,offer completed,offer received,offer viewed,transaction
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0009655768c64bdeb2e877511632db8f,3.0,5.0,4.0,8.0
00116118485d4dfda04fdbaba9a87b5c,,2.0,2.0,3.0
0011e0d4e6b944f998e987f904e8c1e5,3.0,5.0,5.0,5.0
0020c2b971eb4e9188eac86d93036a77,3.0,5.0,3.0,8.0
0020ccbbb6d84e358d3414a3ff76cffd,3.0,4.0,4.0,12.0


In [61]:
# These six customers did not receive any offer during the test period,
# so they are removed from all the data
customer_funnel.loc[customer_funnel['offer received'].isna()]

event,offer completed,offer received,offer viewed,transaction
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12ede229379747bd8d74ccdc20097ca3,,,,3.0
3a4874d8f0ef42b9a1b72294902afea9,,,,3.0
ae8111e7e8cd4b60a8d35c42c1110555,,,,5.0
c6e579c6821c41d1a7a6a9cf936e91bb,,,,4.0
da7a7c0dcfcb41a8acc7864a53cf60fb,,,,1.0
eb540099db834cf59001f83a4561aef3,,,,4.0


In [67]:
# Ids of customers with no 'offer received' event; drop them from both tables
deleted_customers = customer_funnel.loc[customer_funnel['offer received'].isna()].index
events = events.loc[~events['customer_id'].isin(deleted_customers)]
customers = customers.loc[~customers['id'].isin(deleted_customers)]

### Create a table recording the time each customer received their first offer

In [68]:
# Earliest 'offer received' hour per customer, as a two-column table
# (customer_id, first_offer_time)
first_offer_time = (
    events.loc[events['event'] == 'offer received']
    .groupby('customer_id')['hours_from_test']
    .min()
    .rename('first_offer_time')
    .reset_index()
)

def get_wave_ids(hours, first_offers=None):
    '''
    Return the ids of the customers whose first offer arrived at a given hour.

    input:
        hours - hours after start of the test to receive the first offer (0,168,336,408,504,576)
        first_offers - optional dataframe with 'customer_id' and 'first_offer_time'
                       columns; defaults to the module-level first_offer_time table
    output:
        ids - list of customer ids who got their first offers on this time
              (empty list when no customer matches)

    '''
    # Fall back to the notebook-level table so existing calls keep working
    if first_offers is None:
        first_offers = first_offer_time

    in_wave = first_offers['first_offer_time'] == hours
    ids = first_offers.loc[in_wave, 'customer_id'].tolist()
    return ids

# wave1_ids = get_wave_ids(0)
# wave2_ids = get_wave_ids(168)
# wave3_ids = get_wave_ids(336)
# wave4_ids = get_wave_ids(408)
# wave5_ids = get_wave_ids(504)
# wave6_ids = get_wave_ids(576)

# assert np.sum([len(wave1_ids), len(wave2_ids), len(wave3_ids), len(wave4_ids), len(wave5_ids), len(wave6_ids)]) + 6 == events.customer_id.nunique()

# Create a dataframe with events before customers received their first offer

In [69]:
# # Merge offers to events
# events_offers = pd.merge(events, offers, how='left', left_on='offer_id', right_on='id').drop('id',axis=1)
# events_offers.head()

In [70]:
# Hours after the start of the test at which customers received their first
# offer; each value identifies one "wave" of customers (see get_wave_ids)
time_of_waves = [0,168,336,408,504,576]

In [71]:
def create_pre_post_offer_events():

    '''
    Split the events table around each customer's first offer.

    Customers are grouped by the hour at which they received their first
    offer (the values in time_of_waves). For every group, events with
    hours_from_test < that hour go to the pre-offer table and events with
    hours_from_test >= that hour go to the post-offer table.

    input:
        None (reads the module-level events, time_of_waves and get_wave_ids)
    output:
        pre_offer_events - customer events before receiving the first offer
        post_offer_events - customer events after receiving the first offer
                            (including events at the same hour as the offer)
    raises:
        AssertionError - if either table contains unexpected event types, or
                         if the two tables together do not cover every row
                         of events
    '''

    pre_parts = []
    post_parts = []

    for hour in time_of_waves:
        # Select the wave once and reuse it for both sides of the split
        wave_events = events[events.customer_id.isin(get_wave_ids(hour))]
        pre_parts.append(wave_events[wave_events.hours_from_test < hour])
        post_parts.append(wave_events[wave_events.hours_from_test >= hour])

    # Concatenate once instead of growing a DataFrame inside the loop;
    # fall back to an empty slice of events if there are no waves at all
    pre_offer_events = pd.concat(pre_parts) if pre_parts else events.iloc[:0]
    post_offer_events = pd.concat(post_parts) if post_parts else events.iloc[:0]

    # check the quality of data wrangling (order-independent comparisons,
    # so the checks do not depend on which event type happens to appear first)
    pre_event_types = set(pre_offer_events.event.unique())
    post_event_types = set(post_offer_events.event.unique())
    assert pre_event_types == {'transaction'}, \
        f'unexpected pre-offer event types: {sorted(pre_event_types)}'
    assert post_event_types == {'offer received', 'offer viewed', 'transaction', 'offer completed'}, \
        f'unexpected post-offer event types: {sorted(post_event_types)}'
    assert len(pre_offer_events) + len(post_offer_events) == len(events), [len(pre_offer_events) + len(post_offer_events), len(events)]

    return pre_offer_events, post_offer_events

In [73]:
# Build the two event tables: events before each customer's first offer
# and events from the first offer onwards
pre_offer_events, post_offer_events = create_pre_post_offer_events()

In [75]:
# def get_customer_events(customer_id):

#     df = events_offers[events_offers.customer_id == customer_id][['customer_id','event','hours_from_test','amount_spent','reward_received','offer_type']]
#     return df

# get_customer_events(wave5_ids[0])

In [76]:
# Inspect the pre-offer events (per the quality check in
# create_pre_post_offer_events, these are all transactions)
pre_offer_events

Unnamed: 0,customer_id,event,hours_from_test,offer_id,amount_spent,reward_received
12659,54890f68699049c2a04d415abc25e717,transaction,0,,13.23,
12849,098d124614df4e4b862d678160b98638,transaction,0,,1.21,
12853,98b81a8ffdd14bb986d80673491171a7,transaction,0,,7.40,
12914,ad18ad432f5a40c7b8ccce4be4fb530c,transaction,0,,17.00,
12994,2372a5dd677842478b5318e4ea525969,transaction,0,,12.88,
...,...,...,...,...,...,...
239184,9a3f45cf29ef428b932492c7a5d6ac52,transaction,552,,18.64,
240339,c77659a28bf84aaa95b1ca4bbb2b56d0,transaction,558,,2.42,
240818,3cc622f76e464bc1809a71e871579d6c,transaction,558,,9.62,
242709,01fe5ec668f241608eb2f7ec374cb1b7,transaction,564,,20.61,
