In [1]:
#import necessary library
import numpy as np # linear algebra
import pandas as pd # data manipulation and analysis
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization
sns.set_style('whitegrid') # set style for visualization
import warnings # ignore warnings
warnings.filterwarnings('ignore')

from initial_report import *

In [2]:
#import customer data
df_customer=pd.read_csv("cleaned_customer_data.csv")
df_customer.head()

Unnamed: 0,customer_id,became_member_on,gender,age,income
0,0610b486422d4921ae7d2bf64640c50b,2017-07-15,F,55,112000.0
1,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,F,75,100000.0
2,e2127556f4f64592b11af22de27a7932,2018-04-26,M,68,70000.0
3,389bc3fa690240e798340f5a15918d5c,2018-02-09,M,65,53000.0
4,2eeac8d8feae4a8cad5a6af0499a211d,2017-11-11,M,58,51000.0


In [3]:
#get initial report
initial_report(df_customer)

 *** DATA CLEANING CHECKLIST ***
----------------------------------------
*** Structure:
- Total Rows: 14825
- Total Columns: 5
- Column Names: ['customer_id', 'became_member_on', 'gender', 'age', 'income']

*** Data Types:
  customer_id: object
  became_member_on: object
  gender: object
  age: int64
  income: float64

*** Mixed Data Types:

*** Distinct Values per Column:
  customer_id: 14825
  became_member_on: 1707
  gender: 3
  age: 84
  income: 91

*** Null Values and Percentages:


*** Duplicates: 0

*** Negative or Zero Values:

*** Basic Statistics:
                age         income
count  14825.000000   14825.000000
mean      54.393524   65404.991568
std       17.383705   21598.299410
min       18.000000   30000.000000
25%       42.000000   49000.000000
50%       55.000000   64000.000000
75%       66.000000   80000.000000
max      101.000000  120000.000000

*** Category Description:
                             customer_id became_member_on gender
count                       

Problems:
1. became member on column is object type- [change to datetype]

In [4]:
# Convert 'became_member_on' to datetime
df_customer['became_member_on'] = pd.to_datetime(df_customer['became_member_on'])
print(df_customer['became_member_on'].dtype)

datetime64[ns]


In [5]:
# Create age group column
df_customer['age_group'] = np.select(
    [
        (df_customer['age'] >= 18) & (df_customer['age'] <= 34),
        (df_customer['age'] >= 35) & (df_customer['age'] <= 49),
        (df_customer['age'] >= 50) & (df_customer['age'] <= 64),
        (df_customer['age'] >= 65) & (df_customer['age'] <= 79),
        (df_customer['age'] >= 80) & (df_customer['age'] <= 110)
    ],
    [
        'Young Adult',
        'Middle Age Adult',
        'Older Adult',
        'Senior',
        'Elderly'
    ],
    default='Unknown'
)

#print
df_customer.head()

Unnamed: 0,customer_id,became_member_on,gender,age,income,age_group
0,0610b486422d4921ae7d2bf64640c50b,2017-07-15,F,55,112000.0,Older Adult
1,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,F,75,100000.0,Senior
2,e2127556f4f64592b11af22de27a7932,2018-04-26,M,68,70000.0,Senior
3,389bc3fa690240e798340f5a15918d5c,2018-02-09,M,65,53000.0,Senior
4,2eeac8d8feae4a8cad5a6af0499a211d,2017-11-11,M,58,51000.0,Older Adult


In [6]:
#get value counts for age group
df_customer.age_group.value_counts()

age_group
Older Adult         5150
Senior              3164
Middle Age Adult    3153
Young Adult         2256
Elderly             1102
Name: count, dtype: int64

In [7]:
#create income group

bins = [0, 44000, 84000, float('inf')]
labels = ['Low Income', 'Middle Income', 'High Income']

df_customer['income_group'] = pd.cut(df_customer['income'], bins=bins, labels=labels, right=True)

#check dataframe
df_customer.head()

Unnamed: 0,customer_id,became_member_on,gender,age,income,age_group,income_group
0,0610b486422d4921ae7d2bf64640c50b,2017-07-15,F,55,112000.0,Older Adult,High Income
1,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,F,75,100000.0,Senior,High Income
2,e2127556f4f64592b11af22de27a7932,2018-04-26,M,68,70000.0,Senior,Middle Income
3,389bc3fa690240e798340f5a15918d5c,2018-02-09,M,65,53000.0,Senior,Middle Income
4,2eeac8d8feae4a8cad5a6af0499a211d,2017-11-11,M,58,51000.0,Older Adult,Middle Income


In [8]:
#get value counts for income group
df_customer.income_group.value_counts()

income_group
Middle Income    8941
High Income      3015
Low Income       2869
Name: count, dtype: int64

In [9]:
#get gender percentage
gender_percent = (
    df_customer.groupby('gender').size() / len(df_customer) * 100
)
gender_percent=round(gender_percent,2)
print(gender_percent)


gender
F    41.34
M    57.23
O     1.43
dtype: float64


In [10]:
#create group for membership year
df_customer['membership_year'] = df_customer['became_member_on'].astype(str).str[:4].astype(int)
df_customer.head()

Unnamed: 0,customer_id,became_member_on,gender,age,income,age_group,income_group,membership_year
0,0610b486422d4921ae7d2bf64640c50b,2017-07-15,F,55,112000.0,Older Adult,High Income,2017
1,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,F,75,100000.0,Senior,High Income,2017
2,e2127556f4f64592b11af22de27a7932,2018-04-26,M,68,70000.0,Senior,Middle Income,2018
3,389bc3fa690240e798340f5a15918d5c,2018-02-09,M,65,53000.0,Senior,Middle Income,2018
4,2eeac8d8feae4a8cad5a6af0499a211d,2017-11-11,M,58,51000.0,Older Adult,Middle Income,2017


In [11]:
#get value counts for membership year
df_customer.membership_year.value_counts()

membership_year
2017    5599
2018    3669
2016    3024
2015    1597
2014     662
2013     274
Name: count, dtype: int64

In [12]:
#drop unnecessary columns
df_customer_bins = df_customer.drop(columns=['became_member_on', 'age','income'])
df_customer_bins.head()

Unnamed: 0,customer_id,gender,age_group,income_group,membership_year
0,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017
1,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017
2,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018
3,389bc3fa690240e798340f5a15918d5c,M,Senior,Middle Income,2018
4,2eeac8d8feae4a8cad5a6af0499a211d,M,Older Adult,Middle Income,2017


In [13]:
#import event data
df_event=pd.read_csv("cleaned_events.csv")
df_event.head()

Unnamed: 0,customer_id,event,time,offer_id,amount,reward
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,,
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,,
2,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,,
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,fafdcd668e3743c1bb461111dcafc2a4,,
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,4d5c57ea9a6940dd891ad53e9dbe8da0,,


In [14]:
#merge df_customer_bins and df_event
df_customer_events = pd.merge(df_customer_bins, df_event, on='customer_id', how='left')
df_customer_events.head()

Unnamed: 0,customer_id,gender,age_group,income_group,membership_year,event,time,offer_id,amount,reward
0,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,18,,21.51,
1,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,144,,32.28,
2,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,offer received,408,9b98b8c7a33c4b65b9aebfe6a799e6d9,,
3,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,offer received,504,3f207df678b143eea3cee63160fa8bed,,
4,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,528,,23.22,


In [15]:
initial_report(df_customer_events)

 *** DATA CLEANING CHECKLIST ***
----------------------------------------
*** Structure:
- Total Rows: 272388
- Total Columns: 10
- Column Names: ['customer_id', 'gender', 'age_group', 'income_group', 'membership_year', 'event', 'time', 'offer_id', 'amount', 'reward']

*** Data Types:
  customer_id: object
  gender: object
  age_group: object
  income_group: category
  membership_year: int64
  event: object
  time: int64
  offer_id: object
  amount: float64
  reward: float64

*** Mixed Data Types:
  offer_id:
    - str: 148431
    - float: 123957

*** Distinct Values per Column:
  customer_id: 14825
  gender: 3
  age_group: 5
  income_group: 3
  membership_year: 6
  event: 4
  time: 120
  offer_id: 10
  amount: 5036
  reward: 4

*** Null Values and Percentages:
  offer_id: Missing Values: 123957, Pct: 45.508%
  amount: Missing Values: 148431, Pct: 54.492%
  reward: Missing Values: 240318, Pct: 88.226%


*** Duplicates: 0

*** Negative or Zero Values:
  time: 13545

*** Basic Statistics

In [16]:
df_customer_events.event.value_counts()

event
transaction        123957
offer received      66501
offer viewed        49860
offer completed     32070
Name: count, dtype: int64

In [17]:
#percentage of offer_viewed and completed

# Calculate the value counts and get the total sum
event_counts = df_customer_events['event'].value_counts()

# Get the count for 'offer completed' and 'offer viewed'
offer_received_count = event_counts.get('offer received', 0)
offer_completed_count = event_counts.get('offer completed', 0)
offer_viewed_count = event_counts.get('offer viewed', 0)

# Calculate the percentages
offer_completed_percentage = (offer_completed_count / offer_received_count) * 100
offer_viewed_percentage = (offer_viewed_count / offer_received_count) * 100
offer_completed_after_view=(offer_completed_count / offer_viewed_count) * 100

# Print the results
print(f"Offer Viewed after offer received: {offer_viewed_percentage:.2f}%")
print(f"Offer Completed after offer received: {offer_completed_percentage:.2f}%")
print(f"Offer Completed after offer viewed: {offer_completed_after_view:.2f}%")


Offer Viewed after offer received: 74.98%
Offer Completed after offer received: 48.22%
Offer Completed after offer viewed: 64.32%


In [18]:
#total_transaction_amount
total_transaction_amount = df_customer_events['amount'].sum()
print(total_transaction_amount)

1734942.4


In [19]:
# Average transaction_amount
avg_transaction_amount = df_customer_events['amount'].mean()
print(avg_transaction_amount)


13.996324531894123


In [20]:
#get average spend per customer
distinct_customers_count = df_customer_events['customer_id'].nunique()
#get average
avg_spend_per_customer=total_transaction_amount/distinct_customers_count
print(avg_spend_per_customer)


117.02815514333895


In [21]:
#total spend based on age group
total_spend_age_group=df_customer_events.groupby("age_group")["amount"].sum().sort_values(ascending=False)
print(total_spend_age_group)

age_group
Older Adult         652728.13
Senior              401065.92
Middle Age Adult    343178.68
Young Adult         190943.30
Elderly             147026.37
Name: amount, dtype: float64


In [22]:
#average spend by age group
age_group_count= df_customer_events.groupby("age_group")["customer_id"].nunique()
avg_spend_age_group=total_spend_age_group/age_group_count.sort_values(ascending=False)
print(avg_spend_age_group)

age_group
Older Adult         126.743326
Senior              126.759140
Middle Age Adult    108.841954
Young Adult          84.637988
Elderly             133.417759
dtype: float64


In [23]:
#no. of transaction based on age_group
df_transaction=df_customer_events[df_customer_events.event=="transaction"]
number_of_trans_age_group=df_transaction.groupby("age_group")["event"].size().sort_values(ascending=False)
print(number_of_trans_age_group)

age_group
Older Adult         39988
Middle Age Adult    27618
Senior              24565
Young Adult         23183
Elderly              8603
Name: event, dtype: int64


In [24]:
#average transaction amount by age group
avg_transaction_amount_age_group=df_customer_events.groupby("age_group")["amount"].mean()
print(avg_transaction_amount_age_group)

age_group
Elderly             17.090128
Middle Age Adult    12.425906
Older Adult         16.323100
Senior              16.326722
Young Adult          8.236350
Name: amount, dtype: float64


In [25]:
#total spend based on membership year
total_spend_mem_year=df_customer_events.groupby("membership_year")["amount"].sum().sort_values(ascending=False)
print(total_spend_mem_year)

membership_year
2017    658287.36
2016    519551.67
2015    245725.55
2018    232690.24
2014     55311.60
2013     23375.98
Name: amount, dtype: float64


In [26]:
#total transaction based on income_group
total_spend_income_group=df_customer_events.groupby("income_group")["amount"].sum().sort_values(ascending=False)
print(total_spend_income_group)

income_group
Middle Income    1003664.00
High Income       547875.48
Low Income        183402.92
Name: amount, dtype: float64


In [27]:
#average transaciton amount by income group
income_group_count= df_customer_events.groupby("income_group")["customer_id"].nunique()
avg_spend_income_group=total_spend_income_group/income_group_count.sort_values(ascending=False)
print(avg_spend_income_group)

income_group
Middle Income    112.254110
High Income      181.716577
Low Income        63.925730
dtype: float64


In [28]:
#no. of transaction based on income_group
number_of_trans_income_group=df_transaction.groupby("income_group")["event"].size().sort_values(ascending=False)
print(number_of_trans_income_group)

income_group
Middle Income    76169
Low Income       28905
High Income      18883
Name: event, dtype: int64


In [29]:
#average transaction value by income group
avg_transaction_value_income_group=df_customer_events.groupby("income_group")["amount"].mean()
print(avg_transaction_value_income_group)

income_group
Low Income        6.345024
Middle Income    13.176804
High Income      29.014218
Name: amount, dtype: float64


In [30]:
#total spending by gender
total_transaction_by_gender=df_customer_events.groupby("gender")["amount"].sum()
print(total_transaction_by_gender)

gender
F    863695.00
M    844890.86
O     26356.54
Name: amount, dtype: float64


In [31]:
#average transaciton amount by gender
gender_count= df_customer_events.groupby("gender")["customer_id"].nunique()
avg_trans_gender=total_transaction_by_gender/gender_count.sort_values(ascending=False)
print(avg_trans_gender)

gender
F    140.919400
M     99.586381
O    124.323302
dtype: float64


In [32]:
#no. of transaction based on gender
number_of_trans_gender=df_transaction.groupby("gender")["event"].size().sort_values(ascending=False)
print(number_of_trans_gender)

gender
M    72794
F    49382
O     1781
Name: event, dtype: int64


In [33]:
#average transaction value by gender
avg_transaction_value_gender=df_customer_events.groupby("gender")["amount"].mean()
print(avg_transaction_value_gender)

gender
F    17.490077
M    11.606600
O    14.798731
Name: amount, dtype: float64


In [34]:
df_customer_events.head()

Unnamed: 0,customer_id,gender,age_group,income_group,membership_year,event,time,offer_id,amount,reward
0,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,18,,21.51,
1,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,144,,32.28,
2,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,offer received,408,9b98b8c7a33c4b65b9aebfe6a799e6d9,,
3,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,offer received,504,3f207df678b143eea3cee63160fa8bed,,
4,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,528,,23.22,


In [35]:
#sort by customer_id and time

df_sorted = df_customer_events.sort_values(by=["customer_id","offer_id", "time"], ascending=[True, True, True])

# Reset index for cleanliness
df_sorted = df_sorted.reset_index(drop=True)

df_sorted.head()


Unnamed: 0,customer_id,gender,age_group,income_group,membership_year,event,time,offer_id,amount,reward
0,0009655768c64bdeb2e877511632db8f,M,Young Adult,Middle Income,2017,offer received,576,2906b810c7d4411798c6938adc9daaa5,,
1,0009655768c64bdeb2e877511632db8f,M,Young Adult,Middle Income,2017,offer completed,576,2906b810c7d4411798c6938adc9daaa5,,2.0
2,0009655768c64bdeb2e877511632db8f,M,Young Adult,Middle Income,2017,offer received,336,3f207df678b143eea3cee63160fa8bed,,
3,0009655768c64bdeb2e877511632db8f,M,Young Adult,Middle Income,2017,offer viewed,372,3f207df678b143eea3cee63160fa8bed,,
4,0009655768c64bdeb2e877511632db8f,M,Young Adult,Middle Income,2017,offer received,168,5a8bc65990b245e5a138643cd4eb9837,,


In [36]:
#check status from one customer_id
df_sorted[df_sorted.customer_id=="e2127556f4f64592b11af22de27a7932"]

Unnamed: 0,customer_id,gender,age_group,income_group,membership_year,event,time,offer_id,amount,reward
240532,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018,offer received,0,2906b810c7d4411798c6938adc9daaa5,,
240533,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018,offer viewed,18,2906b810c7d4411798c6938adc9daaa5,,
240534,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018,offer received,336,3f207df678b143eea3cee63160fa8bed,,
240535,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018,offer received,408,9b98b8c7a33c4b65b9aebfe6a799e6d9,,
240536,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018,offer viewed,420,9b98b8c7a33c4b65b9aebfe6a799e6d9,,
240537,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018,offer completed,522,9b98b8c7a33c4b65b9aebfe6a799e6d9,,5.0
240538,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018,offer received,504,fafdcd668e3743c1bb461111dcafc2a4,,
240539,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018,offer viewed,522,fafdcd668e3743c1bb461111dcafc2a4,,
240540,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018,offer completed,522,fafdcd668e3743c1bb461111dcafc2a4,,2.0
240541,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018,transaction,288,,17.88,


We can see that at time 522, one transaction happen and two offer_completed. So, it not possible to associate for what offer, the transaction was actually made. One person can receive multiple offers without even vieweing or completing the first offer. Transactions are not associated with offer_id. For a one single time, one transaction and multiple offer completion can occur. Which actually made the dataset complex to link any offer 
completion with transaction.

In [37]:
#get df_received
df_received=df_sorted[df_sorted.event=="offer received"]
df_received.rename(columns={'time': 'rcv_time'}, inplace=True)
df_received = df_received[['customer_id', 'rcv_time', 'offer_id']]
print(df_received.shape)
df_received.head()

(66501, 3)


Unnamed: 0,customer_id,rcv_time,offer_id
0,0009655768c64bdeb2e877511632db8f,576,2906b810c7d4411798c6938adc9daaa5
2,0009655768c64bdeb2e877511632db8f,336,3f207df678b143eea3cee63160fa8bed
4,0009655768c64bdeb2e877511632db8f,168,5a8bc65990b245e5a138643cd4eb9837
6,0009655768c64bdeb2e877511632db8f,408,f19421c1d4aa40978ebb69ca19b0e20d
9,0009655768c64bdeb2e877511632db8f,504,fafdcd668e3743c1bb461111dcafc2a4


In [38]:
#get df_viewed
df_viewed=df_sorted[df_sorted.event=="offer viewed"]
df_viewed.rename(columns={'time': 'view_time'}, inplace=True)
df_viewed = df_viewed[['customer_id', 'view_time', 'offer_id']]
print(df_viewed.shape)
df_viewed.head()

(49860, 3)


Unnamed: 0,customer_id,view_time,offer_id
3,0009655768c64bdeb2e877511632db8f,372,3f207df678b143eea3cee63160fa8bed
5,0009655768c64bdeb2e877511632db8f,192,5a8bc65990b245e5a138643cd4eb9837
8,0009655768c64bdeb2e877511632db8f,456,f19421c1d4aa40978ebb69ca19b0e20d
11,0009655768c64bdeb2e877511632db8f,540,fafdcd668e3743c1bb461111dcafc2a4
21,0011e0d4e6b944f998e987f904e8c1e5,432,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [39]:
#merge df_received and viewed
df_rcv_view=pd.merge(df_received,df_viewed, on=["customer_id", "offer_id"],how="left")
print(df_rcv_view.shape)
print(df_rcv_view.duplicated().sum()) 
df_rcv_view.head()

(82834, 4)
0


Unnamed: 0,customer_id,rcv_time,offer_id,view_time
0,0009655768c64bdeb2e877511632db8f,576,2906b810c7d4411798c6938adc9daaa5,
1,0009655768c64bdeb2e877511632db8f,336,3f207df678b143eea3cee63160fa8bed,372.0
2,0009655768c64bdeb2e877511632db8f,168,5a8bc65990b245e5a138643cd4eb9837,192.0
3,0009655768c64bdeb2e877511632db8f,408,f19421c1d4aa40978ebb69ca19b0e20d,456.0
4,0009655768c64bdeb2e877511632db8f,504,fafdcd668e3743c1bb461111dcafc2a4,540.0


In [40]:
# can one customer can rcv and view same offer at multiple times?
df_rcv_view[df_rcv_view.duplicated(subset=['customer_id', 'offer_id'], keep=False)]

Unnamed: 0,customer_id,rcv_time,offer_id,view_time
13,0020c2b971eb4e9188eac86d93036a77,0,fafdcd668e3743c1bb461111dcafc2a4,12.0
14,0020c2b971eb4e9188eac86d93036a77,336,fafdcd668e3743c1bb461111dcafc2a4,12.0
22,003d66b6608740288d6cc97a6903f4f0,168,fafdcd668e3743c1bb461111dcafc2a4,300.0
23,003d66b6608740288d6cc97a6903f4f0,168,fafdcd668e3743c1bb461111dcafc2a4,420.0
24,003d66b6608740288d6cc97a6903f4f0,408,fafdcd668e3743c1bb461111dcafc2a4,300.0
...,...,...,...,...
82827,ffff82501cea40309d5fdd7edcca4a07,408,2906b810c7d4411798c6938adc9daaa5,414.0
82828,ffff82501cea40309d5fdd7edcca4a07,408,2906b810c7d4411798c6938adc9daaa5,582.0
82829,ffff82501cea40309d5fdd7edcca4a07,576,2906b810c7d4411798c6938adc9daaa5,354.0
82830,ffff82501cea40309d5fdd7edcca4a07,576,2906b810c7d4411798c6938adc9daaa5,414.0


One customer id can receive and view same offers at multiple times

In [41]:
#get df_completed
df_completed=df_sorted[df_sorted.event=="offer completed"]
df_completed.rename(columns={'time': 'comp_time'}, inplace=True)
df_completed = df_completed[['customer_id', 'comp_time', 'offer_id']]
print(df_completed.shape)
df_completed.head()

(32070, 3)


Unnamed: 0,customer_id,comp_time,offer_id
1,0009655768c64bdeb2e877511632db8f,576,2906b810c7d4411798c6938adc9daaa5
7,0009655768c64bdeb2e877511632db8f,414,f19421c1d4aa40978ebb69ca19b0e20d
10,0009655768c64bdeb2e877511632db8f,528,fafdcd668e3743c1bb461111dcafc2a4
22,0011e0d4e6b944f998e987f904e8c1e5,576,0b1e1539f2cc45b7b9fa7c272da2e1d7
25,0011e0d4e6b944f998e987f904e8c1e5,252,2298d6c36e964ae4a3e7e9706d1fb8c2


In [42]:
#merge df_rcv_view and completed
df_time=pd.merge(df_rcv_view,df_completed, on=["customer_id", "offer_id"],how="left")
print(df_time.shape)
print(df_time.duplicated().sum()) 
df_time.head()

(101138, 5)
0


Unnamed: 0,customer_id,rcv_time,offer_id,view_time,comp_time
0,0009655768c64bdeb2e877511632db8f,576,2906b810c7d4411798c6938adc9daaa5,,576.0
1,0009655768c64bdeb2e877511632db8f,336,3f207df678b143eea3cee63160fa8bed,372.0,
2,0009655768c64bdeb2e877511632db8f,168,5a8bc65990b245e5a138643cd4eb9837,192.0,
3,0009655768c64bdeb2e877511632db8f,408,f19421c1d4aa40978ebb69ca19b0e20d,456.0,414.0
4,0009655768c64bdeb2e877511632db8f,504,fafdcd668e3743c1bb461111dcafc2a4,540.0,528.0


 A customer can complete an offer without viewing it. 

In [43]:
#avg diff between rcv_time and view time
average_difference_view = (df_time['view_time'] - df_time['rcv_time']).mean()
print(average_difference_view)

23.99345533030855


In [44]:
#avg diff between view_time and completed time
average_difference_comp = (df_time['comp_time'] - df_time['view_time']).mean()
print(average_difference_comp)

36.219029650528796


In [45]:
#avg diff between rcv_time and completed time
avg_diff_rcv_comp = (df_time['comp_time'] - df_time['rcv_time']).mean()
print(avg_diff_rcv_comp)

60.95792676198433


In [46]:
#import offer data
df_offer=pd.read_csv("cleaned_offers.csv")
df_offer.head()

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,channels
0,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10,10,7,"['email', 'mobile', 'social']"
1,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,10,5,"['web', 'email', 'mobile', 'social']"
2,3f207df678b143eea3cee63160fa8bed,informational,0,0,4,"['web', 'email', 'mobile']"
3,9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5,5,7,"['web', 'email', 'mobile']"
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,"['web', 'email']"


In [47]:
#merge df_time and df_offer to add duration
df_time_duration=pd.merge(df_time,df_offer, on="offer_id",how="left")
df_time_duration = df_time_duration[['customer_id', "offer_id","rcv_time","view_time", 'comp_time', 'duration']]
print(df_time_duration.shape)
print(df_time_duration.duplicated().sum()) 
df_time_duration.head()

(101138, 6)
0


Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time,duration
0,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,7
1,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,336,372.0,,4
2,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,168,192.0,,3
3,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,408,456.0,414.0,5
4,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,504,540.0,528.0,10


In [48]:
#convert duration from day to hours
df_time_duration.duration=df_time_duration["duration"]*24
df_time_duration.head()

Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time,duration
0,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,168
1,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,336,372.0,,96
2,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,168,192.0,,72
3,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,408,456.0,414.0,120
4,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,504,540.0,528.0,240


In [49]:
#a valid offer completion time should be lower or equal than rcv_time+duration
df_time_duration["valid_comp_time"]=df_time_duration.rcv_time+df_time_duration.duration
df_time_duration.head()

Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time,duration,valid_comp_time
0,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,168,744
1,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,336,372.0,,96,432
2,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,168,192.0,,72,240
3,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,408,456.0,414.0,120,528
4,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,504,540.0,528.0,240,744


In [50]:
#get valid status column
df_time_duration['status'] = np.where(
    df_time_duration['comp_time'] <= df_time_duration['valid_comp_time'],
    'valid',
    'invalid'
)
df_time_duration.head()

Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time,duration,valid_comp_time,status
0,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,168,744,valid
1,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,336,372.0,,96,432,invalid
2,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,168,192.0,,72,240,invalid
3,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,408,456.0,414.0,120,528,valid
4,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,504,540.0,528.0,240,744,valid


In [51]:
#check status from one customer_id
df_time_duration[df_time_duration.customer_id=="e2127556f4f64592b11af22de27a7932"]

Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time,duration,valid_comp_time,status
89253,e2127556f4f64592b11af22de27a7932,2906b810c7d4411798c6938adc9daaa5,0,18.0,,168,168,invalid
89254,e2127556f4f64592b11af22de27a7932,3f207df678b143eea3cee63160fa8bed,336,,,96,432,invalid
89255,e2127556f4f64592b11af22de27a7932,9b98b8c7a33c4b65b9aebfe6a799e6d9,408,420.0,522.0,168,576,valid
89256,e2127556f4f64592b11af22de27a7932,fafdcd668e3743c1bb461111dcafc2a4,504,522.0,522.0,240,744,valid


In [52]:
#get valid data only
df_time_valid=df_time_duration[df_time_duration.status=="valid"]
print(df_time_valid.shape)
df_time_valid.head()

(53969, 8)


Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time,duration,valid_comp_time,status
0,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,168,744,valid
3,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,408,456.0,414.0,120,528,valid
4,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,504,540.0,528.0,240,744,valid
5,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,408,432.0,576.0,240,648,valid
6,0011e0d4e6b944f998e987f904e8c1e5,2298d6c36e964ae4a3e7e9706d1fb8c2,168,186.0,252.0,168,336,valid


In [53]:
#delete unnecessary columns
df_time_valid = df_time_valid[['customer_id', "offer_id","rcv_time","view_time", 'comp_time']]
print(df_time_valid.shape)
df_time_valid.head()

(53969, 5)


Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time
0,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0
3,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,408,456.0,414.0
4,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,504,540.0,528.0
5,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,408,432.0,576.0
6,0011e0d4e6b944f998e987f904e8c1e5,2298d6c36e964ae4a3e7e9706d1fb8c2,168,186.0,252.0


total offer_completion was 32070, however, we got 53969 rows. That means one offer_completion time got attached to multiple rcv_time and view_time

In [54]:
#add df_transaction to apply 2nd logic [the valid offer_completion will be when transaction time and completed time is same]
#get df_transaction
df_transactions=df_sorted[df_sorted.event=="transaction"]
df_transactions.rename(columns={'time': 'TT_time'}, inplace=True)
df_transactions = df_transactions[['customer_id', 'TT_time']]
print(df_transactions.shape)
df_transactions.head()

(123957, 2)


Unnamed: 0,customer_id,TT_time
12,0009655768c64bdeb2e877511632db8f,228
13,0009655768c64bdeb2e877511632db8f,414
14,0009655768c64bdeb2e877511632db8f,528
15,0009655768c64bdeb2e877511632db8f,552
16,0009655768c64bdeb2e877511632db8f,576


In [55]:
#merge df_transactions and df_time_valid
df_time_TT=pd.merge(df_time_valid,df_transactions, on="customer_id",how="left")
print(df_time_TT.shape)
print(df_time_TT.duplicated().sum()) 
df_time_TT.head()

(555248, 6)
0


Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time,TT_time
0,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,228
1,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,414
2,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,528
3,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,552
4,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,576


In [56]:
#get valid status column
df_time_TT['status'] = np.where(
    df_time_TT['comp_time'] == df_time_TT['TT_time'],
    'valid',
    'invalid'
)
df_time_TT.head()

Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time,TT_time,status
0,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,228,invalid
1,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,414,invalid
2,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,528,invalid
3,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,552,invalid
4,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,576,valid


In [57]:
#get valid data only
df_final_offers=df_time_TT[df_time_TT.status=="valid"]
print(df_final_offers.shape)
print(df_final_offers.duplicated().sum())
df_final_offers.head()

(53969, 7)
0


Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time,TT_time,status
4,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,576,valid
9,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,408,456.0,414.0,414,valid
18,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,504,540.0,528.0,528,valid
26,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,408,432.0,576.0,576,valid
30,0011e0d4e6b944f998e987f904e8c1e5,2298d6c36e964ae4a3e7e9706d1fb8c2,168,186.0,252.0,252,valid


In [58]:
#check status from one customer_id
df_final_offers[df_final_offers.customer_id=="e2127556f4f64592b11af22de27a7932"]

Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time,TT_time,status
486268,e2127556f4f64592b11af22de27a7932,9b98b8c7a33c4b65b9aebfe6a799e6d9,408,420.0,522.0,522,valid
486271,e2127556f4f64592b11af22de27a7932,fafdcd668e3743c1bb461111dcafc2a4,504,522.0,522.0,522,valid


There was only one transaction at time 522. However, it became associated with two offer_id which can not happen.

In [68]:
df_final_offers[df_final_offers['comp_time'].duplicated(keep=False)]


Unnamed: 0,customer_id,offer_id,rcv_time,view_time,comp_time,TT_time,status
4,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0,576,valid
9,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,408,456.0,414.0,414,valid
18,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,504,540.0,528.0,528,valid
26,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,408,432.0,576.0,576,valid
30,0011e0d4e6b944f998e987f904e8c1e5,2298d6c36e964ae4a3e7e9706d1fb8c2,168,186.0,252.0,252,valid
...,...,...,...,...,...,...,...
555181,ffff82501cea40309d5fdd7edcca4a07,2906b810c7d4411798c6938adc9daaa5,576,582.0,384.0,384,valid
555197,ffff82501cea40309d5fdd7edcca4a07,2906b810c7d4411798c6938adc9daaa5,576,582.0,414.0,414,valid
555215,ffff82501cea40309d5fdd7edcca4a07,2906b810c7d4411798c6938adc9daaa5,576,582.0,576.0,576,valid
555229,ffff82501cea40309d5fdd7edcca4a07,9b98b8c7a33c4b65b9aebfe6a799e6d9,504,534.0,504.0,504,valid


In [32]:
#get rows where transaction time and completed time is same
matching_pairs = pd.merge(
    df_transactions[['customer_id', 'time']].drop_duplicates(),
    df_completed[['customer_id', 'time']].drop_duplicates(),
    on=['customer_id', 'time'],
    how='inner'
)
matching_pairs

Unnamed: 0,customer_id,time
0,0610b486422d4921ae7d2bf64640c50b,528
1,78afa995795e4d85b5d9ceeca43f5fef,132
2,78afa995795e4d85b5d9ceeca43f5fef,510
3,e2127556f4f64592b11af22de27a7932,522
4,389bc3fa690240e798340f5a15918d5c,60
...,...,...
29576,9dc1421481194dcd9400aec7c9ae6366,360
29577,9dc1421481194dcd9400aec7c9ae6366,414
29578,9dc1421481194dcd9400aec7c9ae6366,594
29579,e4052622e5ba45a8b96b59aba68cf068,54


In [33]:
#filter df_customer_events
df_matching_rows = pd.merge(df_customer_events, matching_pairs, on=['customer_id', 'time'], how='inner')
df_matching_rows

Unnamed: 0,customer_id,gender,age_group,income_group,membership_year,event,time,offer_id,amount,reward
0,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,528,,23.22,
1,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,offer completed,528,9b98b8c7a33c4b65b9aebfe6a799e6d9,,5.0
2,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,transaction,132,,19.89,
3,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,offer completed,132,9b98b8c7a33c4b65b9aebfe6a799e6d9,,5.0
4,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,transaction,510,,21.72,
...,...,...,...,...,...,...,...,...,...,...
66770,9dc1421481194dcd9400aec7c9ae6366,F,Elderly,Middle Income,2016,offer completed,594,ae264e3637204a6fb9bb56bc8210ddfd,,10.0
66771,e4052622e5ba45a8b96b59aba68cf068,F,Older Adult,Middle Income,2017,transaction,54,,21.55,
66772,e4052622e5ba45a8b96b59aba68cf068,F,Older Adult,Middle Income,2017,offer completed,54,2298d6c36e964ae4a3e7e9706d1fb8c2,,3.0
66773,e4052622e5ba45a8b96b59aba68cf068,F,Older Adult,Middle Income,2017,transaction,480,,30.57,


In [34]:
#get only the transaction and completed event
df_final=df_matching_rows[df_matching_rows.event.isin(["transaction","offer completed"])]
df_final

Unnamed: 0,customer_id,gender,age_group,income_group,membership_year,event,time,offer_id,amount,reward
0,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,528,,23.22,
1,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,offer completed,528,9b98b8c7a33c4b65b9aebfe6a799e6d9,,5.0
2,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,transaction,132,,19.89,
3,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,offer completed,132,9b98b8c7a33c4b65b9aebfe6a799e6d9,,5.0
4,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,transaction,510,,21.72,
...,...,...,...,...,...,...,...,...,...,...
66770,9dc1421481194dcd9400aec7c9ae6366,F,Elderly,Middle Income,2016,offer completed,594,ae264e3637204a6fb9bb56bc8210ddfd,,10.0
66771,e4052622e5ba45a8b96b59aba68cf068,F,Older Adult,Middle Income,2017,transaction,54,,21.55,
66772,e4052622e5ba45a8b96b59aba68cf068,F,Older Adult,Middle Income,2017,offer completed,54,2298d6c36e964ae4a3e7e9706d1fb8c2,,3.0
66773,e4052622e5ba45a8b96b59aba68cf068,F,Older Adult,Middle Income,2017,transaction,480,,30.57,


In [35]:
#how many transaction from offers
df_final.event.value_counts()

event
offer completed    32070
transaction        29581
Name: count, dtype: int64

In [36]:
#total spend for transaction
df_final["amount"].sum()

np.float64(606355.75)

In [37]:
#total reward 
df_final["reward"].sum()

np.float64(158630.0)

In [38]:
#import offer data
df_offer=pd.read_csv("cleaned_offers.csv")
df_offer.head()

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,channels
0,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10,10,7,"['email', 'mobile', 'social']"
1,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,10,5,"['web', 'email', 'mobile', 'social']"
2,3f207df678b143eea3cee63160fa8bed,informational,0,0,4,"['web', 'email', 'mobile']"
3,9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5,5,7,"['web', 'email', 'mobile']"
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,"['web', 'email']"


In [39]:
# Convert the numerical columns to strings before concatenating
df_offer['difficulty'] = df_offer['difficulty'].astype(str)
df_offer['reward'] = df_offer['reward'].astype(str)
df_offer['duration'] = df_offer['duration'].astype(str)

# Create the new column by concatenating the strings with hyphens
df_offer['offer_key'] = (
    df_offer['offer_type'] + '-' + 
    df_offer['difficulty'] + '-' + 
    df_offer['reward'] + '-' + 
    df_offer['duration']
)

# Display the DataFrame with the new column
print("DataFrame with the new 'offer_key' column:")
print(df_offer[['offer_id', 'offer_key']].head())

DataFrame with the new 'offer_key' column:
                           offer_id            offer_key
0  ae264e3637204a6fb9bb56bc8210ddfd         bogo-10-10-7
1  4d5c57ea9a6940dd891ad53e9dbe8da0         bogo-10-10-5
2  3f207df678b143eea3cee63160fa8bed  informational-0-0-4
3  9b98b8c7a33c4b65b9aebfe6a799e6d9           bogo-5-5-7
4  0b1e1539f2cc45b7b9fa7c272da2e1d7     discount-20-5-10


In [40]:
#load df_offer
df_offer.head()

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,channels,offer_key
0,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10,10,7,"['email', 'mobile', 'social']",bogo-10-10-7
1,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,10,5,"['web', 'email', 'mobile', 'social']",bogo-10-10-5
2,3f207df678b143eea3cee63160fa8bed,informational,0,0,4,"['web', 'email', 'mobile']",informational-0-0-4
3,9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5,5,7,"['web', 'email', 'mobile']",bogo-5-5-7
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,"['web', 'email']",discount-20-5-10


In [41]:
#merge df_final and df_offer
#merge df_customer and df_event
df_final_offers = pd.merge(df_final, df_offer, on='offer_id', how='left')
df_final_offers.head()

Unnamed: 0,customer_id,gender,age_group,income_group,membership_year,event,time,offer_id,amount,reward_x,offer_type,difficulty,reward_y,duration,channels,offer_key
0,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,528,,23.22,,,,,,,
1,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,offer completed,528,9b98b8c7a33c4b65b9aebfe6a799e6d9,,5.0,bogo,5.0,5.0,7.0,"['web', 'email', 'mobile']",bogo-5-5-7
2,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,transaction,132,,19.89,,,,,,,
3,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,offer completed,132,9b98b8c7a33c4b65b9aebfe6a799e6d9,,5.0,bogo,5.0,5.0,7.0,"['web', 'email', 'mobile']",bogo-5-5-7
4,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,transaction,510,,21.72,,,,,,,


In [42]:
#initial_report of df_final_offers
initial_report(df_final_offers)

 *** DATA CLEANING CHECKLIST ***
----------------------------------------
*** Structure:
- Total Rows: 61651
- Total Columns: 16
- Column Names: ['customer_id', 'gender', 'age_group', 'income_group', 'membership_year', 'event', 'time', 'offer_id', 'amount', 'reward_x', 'offer_type', 'difficulty', 'reward_y', 'duration', 'channels', 'offer_key']

*** Data Types:
  customer_id: object
  gender: object
  age_group: object
  income_group: category
  membership_year: int64
  event: object
  time: int64
  offer_id: object
  amount: float64
  reward_x: float64
  offer_type: object
  difficulty: object
  reward_y: object
  duration: object
  channels: object
  offer_key: object

*** Mixed Data Types:
  offer_id:
    - str: 32070
    - float: 29581
  offer_type:
    - str: 32070
    - float: 29581
  difficulty:
    - str: 32070
    - float: 29581
  reward_y:
    - str: 32070
    - float: 29581
  duration:
    - str: 32070
    - float: 29581
  channels:
    - str: 32070
    - float: 29581
  offe

In [43]:
#drop reward_y and rename reward_x to reward
df_final_offers = df_final_offers.drop(columns=['reward_y'])
df_final_offers.rename(columns={'reward_x': 'reward'}, inplace=True)
df_final_offers.head()


Unnamed: 0,customer_id,gender,age_group,income_group,membership_year,event,time,offer_id,amount,reward,offer_type,difficulty,duration,channels,offer_key
0,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,528,,23.22,,,,,,
1,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,offer completed,528,9b98b8c7a33c4b65b9aebfe6a799e6d9,,5.0,bogo,5.0,7.0,"['web', 'email', 'mobile']",bogo-5-5-7
2,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,transaction,132,,19.89,,,,,,
3,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,offer completed,132,9b98b8c7a33c4b65b9aebfe6a799e6d9,,5.0,bogo,5.0,7.0,"['web', 'email', 'mobile']",bogo-5-5-7
4,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017,transaction,510,,21.72,,,,,,


In [44]:
#which offer type was more succesful
df_final_offers.offer_type.value_counts()

offer_type
discount    16970
bogo        15100
Name: count, dtype: int64

In [45]:
#which offer key was more succesful
df_final_offers.offer_key.value_counts()

offer_key
discount-10-2-10    4957
discount-7-3-7      4847
bogo-5-5-7          4141
bogo-5-5-5          4074
discount-10-2-7     3860
bogo-10-10-7        3604
discount-20-5-10    3306
bogo-10-10-5        3281
Name: count, dtype: int64