In [1]:
#import necessary library
import numpy as np # linear algebra
import pandas as pd # data manipulation and analysis
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization
sns.set_style('whitegrid') # set style for visualization
import warnings # ignore warnings
warnings.filterwarnings('ignore')

from initial_report import *

In [2]:
# Load the cleaned customer table from the working directory
df_customer = pd.read_csv(filepath_or_buffer="cleaned_customer_data.csv")

In [3]:
# Preview 5 randomly drawn customer rows (no random_state is set, so the rows differ on every run)
df_customer.sample(5)

Unnamed: 0,customer_id,became_member_on,gender,age,income
37,e9844d5beac04bf8b8d3ad18f7e37fca,2017-06-26,F,56,52000.0
2476,9295a791bb4949969b3e3f837c36a753,2017-11-19,M,23,41000.0
11087,683cc3d7e6974945ae2b60244d3c3b80,2014-01-31,M,63,68000.0
89,52f0bd64403f4958ab8fe9a6947c13c3,2016-01-26,F,45,44000.0
805,91b8bd3486414b79a3cf714f2f63d51b,2017-01-29,F,65,77000.0


In [4]:
# Print the data-quality checklist for the customer table.
# initial_report is presumably supplied by the star import of the initial_report module.
initial_report(df_customer)

 *** DATA CLEANING CHECKLIST ***
----------------------------------------
*** Structure:
- Total Rows: 14825
- Total Columns: 5
- Column Names: ['customer_id', 'became_member_on', 'gender', 'age', 'income']

*** Data Types:
  customer_id: object
  became_member_on: object
  gender: object
  age: int64
  income: float64

*** Mixed Data Types:

*** Distinct Values per Column:
  customer_id: 14825
  became_member_on: 1707
  gender: 3
  age: 84
  income: 91

*** Null Values and Percentages:


*** Duplicates: 0

*** Negative or Zero Values:

*** Basic Statistics:
                age         income
count  14825.000000   14825.000000
mean      54.393524   65404.991568
std       17.383705   21598.299410
min       18.000000   30000.000000
25%       42.000000   49000.000000
50%       55.000000   64000.000000
75%       66.000000   80000.000000
max      101.000000  120000.000000

*** Category Description:
                             customer_id became_member_on gender
count                       

1. `became_member_on` is a date column but is stored as `object` (string), so it needs to be converted to datetime.

In [5]:
# Parse the membership date strings into timestamps, then confirm the resulting dtype
df_customer['became_member_on'] = pd.to_datetime(df_customer.became_member_on)
print(df_customer.became_member_on.dtype)

datetime64[ns]


In [6]:
# Check the lowest and highest customer age
print(df_customer['age'].min(), df_customer['age'].max())

18 101


In [7]:
# Check the lowest and highest customer income
print(df_customer['income'].min(), df_customer['income'].max())

30000.0 120000.0


In [8]:
# Create bins for grouped analysis.
# Age bins are right-closed: (0, 30], (30, 45], (45, 60], (60, inf], so the '<30'
# label also covers age 30. The last edge is open-ended so an age above the currently
# observed maximum still lands in '60+' instead of silently becoming NaN.
df_customer['age_group'] = pd.cut(
    df_customer['age'],
    bins=[0, 30, 45, 60, np.inf],
    labels=['<30', '30-45', '45-60', '60+'],
)

# Quartile-based income bands (four groups of roughly equal size)
df_customer['income_group'] = pd.qcut(
    df_customer['income'],
    q=4,
    labels=['low', 'mid-Low', 'mid-high', 'high'],
)

# Read the year from the timestamp itself rather than slicing its string form.
# pd.to_datetime leaves an already-converted column unchanged and also accepts the raw
# 'YYYY-MM-DD' strings, so this works whether or not the column was converted earlier.
df_customer['membership_year'] = (
    pd.to_datetime(df_customer['became_member_on']).dt.year.astype(int)
)
df_customer.sample(5)

Unnamed: 0,customer_id,became_member_on,gender,age,income,age_group,income_group,membership_year
9970,6336d3be58b24cd0b30f8ebade7247e9,2016-02-13,M,82,92000.0,60+,high,2016
2153,3577fe12bcdc4ae480feb3bd325f3f47,2017-06-25,M,50,42000.0,45-60,low,2017
11029,3061f75d2b3b4a92b6b97019341d4f19,2017-09-03,F,50,100000.0,45-60,high,2017
13208,e704d8eaf0674be1a38e6a09c65674d3,2017-12-25,M,53,61000.0,45-60,mid-Low,2017
2837,b93ac0e7c56a4bb8a4b379cc21befcba,2016-09-05,F,53,83000.0,45-60,high,2016


In [9]:
# Keep only the id, gender and the derived grouping columns
df_customer_trimmed = df_customer.drop(['became_member_on', 'age', 'income'], axis=1)
df_customer_trimmed.sample(5)

Unnamed: 0,customer_id,gender,age_group,income_group,membership_year
9455,6dba14f698ae4030ab7354cd5cfe7119,M,45-60,mid-high,2016
1323,8c8b73983add4a828052e98775745f27,M,45-60,mid-Low,2016
8305,48e8eebb89814e9495460ebe04734cf8,F,60+,high,2016
9543,fce53ca9a0964a2fa0a2e9fcd1015a00,F,60+,low,2018
11501,0d20a089f54045539b2ee9b292c0cdc8,F,60+,high,2017


In [10]:
# Number of customers in each age group
df_customer_trimmed['age_group'].value_counts()

age_group
60+      5542
45-60    4927
30-45    2651
<30      1705
Name: count, dtype: int64

In [11]:
# Number of customers in each income group
df_customer_trimmed['income_group'].value_counts()

income_group
mid-Low     3863
low         3781
mid-high    3616
high        3565
Name: count, dtype: int64

In [12]:
# Number of customers per membership start year
df_customer_trimmed['membership_year'].value_counts()

membership_year
2017    5599
2018    3669
2016    3024
2015    1597
2014     662
2013     274
Name: count, dtype: int64

In [13]:
# Load the cleaned offer catalogue from the working directory
df_offer = pd.read_csv(filepath_or_buffer="cleaned_offers.csv")

In [14]:
# Preview 5 randomly drawn offer rows (no random_state is set, so the rows differ on every run)
df_offer.sample(5)

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,web,email,mobile,social
5,2298d6c36e964ae4a3e7e9706d1fb8c2,discount,7,3,7,1,1,1,1
8,f19421c1d4aa40978ebb69ca19b0e20d,bogo,5,5,5,1,1,1,1
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,1,1,0,0
9,2906b810c7d4411798c6938adc9daaa5,discount,10,2,7,1,1,1,0
1,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,10,5,1,1,1,1


In [15]:
# Print the data-quality checklist for the offer table.
# initial_report is presumably supplied by the star import of the initial_report module.
initial_report(df_offer)

 *** DATA CLEANING CHECKLIST ***
----------------------------------------
*** Structure:
- Total Rows: 10
- Total Columns: 9
- Column Names: ['offer_id', 'offer_type', 'difficulty', 'reward', 'duration', 'web', 'email', 'mobile', 'social']

*** Data Types:
  offer_id: object
  offer_type: object
  difficulty: int64
  reward: int64
  duration: int64
  web: int64
  email: int64
  mobile: int64
  social: int64

*** Mixed Data Types:

*** Distinct Values per Column:
  offer_id: 10
  offer_type: 3
  difficulty: 5
  reward: 5
  duration: 5
  web: 2
  email: 1
  mobile: 2
  social: 2

*** Null Values and Percentages:


*** Duplicates: 0
ðŸ§± Constant Columns (no variance): ['email']

*** Negative or Zero Values:
  difficulty: 2
  reward: 2
  web: 2
  mobile: 1
  social: 4

*** Basic Statistics:
       difficulty     reward   duration        web  email     mobile  \
count   10.000000  10.000000  10.000000  10.000000   10.0  10.000000   
mean     7.700000   4.200000   6.500000   0.800000    1.0

In [16]:
# Build a readable offer label of the form "<offer_type>-<difficulty>-<reward>-<duration>"
df_offer['offer_combo'] = (
    df_offer.offer_type.astype(str)
    + '-' + df_offer.difficulty.astype(str)
    + '-' + df_offer.reward.astype(str)
    + '-' + df_offer.duration.astype(str)
)
df_offer.sample(5)

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,web,email,mobile,social,offer_combo
9,2906b810c7d4411798c6938adc9daaa5,discount,10,2,7,1,1,1,0,discount-10-2-7
8,f19421c1d4aa40978ebb69ca19b0e20d,bogo,5,5,5,1,1,1,1,bogo-5-5-5
0,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10,10,7,0,1,1,1,bogo-10-10-7
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,1,1,0,0,discount-20-5-10
5,2298d6c36e964ae4a3e7e9706d1fb8c2,discount,7,3,7,1,1,1,1,discount-7-3-7


In [17]:
# Drop the four columns whose values are already encoded in offer_combo
df_offer_trimmed = df_offer.drop(['offer_type', 'difficulty', 'reward', 'duration'], axis=1)
df_offer_trimmed

Unnamed: 0,offer_id,web,email,mobile,social,offer_combo
0,ae264e3637204a6fb9bb56bc8210ddfd,0,1,1,1,bogo-10-10-7
1,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1,bogo-10-10-5
2,3f207df678b143eea3cee63160fa8bed,1,1,1,0,informational-0-0-4
3,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,1,0,bogo-5-5-7
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,1,0,0,discount-20-5-10
5,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1,1,discount-7-3-7
6,fafdcd668e3743c1bb461111dcafc2a4,1,1,1,1,discount-10-2-10
7,5a8bc65990b245e5a138643cd4eb9837,0,1,1,1,informational-0-0-3
8,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1,bogo-5-5-5
9,2906b810c7d4411798c6938adc9daaa5,1,1,1,0,discount-10-2-7


In [18]:
# Load the cleaned event log and preview 5 random rows
df_event = pd.read_csv(filepath_or_buffer="cleaned_events.csv")
df_event.sample(5)

Unnamed: 0,customer_id,event,time,offer_id,amount,reward
174604,c018899eae214a7bbcd56aab6217a525,offer viewed,426,0b1e1539f2cc45b7b9fa7c272da2e1d7,,
297,79d6e0ec46a8486a9df7b24552be074b,offer received,0,f19421c1d4aa40978ebb69ca19b0e20d,,
230905,be33fc21689342f2b90a00a1892433bf,offer completed,534,ae264e3637204a6fb9bb56bc8210ddfd,,10.0
88846,5631d406ccde4e8099f1ac4ed77ac193,transaction,222,,10.26,
87605,41937f6173f443d3a5d5dfb5ad67746d,offer viewed,222,2906b810c7d4411798c6938adc9daaa5,,


In [19]:
# Column dtypes and non-null counts for the event log
df_event.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306137 entries, 0 to 306136
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   customer_id  306137 non-null  object 
 1   event        306137 non-null  object 
 2   time         306137 non-null  int64  
 3   offer_id     167184 non-null  object 
 4   amount       138953 non-null  float64
 5   reward       33182 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 14.0+ MB


In [20]:
# Inspect the full event history of one hand-picked customer_id
df_event.loc[df_event['customer_id'] == "0009655768c64bdeb2e877511632db8f"]

Unnamed: 0,customer_id,event,time,offer_id,amount,reward
55972,0009655768c64bdeb2e877511632db8f,offer received,168,5a8bc65990b245e5a138643cd4eb9837,,
77699,0009655768c64bdeb2e877511632db8f,offer viewed,192,5a8bc65990b245e5a138643cd4eb9837,,
89279,0009655768c64bdeb2e877511632db8f,transaction,228,,22.16,
113591,0009655768c64bdeb2e877511632db8f,offer received,336,3f207df678b143eea3cee63160fa8bed,,
139973,0009655768c64bdeb2e877511632db8f,offer viewed,372,3f207df678b143eea3cee63160fa8bed,,
153378,0009655768c64bdeb2e877511632db8f,offer received,408,f19421c1d4aa40978ebb69ca19b0e20d,,
168380,0009655768c64bdeb2e877511632db8f,transaction,414,,8.57,
168381,0009655768c64bdeb2e877511632db8f,offer completed,414,f19421c1d4aa40978ebb69ca19b0e20d,,5.0
187450,0009655768c64bdeb2e877511632db8f,offer viewed,456,f19421c1d4aa40978ebb69ca19b0e20d,,
204195,0009655768c64bdeb2e877511632db8f,offer received,504,fafdcd668e3743c1bb461111dcafc2a4,,


1. A single customer receives multiple offers at different points in time.
2. We can't directly link a transaction to a specific offer unless we logically associate them within the offer's active window.

In [21]:
# Does every customer in the event log have at least one 'offer received' event?

# Every distinct customer id present in the log
all_customers = set(df_event.customer_id.unique())

# Customers with one or more 'offer received' events
received_customers = set(df_event.loc[df_event['event'] == 'offer received', 'customer_id'])

# Customers who appear in the log but were never sent an offer
no_offer_customers = all_customers.difference(received_customers)

# Sanity check: the two groups should add up to the total
print(f"Total in df_event: {len(all_customers)}")
print(f"Received offers: {len(received_customers)}")
print(f"Did NOT receive offers: {len(no_offer_customers)}")
print(f"Sum: {len(received_customers) + len(no_offer_customers)}")

Total in df_event: 17000
Received offers: 16994
Did NOT receive offers: 6
Sum: 17000


In [22]:
# How many customers viewed at least one offer?

# Every distinct customer id present in the log
all_customers = set(df_event.customer_id.unique())

# Customers with one or more 'offer viewed' events
viewed_customers = set(df_event.loc[df_event['event'] == 'offer viewed', 'customer_id'])

# Customers who appear in the log but never viewed an offer
no_view_customers = all_customers.difference(viewed_customers)

# Report the counts; the two groups should add up to the total
print(f"Total customers in df_event: {len(all_customers)}")
print(f"Customers who viewed at least one offer: {len(viewed_customers)}")
print(f"Customers who did NOT view any offer: {len(no_view_customers)}")
print(f"Sum: {len(viewed_customers) + len(no_view_customers)}")


Total customers in df_event: 17000
Customers who viewed at least one offer: 16834
Customers who did NOT view any offer: 166
Sum: 17000


In [23]:
# How many customers completed at least one offer?

# Every distinct customer id present in the log
all_customers = set(df_event.customer_id.unique())

# Customers with one or more 'offer completed' events
completed_customers = set(df_event.loc[df_event['event'] == 'offer completed', 'customer_id'])

# Customers who appear in the log but never completed an offer
no_completed_customers = all_customers.difference(completed_customers)

# Report the counts; the two groups should add up to the total
print(f"Total customers in df_event: {len(all_customers)}")
print(f"Customers who completed at least one offer: {len(completed_customers)}")
print(f"Customers who did NOT complete any offer: {len(no_completed_customers)}")
print(f"Sum: {len(completed_customers) + len(no_completed_customers)}")

Total customers in df_event: 17000
Customers who completed at least one offer: 12774
Customers who did NOT complete any offer: 4226
Sum: 17000


In [24]:
# Count the rows of each event type in the event log
total_received = len(df_event[df_event['event'] == 'offer received'])
total_viewed = len(df_event[df_event['event'] == 'offer viewed'])
total_completed = len(df_event[df_event['event'] == 'offer completed'])
total_transaction = len(df_event[df_event['event'] == 'transaction'])

print(f"Total offers received: {total_received}")
print(f"Total offers viewed: {total_viewed}")
print(f"Total offers completed: {total_completed}")
print(f"Total Transaction: {total_transaction}")



Total offers received: 76277
Total offers viewed: 57725
Total offers completed: 33182
Total Transaction: 138953


In [25]:
# Goal: one row per customer and offer_id carrying that offer's status timestamps.
# Step 1: the 'offer received' events, with the event time exposed as received_time
df_received = (
    df_event
    .loc[df_event['event'] == 'offer received', ['customer_id', 'offer_id', 'time']]
    .rename(columns={'time': 'received_time'})
)
df_received.shape

(76277, 3)

In [26]:
# The 'offer viewed' events, with the event time exposed as viewed_time
df_viewed = (
    df_event
    .loc[df_event['event'] == 'offer viewed', ['customer_id', 'offer_id', 'time']]
    .rename(columns={'time': 'viewed_time'})
)
df_viewed.shape


(57725, 3)

In [27]:
# The 'offer completed' events, with the event time exposed as completed_time
df_completed = (
    df_event
    .loc[df_event['event'] == 'offer completed', ['customer_id', 'offer_id', 'time']]
    .rename(columns={'time': 'completed_time'})
)
df_completed.shape


(33182, 3)

In [28]:
# The 'transaction' events (no offer_id), with the event time exposed as transaction_time
df_transaction = (
    df_event
    .loc[df_event['event'] == 'transaction', ['customer_id', 'time', 'amount']]
    .rename(columns={'time': 'transaction_time'})
)
df_transaction.shape


(138953, 3)

In [29]:
# Attach view times to receipts, matching on customer and offer.
# NOTE(review): the join keys are not unique on either side. In the recorded run the
# 76277 receipts grow to 95321 rows, so when a customer gets the same offer more than
# once each receipt appears to be paired with every view of that offer. Confirm this
# is intended before relying on per-offer counts.
df_lifecycle = df_received.merge(df_viewed, how="outer", on=['customer_id', 'offer_id'])
df_lifecycle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95321 entries, 0 to 95320
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   customer_id    95321 non-null  object 
 1   offer_id       95321 non-null  object 
 2   received_time  95321 non-null  int64  
 3   viewed_time    79329 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.9+ MB


In [30]:
# Attach completion times the same way (customer + offer keys, outer join).
# NOTE(review): this rebinds df_lifecycle, so running the cell a second time yields
# completed_time_x / completed_time_y; rebuild df_lifecycle from the received/viewed
# merge before re-running. The keys are also non-unique, so rows multiply again here.
df_lifecycle = df_lifecycle.merge(df_completed, how="outer", on=['customer_id', 'offer_id'])
df_lifecycle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113934 entries, 0 to 113933
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   customer_id     113934 non-null  object 
 1   offer_id        113934 non-null  object 
 2   received_time   113934 non-null  int64  
 3   viewed_time     97380 non-null   float64
 4   completed_time  65722 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 4.3+ MB


In [31]:
# Preview the first rows of the combined lifecycle table
df_lifecycle.head()

Unnamed: 0,customer_id,offer_id,received_time,viewed_time,completed_time
0,0009655768c64bdeb2e877511632db8f,2906b810c7d4411798c6938adc9daaa5,576,,576.0
1,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,336,372.0,
2,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,168,192.0,
3,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,408,456.0,414.0
4,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,504,540.0,528.0


In [32]:
# Add the readable offer_combo label for context.
# Any offer_combo column left over from an earlier run is dropped first, so re-running
# the cell does not produce offer_combo_x / offer_combo_y. validate raises if df_offer
# ever holds duplicate offer_id rows, which would otherwise silently multiply rows.
df_lifecycle = (
    df_lifecycle
    .drop(columns=['offer_combo'], errors='ignore')
    .merge(
        df_offer[['offer_id', 'offer_combo']],
        on='offer_id',
        how='left',
        validate='many_to_one',
    )
)
df_lifecycle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113934 entries, 0 to 113933
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   customer_id     113934 non-null  object 
 1   offer_id        113934 non-null  object 
 2   received_time   113934 non-null  int64  
 3   viewed_time     97380 non-null   float64
 4   completed_time  65722 non-null   float64
 5   offer_combo     113934 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 5.2+ MB


In [33]:
# Check lifecycle order validity: received_time <= viewed_time <= completed_time
# (both comparisons are inclusive, so same-hour events count as valid).
# A comparison against NaN evaluates to False, so rows with a missing view or
# completion are already False and the column is plain bool. No fillna is needed;
# the previous chained `df[col].fillna(..., inplace=True)` acts on a temporary and
# does not update the frame when pandas Copy-on-Write is active.
df_lifecycle['valid_path'] = (
    df_lifecycle['viewed_time'] >= df_lifecycle['received_time']
) & (
    df_lifecycle['completed_time'] >= df_lifecycle['viewed_time']
)

df_lifecycle.sample(5)

Unnamed: 0,customer_id,offer_id,received_time,viewed_time,completed_time,offer_combo,valid_path
108323,f33f934662044f889318b4e5410da827,2906b810c7d4411798c6938adc9daaa5,408,,,discount-10-2-7,False
26382,3b3e4f3238db42eebb04b1f658b55a9f,4d5c57ea9a6940dd891ad53e9dbe8da0,168,258.0,,bogo-10-10-5,False
98766,de0945c55cec445aa4d4132d3c6118d3,2298d6c36e964ae4a3e7e9706d1fb8c2,576,582.0,582.0,discount-7-3-7,True
98597,dd9b7e79bfad46c58d310f6bbcae5cee,3f207df678b143eea3cee63160fa8bed,504,588.0,,informational-0-0-4,False
58489,83a9176f7ea9473fb8b93dcac68003e5,9b98b8c7a33c4b65b9aebfe6a799e6d9,168,594.0,,bogo-5-5-7,False


In [34]:
# Keep only the rows flagged as a valid path, renumbering the index from 0
# so the result displays cleanly
df_valid = df_lifecycle.loc[df_lifecycle['valid_path']].reset_index(drop=True)

# Preview the valid offer lifecycle rows
df_valid.head()

Unnamed: 0,customer_id,offer_id,received_time,viewed_time,completed_time,offer_combo,valid_path
0,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,408,432.0,576.0,discount-20-5-10,True
1,0011e0d4e6b944f998e987f904e8c1e5,2298d6c36e964ae4a3e7e9706d1fb8c2,168,186.0,252.0,discount-7-3-7,True
2,0011e0d4e6b944f998e987f904e8c1e5,9b98b8c7a33c4b65b9aebfe6a799e6d9,504,516.0,576.0,bogo-5-5-7,True
3,0020c2b971eb4e9188eac86d93036a77,4d5c57ea9a6940dd891ad53e9dbe8da0,408,426.0,510.0,bogo-10-10-5,True
4,0020c2b971eb4e9188eac86d93036a77,fafdcd668e3743c1bb461111dcafc2a4,0,12.0,54.0,discount-10-2-10,True


In [35]:
# Column dtypes and non-null counts for the valid-path rows
df_valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33395 entries, 0 to 33394
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   customer_id     33395 non-null  object 
 1   offer_id        33395 non-null  object 
 2   received_time   33395 non-null  int64  
 3   viewed_time     33395 non-null  float64
 4   completed_time  33395 non-null  float64
 5   offer_combo     33395 non-null  object 
 6   valid_path      33395 non-null  bool   
dtypes: bool(1), float64(2), int64(1), object(3)
memory usage: 1.6+ MB


In [36]:
# Duration decides whether a completion still counts, so bring it in from df_offer.
# Any duration / offer_end_time columns left over from an earlier run are dropped first;
# otherwise a re-run would create duration_x / duration_y and the lookup below would
# raise KeyError. validate raises if df_offer ever holds duplicate offer_id rows.
df_valid = (
    df_valid
    .drop(columns=['duration', 'offer_end_time'], errors='ignore')
    .merge(
        df_offer[['offer_id', 'duration']],
        on='offer_id',
        how='left',
        validate='many_to_one',
    )
)

# End of the offer window. The * 24 assumes duration is in days and event times
# are in hours - TODO confirm against the data dictionary.
df_valid['offer_end_time'] = df_valid['received_time'] + (df_valid['duration'] * 24)

# Keep only offers that were completed on or before the end of their window
df_valid_final = df_valid[
    (df_valid['completed_time'] <= df_valid['offer_end_time'])
]
df_valid_final.head()

Unnamed: 0,customer_id,offer_id,received_time,viewed_time,completed_time,offer_combo,valid_path,duration,offer_end_time
0,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,408,432.0,576.0,discount-20-5-10,True,10,648
1,0011e0d4e6b944f998e987f904e8c1e5,2298d6c36e964ae4a3e7e9706d1fb8c2,168,186.0,252.0,discount-7-3-7,True,7,336
2,0011e0d4e6b944f998e987f904e8c1e5,9b98b8c7a33c4b65b9aebfe6a799e6d9,504,516.0,576.0,bogo-5-5-7,True,7,672
3,0020c2b971eb4e9188eac86d93036a77,4d5c57ea9a6940dd891ad53e9dbe8da0,408,426.0,510.0,bogo-10-10-5,True,5,528
4,0020c2b971eb4e9188eac86d93036a77,fafdcd668e3743c1bb461111dcafc2a4,0,12.0,54.0,discount-10-2-10,True,10,240


In [37]:
# Remove the helper columns that were only needed for the validity and window checks
df_event_alter = df_valid_final.drop(['valid_path', 'duration', 'offer_end_time'], axis=1)
df_event_alter.head()

Unnamed: 0,customer_id,offer_id,received_time,viewed_time,completed_time,offer_combo
0,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,408,432.0,576.0,discount-20-5-10
1,0011e0d4e6b944f998e987f904e8c1e5,2298d6c36e964ae4a3e7e9706d1fb8c2,168,186.0,252.0,discount-7-3-7
2,0011e0d4e6b944f998e987f904e8c1e5,9b98b8c7a33c4b65b9aebfe6a799e6d9,504,516.0,576.0,bogo-5-5-7
3,0020c2b971eb4e9188eac86d93036a77,4d5c57ea9a6940dd891ad53e9dbe8da0,408,426.0,510.0,bogo-10-10-5
4,0020c2b971eb4e9188eac86d93036a77,fafdcd668e3743c1bb461111dcafc2a4,0,12.0,54.0,discount-10-2-10


In [38]:
# (rows, columns) of the lifecycle table after the offer-window filter
df_event_alter.shape

(25214, 6)

In [39]:
# Pair every retained offer row with all transactions of the same customer.
# Left join on customer_id: an offer row fans out to one row per transaction, and
# customers without transactions keep a single row with empty transaction fields.
df_with_txns = pd.merge(df_event_alter, df_transaction, how='left', on='customer_id')


In [40]:

# Keep only transactions made between viewing and completing the offer (both ends inclusive).
# NOTE(review): one transaction can satisfy this for several offers of the same customer,
# so the same amount may appear on more than one offer row.
mask = (
    (df_with_txns['viewed_time'] <= df_with_txns['transaction_time'])
    & (df_with_txns['completed_time'] >= df_with_txns['transaction_time'])
)

df_with_txns = df_with_txns[mask]

# Preview the first rows
df_with_txns.head()

Unnamed: 0,customer_id,offer_id,received_time,viewed_time,completed_time,offer_combo,transaction_time,amount
2,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,408,432.0,576.0,discount-20-5-10,576,22.05
6,0011e0d4e6b944f998e987f904e8c1e5,2298d6c36e964ae4a3e7e9706d1fb8c2,168,186.0,252.0,discount-7-3-7,252,11.93
12,0011e0d4e6b944f998e987f904e8c1e5,9b98b8c7a33c4b65b9aebfe6a799e6d9,504,516.0,576.0,bogo-5-5-7,576,22.05
19,0020c2b971eb4e9188eac86d93036a77,4d5c57ea9a6940dd891ad53e9dbe8da0,408,426.0,510.0,bogo-10-10-5,510,17.24
23,0020c2b971eb4e9188eac86d93036a77,fafdcd668e3743c1bb461111dcafc2a4,0,12.0,54.0,discount-10-2-10,54,17.63


In [41]:
# (rows, columns) of the offer/transaction table after filtering
df_with_txns.shape

(37595, 8)

In [42]:
# Bind the final, descriptive name. This is an alias of df_with_txns (the same object),
# not a copy, so changes made through either name affect both.
df_offer_response = df_with_txns

In [43]:
# Write the offer-response table to CSV in the working directory, without the index column
df_offer_response.to_csv('offer_response_data.csv', index=False)