## Part 3: Preprocessing Data 

In [1]:
import numpy as np 
import pandas as pd

import warnings
# Install an "ignore" filter with no category, message, or module restriction,
# so every Python warning raised after this point is hidden.
# NOTE(review): this also hides pandas/numpy deprecation warnings; consider
# narrowing the filter to a specific category once the noisy source is known.
warnings.filterwarnings("ignore")

In [2]:
# Load the three cleaned CSV files into DataFrames, in the same order as the
# names on the left-hand side.
customer, offer, transcript = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/cleaned_customer.csv',
        'data/cleaned_offer.csv',
        'data/cleaned_transcript.csv',
    )
)

## Customer Characteristics
Before clustering, the customer and transcript datasets need to be combined so that each customer's behavior (event counts and money spent) can be aggregated into a single row.

Since the null values in Customer were removed in part 1, only the transcript rows whose `transcript['person']` matches a `customer['id']` are kept.

In [3]:
# Keep only the transcript rows that belong to a customer present in the
# customer table (inner join on transcript.person == customer.id).
merged_data = pd.merge(transcript, customer[['id']], left_on='person', right_on='id', how='inner')

# Count events per customer: one row per person, one column per event type.
grouped_transcript = (
    merged_data
    .groupby(['person', 'event'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
grouped_transcript.columns.name = None  # drop the leftover 'event' axis label

# Every column except the key holds an event count.
event_columns = [col for col in grouped_transcript.columns if col != 'person']

# Left-join onto the customer ids so customers without any event are kept.
# The customer 'id' column is the one retained: 'person' is NaN for customers
# with no events, so keeping it (and filling it with 0) would destroy their id.
customer_behaviors = (
    customer[['id']]
    .merge(grouped_transcript, left_on='id', right_on='person', how='left')
    .drop(columns=['person'])
)

# Fill only the count columns, and restore the integer dtype that the NaNs
# introduced by the left join would otherwise turn into float.
customer_behaviors[event_columns] = (
    customer_behaviors[event_columns].fillna(0).astype('int64')
)

customer_behaviors = customer_behaviors.rename(columns={'offer received': 'received',
                                                        'offer viewed': 'viewed',
                                                        'offer completed': 'completed',
                                                        'transaction': 'transactions'})

# Total money spent per customer (sum() skips NaN on non-transaction rows).
money_spent = (
    merged_data
    .groupby('person')['money_spent']
    .sum()
    .reset_index()
    .rename(columns={'person': 'id'})
)

# Left join again so customers without events stay in the table with 0 spent,
# instead of being silently dropped by the default inner join.
customer_behaviors = customer_behaviors.merge(money_spent, on='id', how='left')
customer_behaviors['money_spent'] = customer_behaviors['money_spent'].fillna(0)

# Both right-hand tables have one row per person, so no customer row may be
# lost or duplicated by the joins above.
assert len(customer_behaviors) == len(customer), "customer rows lost or duplicated"

customer_behaviors.head()

Unnamed: 0,id,completed,received,viewed,transactions,money_spent
0,0610b486422d4921ae7d2bf64640c50b,1,2,0,3,77.01
1,78afa995795e4d85b5d9ceeca43f5fef,3,4,4,7,159.27
2,e2127556f4f64592b11af22de27a7932,2,4,3,3,57.73
3,389bc3fa690240e798340f5a15918d5c,5,6,6,3,36.43
4,2eeac8d8feae4a8cad5a6af0499a211d,1,3,2,4,15.62


In [4]:
# Write the per-customer behavior table to disk, leaving out the row index.
customer_behaviors.to_csv('data/customer_behaviors.csv', index=False)

## Offer Types

In [4]:
# Keep every event except 'transaction', restricted to the columns needed to
# link an event to a person, a time, and an offer id.
offer_transcripts = (
    transcript
    .query("event != 'transaction'")
    [['person', 'event', 'hours_since_start', 'value_id']]
)

In [5]:
# Rename the offer key in place so it shares the 'value_id' name used by the
# transcript events (the `offer` frame itself is modified).
offer.rename(columns=dict(id='value_id'), inplace=True)

# Attach the offer attributes to each offer event. The join is an inner join,
# so events whose 'value_id' has no matching offer are dropped.
offer_transcripts = offer_transcripts.merge(offer, on='value_id', how='inner')
offer_transcripts.head()

Unnamed: 0,person,event,hours_since_start,value_id,reward,channels,difficulty,duration,offer_type,offer_code,channels_code
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,"['web', 'email', 'mobile']",5,7,bogo,A,B
1,ebe7ef46ea6f4963a7dd49f501b26779,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,"['web', 'email', 'mobile']",5,7,bogo,A,B
2,f082d80f0aac47a99173ba8ef8fc1909,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,"['web', 'email', 'mobile']",5,7,bogo,A,B
3,c0d210398dee4a0895b24444a5fcd1d2,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,"['web', 'email', 'mobile']",5,7,bogo,A,B
4,57dd18ec5ddc46828afb81ec5977bef2,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,"['web', 'email', 'mobile']",5,7,bogo,A,B


In [7]:
# Write the offer-event table (events joined with offer attributes) to disk,
# leaving out the row index.
offer_transcripts.to_csv('data/offer_transcripts.csv', index=False)