# Data Wrangling - Responses

The purpose of this notebook is to prepare the transcript data so that it provides a usable dataset for the data exploration and modeling steps. The output of this notebook is a CSV file that records, for each person and offer type, whether the offer was completed (Yes) or not (No).
The cleaning steps are:
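1. Separate the dataset into 4 separate dataframes: 'received', 'viewed', 'completed', and 'transaction'
2. Merge the 'received', 'viewed', and 'completed' dataframes back into one dataframe, filter out views and completions that occur out of order, and join in the offer details from the portfolio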

## Import libraries and datasets 

In [1]:
import json

import numpy as np
import pandas as pd

try:
    # Preferred location: json_normalize is exposed at the top level of pandas.
    from pandas import json_normalize
except ImportError:
    # Fallback for old pandas releases that only provide the legacy location
    # (deprecated, and no longer importable in recent pandas releases).
    from pandas.io.json import json_normalize

# Load the three line-delimited JSON inputs (one record per line)
portfolio, profile, transcript = (
    pd.read_json(json_path, orient='records', lines=True)
    for json_path in (
        'data/portfolio.json',
        'data/profile.json',
        'data/transcript.json',
    )
)

In [2]:
# Convert 'value' column into separate columns.
# Each entry of 'value' is expanded into its own set of columns; the expanded
# frame is then placed side by side with the original rows (aligned on index)
# and the now-redundant 'value' column is removed.
# pd.json_normalize is the public entry point; the pandas.io.json alias is
# deprecated and is not importable in recent pandas releases.
transcript_values = pd.json_normalize(transcript['value'])
transcript = pd.concat([transcript, transcript_values], axis=1)
transcript = transcript.drop(columns=['value'])

  


In [3]:
# Coalesce the two spellings of the offer identifier: keep 'offer_id' where it
# is present, otherwise take the value from 'offer id'; then drop 'offer id'.
transcript['offer_id'] = transcript['offer_id'].where(
    transcript['offer_id'].notna(),
    transcript['offer id'],
)
transcript = transcript.drop(columns=['offer id'])

In [4]:
# Split the transcript by event type into four dataframes:
# 'received', 'viewed', 'completed' and 'transaction'
received = transcript.loc[transcript['event'].eq('offer received')]
viewed = transcript.loc[transcript['event'].eq('offer viewed')]
completed = transcript.loc[transcript['event'].eq('offer completed')]
transaction = transcript.loc[transcript['event'].eq('transaction')]

In [5]:
# Left-join each received offer to the views of the same offer by the same
# person. Both sides carry a 'time' column, so the merge suffixes them with
# _x (received) and _y (viewed); rename those to descriptive names.
received_w_viewed = (
    received[['person', 'offer_id', 'time']]
    .merge(
        viewed[['person', 'offer_id', 'time']],
        how='left',
        on=['person', 'offer_id'],
    )
    .rename(columns={"time_x": "received_time", "time_y": "viewed_time"})
)

In [6]:
# Keep only (receipt, view) pairs where the view is at or after the receipt,
# then keep the first remaining row for each person / offer / receipt time.
# The >= comparison is False when viewed_time is NaN, so receipts that have no
# matching view are dropped by this filter.
view_after_receive = (
    received_w_viewed
    .loc[lambda frame: frame['viewed_time'] >= frame['received_time']]
    .drop_duplicates(subset=['person', 'offer_id', 'received_time'])
)

In [7]:
# Attach completion events (time and reward) to each viewed receipt of the same
# offer by the same person; rows without a completion get NaN from the left join.
completed_w_viewed = (
    view_after_receive
    .merge(
        completed[['person', 'offer_id', 'time', 'reward']],
        how='left',
        on=['person', 'offer_id'],
    )
    .rename(columns={'time': 'completed_time'})
)

In [8]:
# Drop rows whose completion happened before the view. Rows with no completion
# (NaN completed_time) are kept, because the `<` test evaluates False for NaN.
complete = completed_w_viewed.loc[
    lambda frame: ~(frame['completed_time'] < frame['viewed_time'])
]

In [9]:
# Keep a single row (the first encountered) per person / offer / receipt / view.
complete = complete.drop_duplicates(
    subset=['person', 'offer_id', 'received_time', 'viewed_time'],
    keep='first',
)

In [10]:
# Make sure one completion event is attributed to only one row: when several
# rows share the same person / offer / completion time, keep the last one.
# NOTE(review): drop_duplicates appears to treat NaN completed_time values as
# equal, which would also collapse repeated uncompleted receipts of the same
# offer by the same person into one row - confirm this is intended.
complete = complete.drop_duplicates(
    subset=['person', 'offer_id', 'completed_time'],
    keep='last',
)

In [11]:
# Align the portfolio key with the transcript: its 'id' column becomes 'offer_id'
portfolio = portfolio.rename({'id': 'offer_id'}, axis='columns')

In [12]:
# Bring each offer's duration and type from the portfolio onto the event rows
df = complete.merge(
    portfolio[['offer_id', 'duration', 'offer_type']],
    how='left',
    on=['offer_id'],
)

In [13]:
# Flag a row as a completed offer only when all of the following hold:
#   * a completion time and a view time are both present,
#   * the completion falls inside the offer's duration,
#   * the offer was viewed strictly before it was completed.
# Missing values are tested with .notna(): comparing a Series to None with `!=`
# is element-wise and evaluates True for every row (including NaN), so it never
# filters anything.
has_completion = df['completed_time'].notna()
has_view = df['viewed_time'].notna()
# Elapsed time is divided by 24 before comparing with 'duration'
# (assumes times are in hours and duration is in days - TODO confirm).
within_duration = (df['completed_time'] - df['received_time']) / 24 < df['duration']
viewed_before_completed = df['viewed_time'] < df['completed_time']

df['completed_offer'] = (
    has_completion & has_view & within_duration & viewed_before_completed
)

In [14]:
# Final dataset: who received the offer, whether it was completed, and the offer type
offer_response = df.loc[:, ['person', 'completed_offer', 'offer_type']]

In [15]:
# Write the offer completion data to CSV, leaving out the dataframe index
offer_response.to_csv(path_or_buf='data/offer_response.csv', index=False)