# Part 1 - Obtaining and formatting Discover Data

In [2]:
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option("display.max_columns", 100)

In [10]:
# import discover data, and filter for list of emails according to their earliest use of discover
discover_data = pd.read_csv("discover_print_to_csv.csv")
discover_data['timestamp'] = discover_data['timestamp'].map(pd.to_datetime)

# In request.concerns column, both sensitive skin status and skin types are appended together with user's concerns.
# Need to use pandas apply to remove those entries

# However, we need to first convert each element in the request.concerns from "strings (of list)" to "list" type 

# In order to do that, we need to fill blank entries with an empty list in order for the function ast.literal_eval() to be passed through successfully

columns_to_convert_list = ['concerns_addressed','match','request.concerns','reviews']

for column in columns_to_convert_list:
    
    discover_data[column] = discover_data[column].fillna("[]").map(lambda x: ast.literal_eval(x))

def remove_skin_types(x):
    terms_to_delete = ['Sensitivity', 'Combination','Dry to Very Dry', 'Normal', 'Oily']
    replacement_list = []
    for entry in x:
        if entry in terms_to_delete:
            del entry
        else:
            replacement_list.append(entry)
    return replacement_list
discover_data['request.concerns'] = discover_data['request.concerns'].map(remove_skin_types)

# print(type(discover_data['timestamp'][0]))
discover_data.head()

TypeError: Only booleans, lists, and dictionaries are accepted for the 'parse_dates' parameter

## Explore basic characteristics of Discover data

In [5]:
discover_data['request.gender'].value_counts()

2    35518
1     4990
0     1963
Name: request.gender, dtype: int64

In [8]:
# discover_data.to_csv('firebase_discover_formatted.csv')

In [9]:
# examine number of people who retake the test

email_timestamp_unique = discover_data[['email', 'timestamp']].drop_duplicates(keep='last')
email_timestamp_unique

test_take_count = pd.pivot_table(email_timestamp_unique, index= 'email', aggfunc='count')
total_submissions_count = len(email_timestamp_unique)
print('total_submissions_count:', total_submissions_count)
retakers_count = test_take_count[test_take_count['timestamp'] >1 ].count()
print('retakers_count:', retakers_count)
print('Percentage of retakers:', retakers_count/total_submissions_count)

total_submissions_count: 3435
retakers_count: timestamp    486
dtype: int64
Percentage of retakers: timestamp    0.141485
dtype: float64


In [10]:
discover_data[['timestamp']].drop_duplicates(keep='last').count()

timestamp    3432
dtype: int64

In [11]:
# Male Female Breakdown
email_gender_unique = discover_data[['email', 'request.gender']].drop_duplicates(keep='last')
male_email_count = len(email_gender_unique[email_gender_unique['request.gender'] == 1])
print('male_email_count:', male_email_count)
female_email_count = len(email_gender_unique[email_gender_unique['request.gender'] == 2])
print('female_email_count:', female_email_count)
female_percentage = female_email_count/(male_email_count + female_email_count)
print('female_percentage:', female_percentage)

male_email_count: 297
female_email_count: 2163
female_percentage: 0.8792682926829268


In [12]:
len(email_gender_unique[email_gender_unique['request.gender'] == 0])

134

In [13]:
# Age range distribution
email_age_unique = discover_data[['email', 'request.ageRange']].drop_duplicates(keep='last')
age_range_breakdown = email_age_unique.groupby('request.ageRange').count() #apply(lambda x: 100 * x / x.sum())
age_range_breakdown['Percentage'] =age_range_breakdown.apply(lambda x: 100 * x / x.sum())
age_range_breakdown # age_range_breakdown['Percentage'] = age_range_breakdown['email'].map(lambda x: 100 * x / x.sum())

Unnamed: 0_level_0,email,Percentage
request.ageRange,Unnamed: 1_level_1,Unnamed: 2_level_1
20.0,142,6.454545
30.0,1113,50.590909
40.0,768,34.909091
60.0,128,5.818182
70.0,49,2.227273


In [14]:
age_range_breakdown.sum()

email         2200.0
Percentage     100.0
dtype: float64

In [15]:
len(email_age_unique[pd.isnull(email_age_unique['request.ageRange']) == True])

373

# Part 2 - Match Discover takers with Sales Data
## Objectives:
### 1. Obtain a list SKUs bought right after they have taken Discover
### 2. Match the list with the skin profile (and any other relevant "features")

In [None]:
# Import shopify sales data
sales_data = pd.read_csv("shopify_orders_export_20180207.csv", 
                         low_memory=False, 
                         parse_dates=['Paid at', 'Fulfilled at', 'Created at'])

sales_data_clean = sales_data.drop(sales_data.columns.to_series()[-11:-1], axis=1)

# remove in-store purchases by dropping NA rows in Email column
sales_data_clean.dropna(subset=['Email'], axis=0, inplace=True)

# add columns referencing each customer's FIRST use of discover
sales_data_clean['discover_first_date'] = sales_data_clean['Email'].map(discover_first.set_index('email')['timestamp2'])
sales_data_clean['used_discover_already'] = (sales_data_clean['Created at']> \
                                             sales_data_clean['discover_first_date']).\
                                                map({True: "Used Discover", False: "Not yet"})
sales_data_clean['discover_sales_lead_time'] = sales_data_clean['Created at'] - sales_data_clean['discover_first_date']

# create sales dataset filtering customers who had used discover
# regardless of whether they are existing or completely new customers
post_discover_sales = sales_data_clean[sales_data_clean['used_discover_already'] == "Used Discover"]

# create sales dataset filtering customers who have never used discover and
# and also those customers' purchase histories right before they use discover
pre_discover_sales = sales_data_clean[~(sales_data_clean['used_discover_already'] == "Used Discover")]
