# Part 1 - Obtaining and formatting Discover Data

In [4]:
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option("display.max_columns", 100)

In [28]:
# import discover data, and filter for list of emails according to their earliest use of discover
discover_data = pd.read_csv("discover_print_to_csv.csv")
discover_data['timestamp'] = discover_data['timestamp'].map(pd.to_datetime)

# In request.concerns column, both sensitive skin status and skin types are appended together with user's concerns.
# Need to use pandas apply to remove those entries

# However, we need to first convert each element in the request.concerns from "strings (of list)" to "list" type 

# In order to do that, we need to fill blank entries with an empty list in order for the function ast.literal_eval() 
# to be passed through successfully

columns_to_convert_list = ['concerns_addressed','match','request.concerns','reviews']

for column in columns_to_convert_list:
    
    discover_data[column] = discover_data[column].fillna("[]").map(lambda x: ast.literal_eval(x))

def remove_skin_types(x):
    terms_to_delete = ['Sensitivity', 'Combination','Dry to Very Dry', 'Normal', 'Oily']
    replacement_list = []
    for entry in x:
        if entry in terms_to_delete:
            del entry
        else:
            replacement_list.append(entry)
    return replacement_list
discover_data['request.concerns'] = discover_data['request.concerns'].map(remove_skin_types)

# Remove entries without gender indicated
discover_data = discover_data[~(discover_data['request.gender'] == 0) & (discover_data['Serial'] <9)]

# discover_data.to_csv('firebase_discover_formatted.csv')
# print(type(discover_data['timestamp'][0]))

In [29]:
discover_data

Unnamed: 0,Serial,Name,SKU,auto_include,concerns_addressed,concerns_addressed_count,email,handle,match,min_req,rating,request.ageRange,request.concerns,request.gender,request.name,request.sensitivity,request.skinType,reviews,timestamp,type
0,0,Earth Sourced Gentle Cleansing Gel,8500,,[Sensitivity],1,yaminnphyu@mail.com,earth-sourced-perfectly-natural-cleansing-gel,[Sensitivity],['Sensitivity'],95,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 15, 12, 11, 31, 96]",2017-09-10 03:27:26,Cleanser
1,1,Earth Sourced Purely Natural Refreshing Toner,8510,,"[Dehydration, Sensitivity]",1,yaminnphyu@mail.com,earth-sourced-purely-natural-refreshing-toner,[Sensitivity],['Sensitivity'],83,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 9, 5, 7, 22, 76]",2017-09-10 03:27:26,Toner
2,2,Clear Regular Strength Anti-Redness Exfoliatin...,6200,,"[Clogged Pores, Uneven Texture, Enlarged Pores...",5,yaminnphyu@mail.com,clear-regular-strength-anti-redness-exfoliatin...,"[Enlarged Pores, PIH, Redness, Sensitivity, Acne]",['Acne'],85,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 16, 20, 23, 47, 214]",2017-09-10 03:27:26,Exfoliant
3,3,Resist Ultra-Light Super Antioxidant Concentra...,7740,,"[Dullness, Wrinkles, PIH, Enlarged Pores, Unev...",8,yaminnphyu@mail.com,resist-ultra-light-super-antioxidant-concentra...,"[Dullness, PIH, Enlarged Pores, Oily, Dullness...","['Dehydration', 'Clogged Pores', 'Enlarged Por...",79,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 20, 27, 23, 37, 248]",2017-09-10 03:27:26,Serum
4,4,Resist C15 Super Booster,7770,,"[Dehydration, PIH, Redness, Uneven Texture, Wr...",4,yaminnphyu@mail.com,resist-c15-super-booster,"[PIH, Redness, Dullness, Sun Damage]",['None'],74,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 53, 60, 65, 84, 427]",2017-09-10 03:27:26,Booster
5,5,Clear Ultra-Light Daily Mattifying Fluid SPF 30,6130,,"[Clogged Pores, Enlarged Pores, Acne]",2,yaminnphyu@mail.com,clear-ultra-light-daily-mattifying-fluid-spf30,"[Enlarged Pores, Acne]",['Acne'],87,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 11, 14, 22, 20, 67]",2017-09-10 03:27:26,SPF
6,6,Skin Balancing Invisible Finish Moisture Gel,3400,,[Enlarged Pores],1,yaminnphyu@mail.com,skin-balancing-invisible-finish-moisture-gel,[Enlarged Pores],"['Combination', 'Oily']",80,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 21, 20, 27, 50, 219]",2017-09-10 03:27:26,Moisturizer
7,7,Whitening Mask,2760,,"[Dehydration, Dullness, PIH, Sun Damage]",3,yaminnphyu@mail.com,radiance-renewal-whitening-mask,"[Dullness, PIH, Sun Damage]",['None'],77,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 5, 5, 10, 2, 50]",2017-09-10 03:27:26,Mask
8,8,Resist Weightless Body Treatment w/ 2% BHA (Sa...,5700,,"[Clogged Pores, Uneven Texture, Enlarged Pores...",3,yaminnphyu@mail.com,resist-weightless-body-treatment-2-bha,"[Enlarged Pores, Redness, Acne]",['None'],89,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 20, 26, 36, 55, 295]",2017-09-10 03:27:26,Body
13,0,Earth Sourced Gentle Cleansing Gel,8500,,[Sensitivity],1,yaminnphyu@mail.com,earth-sourced-perfectly-natural-cleansing-gel,[Sensitivity],['Sensitivity'],95,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, PI...",2,Yaminn Phyu,True,Oily,"[None, 15, 12, 11, 31, 96]",2017-09-10 03:29:06,Cleanser


In [83]:
single_row_recommendations = pd.DataFrame(discover_data.groupby(by = ['email', 
                                                    'timestamp',
                                                     'request.ageRange',
                                                      'request.gender',
                                                         'request.name',
                                                    'request.skinType'])['SKU'].apply(list)).reset_index()
single_row_recommendations = single_row_recommendations.drop_duplicates('email', keep='last')
single_row_recommendations.sort_values('timestamp')

Unnamed: 0,email,timestamp,request.ageRange,request.gender,request.name,request.skinType,SKU
2707,yaminnphyu@mail.com,2017-09-10 03:29:06,30.0,2,Yaminn Phyu,Oily,"[8500, 8510, 6200, 7740, 7770, 6130, 3400, 276..."
1863,littlelucky2000@gmail.com,2017-09-10 03:54:12,40.0,2,gem,Combination,"[7830, 1350, 6200, 7740, 7860, 6130, 7690, 275..."
437,Mukhacho@gmail.com,2017-09-10 04:01:06,30.0,2,Wyne,Combination,"[1150, 1350, 2010, 7740, 7870, 7800, 7690, 275..."
2109,nyc920319@gmail.com,2017-09-10 04:02:22,30.0,1,Yi Cheng,Combination,"[1150, 1350, 8720, 7740, 7980, 6130, 3400, 275..."
2687,wps-service@hotmail.com,2017-09-10 04:04:44,30.0,2,elainewong,Combination,"[6002, 1350, 6200, 7740, 7870, 6130, 7690, 275..."
604,Stacygan1010@gmail.com,2017-09-10 04:06:05,40.0,2,Stacy,Combination,"[7830, 8510, 6200, 7740, 7860, 6130, 9180, 275..."
379,Lemondropz_3@hotmail.com,2017-09-10 04:06:07,20.0,2,Colleen,Oily,"[7830, 7670, 2040, 7740, 7860, 6130, 7690, 276..."
596,Soh_nee@yahoo.com.sg,2017-09-10 04:06:10,40.0,2,Amanda,Combination,"[1150, 7670, 7820, 7740, 7870, 6130, 7690, 275..."
1034,charmaine_287@hotmail.com,2017-09-10 04:08:00,30.0,2,Charmaine,Combination,"[7830, 1350, 2010, 7740, 7770, 6130, 7690, 276..."
284,Jacquelyncheongemail@gmail.com,2017-09-10 04:09:14,30.0,2,Jacquelyn,Combination,"[7830, 7670, 6200, 7740, 7770, 6130, 7690, 276..."


In [88]:
# Import shopify sales data
sales_data = pd.read_csv("shopify_orders_export_20180207.csv", 
                         low_memory=False, 
                         parse_dates=['Paid at', 'Fulfilled at', 'Created at'])

sales_data_clean = sales_data.drop(sales_data.columns.to_series()[-11:-1], axis=1)
sales_data_clean.dropna(subset=['Email'], axis=0, inplace=True)

sales_data_clean['discover_first_date'] = sales_data_clean['Email'].map(single_row_recommendations.set_index('email')['timestamp'])
sales_data_clean['used_discover_already'] = (sales_data_clean['Created at']> sales_data_clean['discover_first_date']).map({True: "Used Discover", False: "Not yet"})
sales_data_clean['discover_sales_lead_time'] = sales_data_clean['Created at'] - sales_data_clean['discover_first_date']

sales_data_clean =  sales_data_clean.merge(single_row_recommendations, left_on= 'Email', right_on='email',
                                           how='left')
pre_discover_sales = sales_data_clean[sales_data_clean['Created at']< "2017-09-09"]
post_discover_sales = sales_data_clean[sales_data_clean['Created at']>= "2017-09-09"]


In [102]:
post_discover_sales.fillna()

Unnamed: 0,Name,Email,Financial Status,Paid at,Fulfillment Status,Fulfilled at,Accepts Marketing,Currency,Subtotal,Shipping,Taxes,Total,Discount Code,Discount Amount,Shipping Method,Created at,Lineitem quantity,Lineitem name,Lineitem price,Lineitem compare at price,Lineitem sku,Lineitem requires shipping,Lineitem taxable,Lineitem fulfillment status,Billing Name,Billing Street,Billing Address1,Billing Address2,Billing Company,Billing City,Billing Zip,Billing Province,Billing Country,Billing Phone,Shipping Name,Shipping Street,Shipping Address1,Shipping Address2,Shipping Company,Shipping City,Shipping Zip,Shipping Province,Shipping Country,Shipping Phone,Notes,Note Attributes,Cancelled at,Payment Method,Payment Reference,Refunded Amount,Vendor,Outstanding Balance,Employee,Location,Device ID,Id,Tags,Risk Level,Source,Lineitem discount,Phone,discover_first_date,used_discover_already,discover_sales_lead_time,email,timestamp,request.ageRange,request.gender,request.name,request.skinType,SKU,Buy Count
0,191569914712,gilly.glanville@me.com,paid,2018-02-08 03:59:00,fulfilled,2018-02-08 03:59:01,yes,SGD,290.0,0.0,0.0,290.0,5OFFe53b8c2fb9df,5.0,,2018-02-08 03:59:00,1,Resist Skin Restoring Moisturizer SPF 50 - 60ml,48.0,0.0,7970,True,False,fulfilled,Gilly Glanville,,,,,,,,,,,,,,,,,,,,,,,External Credit,c563530104851.1,0.0,Paula's Choice,0.0,Jeremy Tan,Beauty Collective,9.0,1.811065e+11,,Low,pos,0.0,,NaT,Not yet,NaT,,NaT,,,,,,1
5,191569914711,stephaniedata@yahoo.com,paid,2018-02-08 03:39:25,unfulfilled,NaT,no,SGD,38.0,5.5,0.0,43.5,5OFF5359ca0ded5b,5.0,2 Working-Day Delivery,2018-02-08 03:39:25,1,Clear Acne Extra Strength Exfoliating Treatmen...,43.0,,6210,True,False,pending,Stephanie Ng,"6 Napier Road 07-05, Gleneagles Medical Centre...",6 Napier Road 07-05,Gleneagles Medical Centre 07-05,The Dermatology Practice at Gleneagles,Singapore,'258499,,SG,9001 9072,Stephanie Ng,"6 Napier Road 07-05, Gleneagles Medical Centre...",6 Napier Road 07-05,Gleneagles Medical Centre 07-05,The Dermatology Practice at Gleneagles,Singapore,'258499,,SG,9001 9072,,,,Stripe,c531555713043.1,0.0,Paula's Choice,0.0,,,,1.810939e+11,,Low,web,0.0,6.590019e+09,NaT,Not yet,NaT,,NaT,,,,,,6
6,191569914710,hsmeaton@hotmail.com,paid,2018-02-08 00:42:57,unfulfilled,NaT,yes,SGD,48.0,9.0,0.0,57.0,,0.0,Free Shipping above S$99 (ex.Taxes & Fees),2018-02-08 00:42:56,1,Calm Sensitive Daytime Moisturizer SPF 30 (Nor...,48.0,,9130,True,False,pending,Heather Smeaton,"C3-4-1 Pantai Hillpark Phase 1, Jln Pantai Dalam",C3-4-1 Pantai Hillpark Phase 1,Jln Pantai Dalam,,Kuala Lumpur,'59200,KUL,MY,'+60 12-395 5259,Heather Smeaton,"C3-4-1 Pantai Hillpark Phase 1, Jln Pantai Dalam",C3-4-1 Pantai Hillpark Phase 1,Jln Pantai Dalam,,Kuala Lumpur,'59200,KUL,MY,'+60 12-395 5259,,,,Stripe,c563126239251.1,0.0,Paula's Choice,0.0,,,,1.809537e+11,,Low,web,0.0,,NaT,Not yet,NaT,,NaT,,,,,,7
7,191569914709,litepiggy@yahoo.com.sg,paid,2018-02-07 14:58:31,unfulfilled,NaT,yes,SGD,75.0,0.0,0.0,75.0,,0.0,Free Delivery (2 Working Days),2018-02-07 14:58:31,30,Resist Ultra-Light Super Antioxidant Concentra...,2.5,,7749,True,False,pending,Christina Wu,Blk 612D Punggol Drive #14-867,Blk 612D Punggol Drive #14-867,,,Singapore,'824612,,SG,9781 5411,Christina Wu,Blk 612D Punggol Drive #14-867,Blk 612D Punggol Drive #14-867,,,Singapore,'824612,,SG,9781 5411,,,,Stripe,c561475354643.1,0.0,Paula's Choice,0.0,,,,1.803134e+11,,Low,web,0.0,,NaT,Not yet,NaT,,NaT,,,,,,8
8,191569914708,alyssacmy@gmail.com,paid,2018-02-07 13:45:51,unfulfilled,NaT,yes,SGD,42.0,5.5,0.0,47.5,5OFF22c18122b276,5.0,2 Working-Day Delivery,2018-02-07 13:45:51,1,Skin Perfecting 2% BHA (Salicylic Acid) Liquid...,13.0,,2017,True,False,pending,Alyssa Chee,"1 Kaki Bukit View, #04-18 , Techview","1 Kaki Bukit View, #04-18 , Techview",,,Singapore,'415941,,SG,83331068,Alyssa Chee,"1 Kaki Bukit View, #04-18 , Techview","1 Kaki Bukit View, #04-18 , Techview",,,Singapore,'415941,,SG,83331068,,,,Stripe,c561255120915.1,0.0,Paula's Choice,0.0,,,,1.802375e+11,,Low,web,0.0,,NaT,Not yet,NaT,,NaT,,,,,,9
10,191569914707,jglyj82@gmail.com,paid,2018-02-07 13:02:33,unfulfilled,NaT,yes,SGD,82.0,0.0,0.0,82.0,,0.0,Free Delivery (2 Working Days),2018-02-07 13:02:33,1,Skin Balancing Oil-Reducing Cleanser - 237 ml,34.0,0.0,1150,True,False,pending,Joanne Tan,13367 Holly Oak Cir,13367 Holly Oak Cir,,,Cerritos,'90703,CA,US,(562) 404-7973,Joanne Tan,"23 Stevens Drive #03-01, Parc Stevens",23 Stevens Drive #03-01,Parc Stevens,,Singapore,'257914,,SG,9109 2164,,,,Stripe,c561204592659.1,0.0,Paula's Choice,0.0,,,,1.801691e+11,,Low,web,0.0,,2018-02-07 12:54:54,Used Discover,0 days 00:07:39,jglyj82@gmail.com,2018-02-07 12:54:54,40.0,2.0,Joanne Tan,Combination,"[1150, 7670, 2010, 7740, 7870, 7800, 3400, 275...",1
12,191569914706,yeoslfiona@hotmail.com,paid,2018-02-07 11:55:12,fulfilled,2018-02-07 11:55:12,yes,SGD,217.0,0.0,0.0,217.0,,0.0,,2018-02-07 11:55:12,1,Resist for Wrinkles + Breakouts Exfoliant & Mo...,86.0,0.0,4970,True,False,fulfilled,Fiona Yeo,,,,,,,,,,,,,,,,,,,,,,,External Credit,c561082662931.1,0.0,Paula's Choice,0.0,Jeremy Tan,Beauty Collective,9.0,1.801244e+11,,Low,pos,0.0,,NaT,Not yet,NaT,,NaT,,,,,,11
16,191569914705,vinothini.112@gmail.com,paid,2018-02-07 11:33:14,fulfilled,2018-02-07 11:33:14,yes,SGD,34.0,0.0,0.0,34.0,,0.0,,2018-02-07 11:33:14,1,Skin Balancing Pore-Reducing Toner - 190 ml,34.0,0.0,1350,True,False,fulfilled,Vinothini Subramanian,,,,,,,,,,,,,,,,,,,,,,,External Debit,c561050288147.1,0.0,Paula's Choice,0.0,Jeremy Tan,Beauty Collective,9.0,1.801127e+11,,Low,pos,0.0,,NaT,Not yet,NaT,,NaT,,,,,,15
17,191569914704,wailing_93@hotmail.com,paid,2018-02-07 11:25:32,fulfilled,2018-02-07 11:25:32,yes,SGD,84.0,0.0,0.0,84.0,,0.0,,2018-02-07 11:25:32,1,Clear Acne Regular Strength Travel Kit,29.0,0.0,4707,True,False,fulfilled,Wai Ling Choy,,,,,,,,,,,,,,,,,,,,,,,External Credit,c561044914195.1,0.0,Paula's Choice,0.0,Jeremy Tan,Beauty Collective,9.0,1.801082e+11,,Low,pos,0.0,,NaT,Not yet,NaT,,NaT,,,,,,16
19,191569914703,jesse.dytioco@gmail.com,paid,2018-02-07 11:20:49,fulfilled,2018-02-07 11:20:50,yes,SGD,170.0,0.0,0.0,170.0,,0.0,,2018-02-07 11:20:49,1,Calm Repairing Sensitive Serum - 30 ml,51.0,0.0,3700,True,False,fulfilled,Jesse Dytioco,,,,,,,,,,,,,,,,,,,,,,,External Credit,c560960012307.1,0.0,Paula's Choice,0.0,Jeremy Tan,Beauty Collective,9.0,1.801051e+11,,Low,pos,0.0,,NaT,Not yet,NaT,,NaT,,,,,,18


In [106]:
# Now, serialize the transactions done by each customer

post_discover_sales['Buy Count'] = post_discover_sales.sort_values('Created at').dropna(subset= ['Paid at']).groupby(['email'])['Paid at'].cumcount()+1
post_discover_sales['Buy Count'] = post_discover_sales['Buy Count'].ffill()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [114]:
post_discover_sales[post_discover_sales['email'] == 'lavie_ind@yahoo.com']

Unnamed: 0,Name,Email,Financial Status,Paid at,Fulfillment Status,Fulfilled at,Accepts Marketing,Currency,Subtotal,Shipping,Taxes,Total,Discount Code,Discount Amount,Shipping Method,Created at,Lineitem quantity,Lineitem name,Lineitem price,Lineitem compare at price,Lineitem sku,Lineitem requires shipping,Lineitem taxable,Lineitem fulfillment status,Billing Name,Billing Street,Billing Address1,Billing Address2,Billing Company,Billing City,Billing Zip,Billing Province,Billing Country,Billing Phone,Shipping Name,Shipping Street,Shipping Address1,Shipping Address2,Shipping Company,Shipping City,Shipping Zip,Shipping Province,Shipping Country,Shipping Phone,Notes,Note Attributes,Cancelled at,Payment Method,Payment Reference,Refunded Amount,Vendor,Outstanding Balance,Employee,Location,Device ID,Id,Tags,Risk Level,Source,Lineitem discount,Phone,discover_first_date,used_discover_already,discover_sales_lead_time,email,timestamp,request.ageRange,request.gender,request.name,request.skinType,SKU,Buy Count


## Explore basic characteristics of Discover data

In [12]:
# examine number of people who retake the test
# Extract only email and timestamp from discover data
email_timestamp_unique = discover_data[['email', 'timestamp']].drop_duplicates(keep='last')
email_timestamp_unique

test_take_pivot_count = pd.pivot_table(email_timestamp_unique, index= 'email', aggfunc='count')
count_total_submissions = len(email_timestamp_unique)
print('total_submissions_count:', count_total_submissions)
retakers_count = test_take_pivot_count[test_take_pivot_count['timestamp'] >1 ].count()
print('retakers_count:', retakers_count)
print('Percentage of retakers: {0:.3f}%'.format(float(retakers_count/count_total_submissions)))

total_submissions_count: 3435
retakers_count: timestamp    486
dtype: int64
Percentage of retakers: 0.141%


In [10]:
discover_data[['timestamp']].drop_duplicates(keep='last').count()

timestamp    3432
dtype: int64

In [14]:
# Male Female Breakdown
email_gender_unique = discover_data[['email', 'request.gender']].drop_duplicates(keep='last')
male_email_count = len(email_gender_unique[email_gender_unique['request.gender'] == 1])
print('male_email_count:', male_email_count)
female_email_count = len(email_gender_unique[email_gender_unique['request.gender'] == 2])
print('female_email_count:', female_email_count)
female_percentage = female_email_count/(male_email_count + female_email_count)
print('male_percentage: {0:.2f}%'.format(1-female_percentage))
print('female_percentage: {0:.2f}%'.format(female_percentage))

male_email_count: 297
female_email_count: 2163
male_percentage: 0.12%
female_percentage: 0.88%


In [12]:
len(email_gender_unique[email_gender_unique['request.gender'] == 0])

134

In [13]:
# Age range distribution
email_age_unique = discover_data[['email', 'request.ageRange']].drop_duplicates(keep='last')
age_range_breakdown = email_age_unique.groupby('request.ageRange').count() #apply(lambda x: 100 * x / x.sum())
age_range_breakdown['Percentage'] =age_range_breakdown.apply(lambda x: 100 * x / x.sum())
age_range_breakdown # age_range_breakdown['Percentage'] = age_range_breakdown['email'].map(lambda x: 100 * x / x.sum())

Unnamed: 0_level_0,email,Percentage
request.ageRange,Unnamed: 1_level_1,Unnamed: 2_level_1
20.0,142,6.454545
30.0,1113,50.590909
40.0,768,34.909091
60.0,128,5.818182
70.0,49,2.227273


In [14]:
age_range_breakdown.sum()

email         2200.0
Percentage     100.0
dtype: float64

In [15]:
# Calculates how many people DID NOT indicate ageRange

len(email_age_unique[pd.isnull(email_age_unique['request.ageRange']) == True])

373

# Part 2 - Match Discover takers with Sales Data
## Objectives:
### 1. Obtain a list SKUs bought right after they have taken Discover
### 2. Match the list with the skin profile (and any other relevant "features")

In [None]:
# Import shopify sales data
sales_data = pd.read_csv("shopify_orders_export_20180207.csv", 
                         low_memory=False, 
                         parse_dates=['Paid at', 'Fulfilled at', 'Created at'])

sales_data_clean = sales_data.drop(sales_data.columns.to_series()[-11:-1], axis=1)

# remove in-store purchases by dropping NA rows in Email column
sales_data_clean.dropna(subset=['Email'], axis=0, inplace=True)

# add columns referencing each customer's FIRST use of discover
sales_data_clean['discover_first_date'] = sales_data_clean['Email'].map(discover_first.set_index('email')['timestamp2'])
sales_data_clean['used_discover_already'] = (sales_data_clean['Created at']> \
                                             sales_data_clean['discover_first_date']).\
                                                map({True: "Used Discover", False: "Not yet"})
sales_data_clean['discover_sales_lead_time'] = sales_data_clean['Created at'] - sales_data_clean['discover_first_date']

# create sales dataset filtering customers who had used discover
# regardless of whether they are existing or completely new customers
post_discover_sales = sales_data_clean[sales_data_clean['used_discover_already'] == "Used Discover"]

# create sales dataset filtering customers who have never used discover and
# and also those customers' purchase histories right before they use discover
pre_discover_sales = sales_data_clean[~(sales_data_clean['used_discover_already'] == "Used Discover")]


In [70]:
df1 = pd.DataFrame({"Name":["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"],
                    "City":["Seattle","Seattle","Portland","Seattle","Seattle","Portland"],
                   'Value': [0,1,2,3, 4,5]})
df1.pivot_table(index = 'City', columns = 'Name', aggfunc='sum')

Unnamed: 0_level_0,Value,Value,Value
Name,Alice,Bob,Mallory
City,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Portland,,,7.0
Seattle,0.0,5.0,3.0
