# Part 1 - Obtaining and formatting Discover Data

In [1]:
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option("display.max_columns", 100)

In [2]:
# import discover data, and filter for list of emails according to their earliest use of discover
discover_data = pd.read_csv("discover_print_to_csv.csv")
discover_data['timestamp'] = discover_data['timestamp'].map(pd.to_datetime)

# In request.concerns column, both sensitive skin status and skin types are appended together with user's concerns.
# Need to use pandas apply to remove those entries

# However, we need to first convert each element in the request.concerns from "strings (of list)" to "list" type 

# In order to do that, we need to fill blank entries with an empty list in order for the function ast.literal_eval() 
# to be passed through successfully

columns_to_convert_list = ['concerns_addressed','match','request.concerns','reviews']

for column in columns_to_convert_list:
    
    discover_data[column] = discover_data[column].fillna("[]").map(lambda x: ast.literal_eval(x))

def remove_skin_types(x):
    terms_to_delete = ['Sensitivity', 'Combination','Dry to Very Dry', 'Normal', 'Oily']
    replacement_list = []
    for entry in x:
        if entry in terms_to_delete:
            del entry
        else:
            replacement_list.append(entry)
    return replacement_list
discover_data['request.concerns'] = discover_data['request.concerns'].map(remove_skin_types)

# remove

# print(type(discover_data['timestamp'][0]))
# discover_data.to_csv('firebase_discover_formatted.csv')
# discover_data['request.gender'].value_counts()
discover_data.head()

Unnamed: 0,Serial,Name,SKU,auto_include,concerns_addressed,concerns_addressed_count,email,handle,match,min_req,rating,request.ageRange,request.concerns,request.gender,request.name,request.sensitivity,request.skinType,reviews,timestamp,type
0,0,Earth Sourced Gentle Cleansing Gel,8500,,[Sensitivity],1,yaminnphyu@mail.com,earth-sourced-perfectly-natural-cleansing-gel,[Sensitivity],['Sensitivity'],95,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 15, 12, 11, 31, 96]",2017-09-10 03:27:26,Cleanser
1,1,Earth Sourced Purely Natural Refreshing Toner,8510,,"[Dehydration, Sensitivity]",1,yaminnphyu@mail.com,earth-sourced-purely-natural-refreshing-toner,[Sensitivity],['Sensitivity'],83,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 9, 5, 7, 22, 76]",2017-09-10 03:27:26,Toner
2,2,Clear Regular Strength Anti-Redness Exfoliatin...,6200,,"[Clogged Pores, Uneven Texture, Enlarged Pores...",5,yaminnphyu@mail.com,clear-regular-strength-anti-redness-exfoliatin...,"[Enlarged Pores, PIH, Redness, Sensitivity, Acne]",['Acne'],85,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 16, 20, 23, 47, 214]",2017-09-10 03:27:26,Exfoliant
3,3,Resist Ultra-Light Super Antioxidant Concentra...,7740,,"[Dullness, Wrinkles, PIH, Enlarged Pores, Unev...",8,yaminnphyu@mail.com,resist-ultra-light-super-antioxidant-concentra...,"[Dullness, PIH, Enlarged Pores, Oily, Dullness...","['Dehydration', 'Clogged Pores', 'Enlarged Por...",79,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 20, 27, 23, 37, 248]",2017-09-10 03:27:26,Serum
4,4,Resist C15 Super Booster,7770,,"[Dehydration, PIH, Redness, Uneven Texture, Wr...",4,yaminnphyu@mail.com,resist-c15-super-booster,"[PIH, Redness, Dullness, Sun Damage]",['None'],74,30.0,"[Enlarged Pores, Acne, Sun Damage, Redness, Du...",2,Yaminn Phyu,True,Oily,"[None, 53, 60, 65, 84, 427]",2017-09-10 03:27:26,Booster


## Explore basic characteristics of Discover data

In [5]:
discover_data['request.gender'].value_counts()

2    35518
1     4990
0     1963
Name: request.gender, dtype: int64

In [12]:
# examine number of people who retake the test

email_timestamp_unique = discover_data[['email', 'timestamp']].drop_duplicates(keep='last')
email_timestamp_unique

test_take_pivot_count = pd.pivot_table(email_timestamp_unique, index= 'email', aggfunc='count')
count_total_submissions = len(email_timestamp_unique)
print('total_submissions_count:', count_total_submissions)
retakers_count = test_take_pivot_count[test_take_pivot_count['timestamp'] >1 ].count()
print('retakers_count:', retakers_count)
print('Percentage of retakers: {0:.3f}%'.format(float(retakers_count/count_total_submissions)))

total_submissions_count: 3435
retakers_count: timestamp    486
dtype: int64
Percentage of retakers: 0.141%


In [10]:
discover_data[['timestamp']].drop_duplicates(keep='last').count()

timestamp    3432
dtype: int64

In [14]:
# Male Female Breakdown
email_gender_unique = discover_data[['email', 'request.gender']].drop_duplicates(keep='last')
male_email_count = len(email_gender_unique[email_gender_unique['request.gender'] == 1])
print('male_email_count:', male_email_count)
female_email_count = len(email_gender_unique[email_gender_unique['request.gender'] == 2])
print('female_email_count:', female_email_count)
female_percentage = female_email_count/(male_email_count + female_email_count)
print('male_percentage: {0:.2f}%'.format(1-female_percentage))
print('female_percentage: {0:.2f}%'.format(female_percentage))

male_email_count: 297
female_email_count: 2163
male_percentage: 0.12%
female_percentage: 0.88%


In [12]:
len(email_gender_unique[email_gender_unique['request.gender'] == 0])

134

In [13]:
# Age range distribution
email_age_unique = discover_data[['email', 'request.ageRange']].drop_duplicates(keep='last')
age_range_breakdown = email_age_unique.groupby('request.ageRange').count() #apply(lambda x: 100 * x / x.sum())
age_range_breakdown['Percentage'] =age_range_breakdown.apply(lambda x: 100 * x / x.sum())
age_range_breakdown # age_range_breakdown['Percentage'] = age_range_breakdown['email'].map(lambda x: 100 * x / x.sum())

Unnamed: 0_level_0,email,Percentage
request.ageRange,Unnamed: 1_level_1,Unnamed: 2_level_1
20.0,142,6.454545
30.0,1113,50.590909
40.0,768,34.909091
60.0,128,5.818182
70.0,49,2.227273


In [14]:
age_range_breakdown.sum()

email         2200.0
Percentage     100.0
dtype: float64

In [15]:
# Calculates how many people DID NOT indicate ageRange

len(email_age_unique[pd.isnull(email_age_unique['request.ageRange']) == True])

373