In [1]:
import numpy 
import glob

import pandas as pd 
import re

## Define functions

In [2]:
def get_user_chars(row):
    '''
    Input: Named row tuple from dataframe (not a series); only `row.user_specs`
           is read, and it is expected to be a free-text string such as
           "Eye Color Blue Hair color Brunette Skin Tone Light"
    
    Output: Tuple of (skin tone, eye color, hair color) -- note the order.
            For each characteristic:
              * the word following its label, when the label yields exactly one match
              * a placeholder ('reptilian', 'rainbow', 'invisible' respectively)
                when the label does not occur in the specs at all
              * NaN (after printing a warning) when the label occurs but does not
                yield exactly one match
    '''
    # Get specs of user
    user_specs_list = row.user_specs

    # A non-string value (e.g. the NaN pandas uses for an empty cell) carries no
    # specs; treat it like an empty string so every label counts as "not provided"
    # instead of raising TypeError on the `in` checks
    if not isinstance(user_specs_list, str):
        user_specs_list = ''

    user_eye_color = _extract_user_spec(
        user_specs_list, "Eye Color", 'rainbow',
        'More than one eye color found for a user!')
    user_hair_color = _extract_user_spec(
        user_specs_list, "Hair color", 'invisible',
        'More than one hair color found for a user!')
    user_skin_tone = _extract_user_spec(
        user_specs_list, "Skin Tone", 'reptilian',
        'More than one skin tone found for a user!')

    return user_skin_tone, user_eye_color, user_hair_color


def _extract_user_spec(specs, label, absent_value, warning):
    '''
    Input: specs string, the label to look for, the placeholder to return when the
           label is absent, and the warning to print when the match is ambiguous
    
    Output: The single word (letters only) that follows `label` in `specs`
    '''
    if label not in specs:
        return absent_value

    matches = re.findall(re.escape(label) + " [A-Za-z]*", specs)
    if len(matches) == 1:
        return matches[0].split(' ')[-1]

    # Reached for zero matches (label present but not followed by a space) as well
    # as for several matches. Previously this path left the value unbound and the
    # caller crashed with UnboundLocalError; NaN marks the value as unknown.
    print(warning)
    return float('nan')

def trim_color_val(row):
    '''
    Input: Named row tuple from dataframe (not a series); reads `review_color`,
           `review_color_img` and (for the warning message only) `product_name`
    
    Output: Reviewed shade name:
              * the text after 'Color:' when review_color is a string containing it
              * 'RGB' when review_color is missing but a swatch image link exists
              * None otherwise (a message is printed for the cases worth checking)
    ''' 
    # Get review color value
    color_raw = row.review_color

    if isinstance(color_raw, str):
        if "Color:" in color_raw:
            # Retrieve portion after 'Color:'
            return color_raw.split('Color:')[1]
        print('Non-conforming color string!')
        return None

    # NaN never compares equal to anything (including itself), so missing values
    # must be detected with pd.isna rather than `== nan` / `!= nan`
    if pd.isna(color_raw):
        # Sometimes this happens for products with only a single shade
        # Check if this row has a swatch image link
        if not pd.isna(row.review_color_img):
            # Call it RGB for now (will update this when we analyze color swatches)
            return 'RGB'
        print('No shade or image! Manually check {}'.format(row.product_name))

    return None

def trim_star_rating(row):
    '''
    Input: Named row tuple from dataframe (not a series); only `row.rating` is
           read, expected to look like '5 stars' / '1 star'
    
    Output: The leading number of the rating as a digit-only string (e.g. '5'),
            or None (after printing a message) when the rating is not a string,
            does not mention 'star', or does not start with digits
    ''' 
    # Get review star rating
    rating_raw = row.rating

    # Non-string values (e.g. NaN for an empty cell) cannot contain a star rating;
    # checking the type first avoids a TypeError on the `in` test
    if not isinstance(rating_raw, str) or 'star' not in rating_raw:
        print('No star rating found!')
        return None

    # Retrieve string before 'star[s]'
    rating = rating_raw.split(' ')[0]

    # isdigit must be *called*: the bare method object is always truthy, which
    # let non-numeric prefixes through unchecked
    if rating.isdigit():
        return rating

    print('No number in rating found!')
    return None

## Load data and inspect quickly 

In [3]:
# Read the meta file of the blush data set into its own dataframe
blush_meta = pd.read_csv(filepath_or_buffer='./data/blush/blush.csv')

In [4]:
# glob.glob returns its matches in arbitrary, filesystem-dependent order; sorting
# keeps the row order of the concatenated dataframe reproducible between runs
blush_files = sorted(glob.glob('./data/blush/Blush_*.csv'))
len(blush_files)

117

In [5]:
# One dataframe per blush file, kept in the same order as blush_files
blush_raw = [pd.read_csv(file) for file in blush_files]

In [6]:
# concatenate into one dataframe
# Every per-file frame carries its own 0..n row labels; ignore_index=True replaces
# them with one unique running index so that later index-based alignment
# (e.g. merging on the index) matches each row only with itself
all_blush_raw = pd.concat(blush_raw, ignore_index=True)

In [7]:
# Number of rows across all blush files
all_blush_raw.shape[0]

8443

## Edit dataframe contents

### Flesh out user specs

In [8]:
# One accumulator list per user characteristic; these become dataframe columns later
skin_tones, eye_colors, hair_colors = [], [], []

# get_user_chars (defined above) yields (skin tone, eye color, hair color) per row,
# so the target lists are paired with its result in that same order
for row in all_blush_raw.itertuples():
    for bucket, value in zip((skin_tones, eye_colors, hair_colors), get_user_chars(row)):
        bucket.append(value)
    

In [9]:
# assign() returns a copy of the original data frame with the new columns attached,
# leaving all_blush_raw itself untouched
blush_df_copy = all_blush_raw.assign(
    eye_color=eye_colors,
    hair_color=hair_colors,
    skin_tone=skin_tones,
)

In [10]:
# The raw 'user_specs' text has been split into separate columns, so leave it out
# of the new working dataframe
blush_df = blush_df_copy.drop(columns=['user_specs'])

In [11]:
# Users without a self-reported skin tone carry the 'reptilian' placeholder.
# (Disclosing skin tone wasn't always an option for older products.)
# Count them, report, then keep only the rows with a real skin tone.
reptilians = int((blush_df['skin_tone'] == 'reptilian').sum())
print('Removing {0:d} out of {1:d} rows!'.format(reptilians, len(blush_df)))
blush_df = blush_df.loc[blush_df['skin_tone'] != 'reptilian']

Removing 627 out of 8443 rows!


In [12]:
# Distribution of the remaining self-reported skin tones
blush_df.loc[:, 'skin_tone'].value_counts()

Light        2277
Fair         1873
Medium       1667
Olive         550
Porcelain     535
Tan           467
Deep          277
Dark          148
Ebony          22
Name: skin_tone, dtype: int64

### Trim product color values

In [13]:
# start with an empty list that will hold the replacement 'review_color' values
new_review_colors = list()

In [14]:
# use function defined above to extract shade name
# The list is built from scratch inside this cell: appending to a list created in
# another cell meant that running this cell a second time doubled the list and
# made the column assignment fail with a length mismatch
new_review_colors = [trim_color_val(row) for row in blush_df.itertuples()]

# replace values in existing 'review_color' column
# assign() returns a new dataframe instead of writing into the filtered blush_df,
# which sidesteps the SettingWithCopyWarning pandas can emit for such writes
blush_df = blush_df.assign(review_color=new_review_colors)

### Trim star ratings

In [15]:
# start with an empty list that will hold the replacement 'rating' values
new_rating = list()

In [16]:
# use function defined above to extract number of stars
# The list is built from scratch inside this cell: appending to a list created in
# another cell meant that running this cell a second time doubled the list and
# made the column assignment fail with a length mismatch
new_rating = [trim_star_rating(row) for row in blush_df.itertuples()]

# replace values in existing 'rating' column
# assign() returns a new dataframe instead of writing into the filtered blush_df,
# which sidesteps the SettingWithCopyWarning pandas can emit for such writes
blush_df = blush_df.assign(rating=new_rating)

In [17]:
# Peek at the first rows after trimming colors and ratings
blush_df.head(n=5)

Unnamed: 0,product_category,brand,product_name,user_id,BIP_level,BIC_level,rating,review_date,verified,review_color,review_color_img,review_title,review_content,no_helpful,no_not_helpful,recommendation,review_product_color,eye_color,hair_color,skin_tone
0,blush,MELT COSMETICS,Melt Cosmetics Blushlight,pearlember,Insider,Rookie,5,23 d ago,yes,Electra,https://www.sephora.com/productimages/sku/s223...,,"Love the pigmentation, love how it lasts all d...",0,0,Recommends this product,,Blue,Brunette,Porcelain
1,blush,MELT COSMETICS,Melt Cosmetics Blushlight,Cleareyesfullbr,Rouge,Rookie,5,7 Feb 2020,no,Lynx,https://www.sephora.com/productimages/sku/s222...,Glowy and pretty!,This is so easy to apply and blend! The formul...,0,0,Recommends this product,,Brown,Brunette,Light
2,blush,MELT COSMETICS,Melt Cosmetics Blushlight,hulkmogan,Rouge,Rookie,5,20 Jan 2020,yes,Sundown,https://www.sephora.com/productimages/sku/s222...,,Gorgeous glowy blush. Great buildable formula....,0,0,Recommends this product,,Brown,Brunette,Deep
3,blush,MELT COSMETICS,Melt Cosmetics Blushlight,Cudsyskeeper,,,4,25 Nov 2019,yes,Sundown,https://www.sephora.com/productimages/sku/s222...,,Great Blush for olive skin. I purchased this s...,1,0,Recommends this product,,Brown,Brunette,Olive
4,blush,MELT COSMETICS,Melt Cosmetics Blushlight,erica0000,VIB,Rookie,5,16 Nov 2019,yes,Nevermore,https://www.sephora.com/productimages/sku/s222...,New favorite blush,My only complaint is that they don't make more...,2,0,Recommends this product,,Brown,Brunette,Medium


In [18]:
# sometimes this extra column slips into scraped data (hopefully fixed!)
# drop() returns a new dataframe, so the result must be assigned back for the
# column to actually disappear. errors='ignore' makes this a no-op when the column
# is absent, replacing the bare try/except that would also have hidden any
# unrelated error.
blush_df = blush_df.drop(columns='review_product_color', errors='ignore')

### Replace categorical variables with dummy/indicator variables

In [19]:
# One indicator (0/1) dataframe per categorical user characteristic:
# eye color, skin tone and hair color, in that order
dummy_eye_color, dummy_skin_tone, dummy_hair_color = (
    pd.get_dummies(blush_df[column])
    for column in ('eye_color', 'skin_tone', 'hair_color')
)

In [20]:
# BIP = Beauty Insider Program status, essentially a measure of how frequently the
# user purchases. Tiers per the original note, highest first:
# Rouge (>$1000/yr), VIB (>$350/yr), Insider.
# NOTE(review): the original note gave ">$350/yr" for Insider as well, identical to
# VIB -- this looks like a typo; confirm the real threshold.
# The numeric codes grow with the tier.
BIP_dict = {tier: code for code, tier in enumerate(('Insider', 'VIB', 'Rouge'))}

# BIC = Beauty Insider Community status, a measure of engagement in Sephora's
# community. Highest first: Boss, then Rookie (I and II).
BIC_dict = dict(Rookie=0, Boss=1)

# Whether Sephora can confirm the user purchased the product through their site
verified_dict = dict(yes=1, no=0)

# Whether the user recommends the product; the empty string stands for
# "no recommendation text"
rec_dict = {
    'Recommends this product': 1,
    '': 0,
}

In [None]:
# Merge dummy dataframes into existing one
#
# An index-on-index merge pairs every row with *all* rows sharing its index label.
# If blush_df's index contains repeated labels (pd.concat of the per-file frames
# keeps each file's own 0..n labels unless ignore_index is used), that turns each
# merge into a many-to-many join and multiplies the rows. The dummy frames were
# built from blush_df row by row, so they are aligned by position: merge on a
# temporary unique positional index and restore the original labels afterwards.
original_index = blush_df.index
merged = blush_df.reset_index(drop=True)

for dummies in (dummy_eye_color, dummy_hair_color, dummy_skin_tone):
    # Positional alignment is only valid when both frames have the same rows
    assert len(dummies) == len(merged), 'dummy frame is not row-aligned with blush_df'
    merged = merged.merge(dummies.reset_index(drop=True), left_index=True, right_index=True)

merged.index = original_index
blush_df = merged

In [None]:
# Peek at the first rows with the indicator columns attached
blush_df.head(n=5)