## Clean Up Watch Description Data for Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy

pd.set_option("display.max_columns",None)
%matplotlib inline

"Main" Watch Info

In [2]:
# Load the "main" listing table for each gender segment from its CSV file.
mens = pd.read_csv(filepath_or_buffer='watch_page_list_mens.csv')
womens = pd.read_csv(filepath_or_buffer='watch_page_list_womens.csv')

Additional Info from Watch's Page

In [3]:
# Load the per-watch-page detail table for each gender segment.
# These are joined to the main listing tables on 'url' further down.
mens_additional = pd.read_csv(filepath_or_buffer='additional_stats_mens.csv')
womens_additional = pd.read_csv(filepath_or_buffer='additional_stats_womens.csv')

Approximate Number of Unique Men's Watches

In [4]:
# Number of distinct image URLs among the men's listings.
# dropna=False counts a missing URL (if any) as one distinct value,
# which is what len(Series.unique()) does as well.
mens['image_url'].nunique(dropna=False)

6501

__Explore Product Details__ 

Combine all data into a single DF

In [5]:
# Join each listing table to its detail table on the shared 'url' column.
# how='inner' keeps only the URLs that appear in both tables.
mens_combo = mens.merge(mens_additional, how='inner', on='url')
womens_combo = womens.merge(womens_additional, how='inner', on='url')

# Add a Gender Label
mens_combo['gender'] = 'mens'
womens_combo['gender'] = 'womens'

# Stack the two frames into one. DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0, so pd.concat is used instead.
# ignore_index=True gives the result a unique 0..n-1 row index rather than
# repeating each input frame's own row labels.
combo = pd.concat([mens_combo, womens_combo], ignore_index=True)

Exclude Listings Without Photo

In [6]:
# Listings with no photo all point at this shared "picture unavailable" image;
# keep only the rows whose image_url is something else.
placeholder_url = 'https://www.prestigetime.com/images/watches/pic-unavailable_main.jpg'
has_own_photo = combo['image_url'] != placeholder_url
combo = combo.loc[has_own_photo, :]

Exclude "Erotic" Watches with blocked out photos

In [7]:
# image_name values of the listings to exclude from the data set.
bad_list = [
    'PerreletTurbine44mmA40204TURBINEEROTIC',
    'PerreletTurbine44mmA40211TURBINEEROTIC',
]

# Keep every row whose image_name is NOT in the exclusion list.
is_excluded = combo['image_name'].isin(bad_list)
combo = combo.loc[~is_excluded, :]

Exclude watches that have the exact same picture as another watch

In [8]:
# Keep exactly one intact row per distinct image_url.
#
# GroupBy.first() is deliberately avoided here: it returns the first *non-null*
# value of each column independently, so when the first listing for an image
# has a missing field, the resulting row silently borrows that field from a
# different watch that shares the picture. drop_duplicates keeps whole rows.
combo = (
    combo
    .dropna(subset=['image_url'])                 # groupby discarded NaN keys; keep doing so
    .sort_values('image_url', kind='mergesort')   # stable sort: rows ordered by URL, first listing stays first
    .drop_duplicates(subset='image_url', keep='first')
    .reset_index(drop=True)
)

# Preserve the previous column layout, in which the grouping key came first.
combo = combo[['image_url'] + [col for col in combo.columns if col != 'image_url']]

### Let's Make Simplified Product Descriptions

#### Create Additional labels

In [9]:
# 'Band Material' labels that count as a strap when the is_strap indicator
# is built; any other band material gets 0.
strap_words = [
    'Alligator & Rubber',
    'Alligator/Crocodile Leather',
    'Calfskin Leather',
    'Fabric',
    'Leather',
    'Ostrich',
    'Python Leather',
    'Rubber',
    'Satin',
    'Stingray',
]

For now, white gold and platinum will be considered "stainless steel". Watches with any yellow or rose gold will be considered "gold."

In [10]:
# 'Case Material' labels treated as gold: anything containing yellow or rose
# gold, including two-tone and plated cases.
gold_words = [
    '18kt Rose Gold',
    'Stainless Steel & Rose Gold',
    '18kt Yellow Gold',
    '18kt Everose Gold',
    'Gold Plated',
    'Stainless Steel & Plated Rose Gold',
    'Stainless Steel & Plated Yellow Gold',
    'Stainless Steel & Gold',
    # Previously missing: this label was only in silver_words, although every
    # other steel-and-gold combination is listed as gold too.
    'Stainless Steel & Yellow Gold',
    'Titanium & Rose Gold',
    '18kt Honey Gold',
    '18kt Rose & White Gold',
    'Ceramic and Gold',
]

# 'Case Material' labels treated as silver-toned (steel, titanium, white gold,
# platinum). Two-tone labels appear in both lists on purpose, so such a watch
# gets both indicators.
# NOTE(review): '18kt Rose & White Gold' is listed as gold only; if white gold
# should count as silver-toned here as well, add it below - confirm intent.
silver_words = [
    'Stainless Steel',
    'Stainless Steel & Gold',
    'Stainless Steel & Plated Rose Gold',
    'Stainless Steel & Plated Yellow Gold',
    'Stainless Steel & Rose Gold',
    'Stainless Steel & Yellow Gold',
    'Stainless Steel Brushed & Polished',
    'Titanium',
    'Titanium & Platinum',
    'Titanium & Rose Gold',
    'Titanium & Steel',
    'Titanium & White Gold',
    '18kt White Gold',
    'PVD & Steel',
    'Platinum',
]

In [11]:
# 0/1 indicators for case material: 1 when the row's 'Case Material' is one of
# the listed labels, 0 otherwise (a missing material matches neither list).
case_material = combo['Case Material']
combo['is_gold'] = case_material.isin(gold_words).astype('int64')
combo['is_silver'] = case_material.isin(silver_words).astype('int64')

Strap or Bracelet

In [12]:
# 1 when the band material is one of the strap labels, 0 for anything else
# (including a missing band material).
band_material = combo['Band Material']
combo['is_strap'] = band_material.isin(strap_words).astype('int64')

Let's do the same for shapes

In [13]:
# Indicator Variables for Shape
square_words = ['square','rectangle','tonneau']
combo['is_square'] = [1 if shape in square_words else 0 for shape in combo['Case Shape']]
combo['is_round'] = [1 if shape == 'Round' else 0 for shape in combo['Case Shape']]

Rotating or Fixed Bezel

In [14]:
# 'Bezel' labels grouped by behaviour. Only the rotating list is used below;
# the fixed list is kept alongside it for reference.
fixed_bezel_words = ['None', 'Fixed']
rotating_bezel_words = [
    'Bi-Directional Rotating',
    'Uni-Directional Rotating',
]

# 1 when the bezel label is one of the rotating kinds, 0 otherwise.
bezel = combo['Bezel']
combo['is_rotating'] = bezel.isin(rotating_bezel_words).astype('int64')

Chronograph

In [15]:
# Replace missing complications with the text 'None' so the column holds only
# strings for the substring test that follows.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form triggers a FutureWarning in
# pandas 2.2 and leaves `combo` unchanged once Copy-on-Write is enabled.
combo['Complications'] = combo['Complications'].fillna('None')

In [16]:
# 1 when the complications text contains the substring 'Chronograph', else 0.
# Every value is expected to be a string here (missing entries are filled with
# 'None' beforehand); a non-string value would raise, as it did before.
combo['is_chronograph'] = combo['Complications'].map(
    lambda text: 1 if 'Chronograph' in text else 0
)

Gender of Watch

In [17]:
# 1 for rows labelled 'mens' in the gender column, 0 for everything else.
is_mens = combo['gender'] == 'mens'
combo['is_male'] = is_mens.astype('int64')

__Make Hashed File Names__

My workaround for using "flow_from_directory" in Keras for multilabel data requires that the pictures are in order. However, this is problematic because each batch will be highly correlated with itself (file names are alphabetical by brand, and brands tend to have consistent designs).

Thus, I will randomize the file names while keeping track of which new name corresponds to which watch.

In [18]:
# Fixed seed so a Restart & Run All reproduces the same old-name -> new-name
# mapping; otherwise a re-run would no longer match image files that were
# already renamed from an earlier run.
NAME_SEED = 42
name_rng = np.random.default_rng(NAME_SEED)

# Candidate prefixes: the strings '0' .. '9999'.
chars = list(map(str, range(0, 10000)))

# Draw one random prefix per watch in a single vectorised call. The previous
# per-row np.random.choice(chars) re-converted the 10,000-element list to an
# array on every call.
prefixes = name_rng.choice(chars, size=len(combo))

# New name = '<random prefix>_<original image_name>'. The original name is kept
# as a suffix, so a new name can still be traced back to its watch.
new_names = [
    str(prefix) + '_' + name
    for prefix, name in zip(prefixes, combo['image_name'])
]
combo['new_image_name'] = new_names

### Save Combined Data Frame

In [21]:
# Write the combined frame, indicator columns included, to CSV.
# index=False leaves the row index out of the file.
output_csv = 'all_watch_info_with_indicators.csv'
combo.to_csv(output_csv, index=False)