# Data Cleaning
Before we put our data into our model we are going to have to do some tweaks to the data. I am going to do all
the data cleaning and preprocessing in this notebook!

## Packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import numpy as np
from PIL import Image
from io import BytesIO
import tensorflow as tf
import ast

In [2]:
## Reading Data
df = pd.read_csv("Products_Updated.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,title,image,price
0,0,Forum 84 Low 'Off White Brown',https://image.goat.com/transform/v1/attachment...,$107
1,1,Forum Low 'White Black',https://image.goat.com/transform/v1/attachment...,$56
2,2,Forum Low 'White Royal Blue',https://image.goat.com/transform/v1/attachment...,$60
3,3,Forum Low 'Chalk White Gum',https://image.goat.com/transform/v1/attachment...,$73
4,4,M&M's x Forum '84 Low 'Brown',https://image.goat.com/transform/v1/attachment...,$114
5,5,Forum Low 'Dark Witch',https://image.goat.com/transform/v1/attachment...,$119
6,6,Forum 84 LG 'White Clear Sky',https://image.goat.com/transform/v1/attachment...,$52
7,7,atmos x Forum Low 'White Black',https://image.goat.com/transform/v1/attachment...,$136
8,8,Forum 84 High 'No Blood No Foul' Jimmy Jazz Ex...,https://image.goat.com/transform/v1/attachment...,$297
9,9,Forum Low 'Cloud White Red',https://image.goat.com/transform/v1/attachment...,$72


In [3]:
# Removing some placeholders and fixing data types
df = df.dropna()
df = df[~df['image'].str.contains('data:image/gif;base64,R0lGODlhAQABAIAAAP///wAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw')]
df = df.drop(df.columns[0], axis=1)
df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)
df

Unnamed: 0,title,image,price
0,Forum 84 Low 'Off White Brown',https://image.goat.com/transform/v1/attachment...,107.0
1,Forum Low 'White Black',https://image.goat.com/transform/v1/attachment...,56.0
2,Forum Low 'White Royal Blue',https://image.goat.com/transform/v1/attachment...,60.0
3,Forum Low 'Chalk White Gum',https://image.goat.com/transform/v1/attachment...,73.0
4,M&M's x Forum '84 Low 'Brown',https://image.goat.com/transform/v1/attachment...,114.0
...,...,...,...
16364,RTFKT x Air Force 1 Low 'Demon',https://image.goat.com/transform/v1/attachment...,932.0
16404,RTFKT x Air Force 1 Low 'Robot',https://image.goat.com/transform/v1/attachment...,1507.0
16405,Air Force 1 Low 'Wear and Tear',https://image.goat.com/transform/v1/attachment...,225.0
16449,Air Force 1 Crater Flyknit 'Wolf Grey' Sample,https://image.goat.com/transform/v1/attachment...,313.0


## Extracting Vectors for our response
Our goal is going to put these prices into a vector but we have to put them into groups before this happens. 
There are two types of the price groups 
1. Unbalanced
 0 - 100
 100 -200
 ...
 1000-inf
2. Balanced
    Quantiles can divide our data up perfectly!
We are going to store each and use our model to see what performs better. 

In [6]:
balanced_df = df.copy()

Code for one hot encoding both groups

In [7]:
bins = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, float('inf')]
labels = ['0-100', '100-200', '200-300', '300-400', '400-500', '500-600', '600-700', '700-800', '800-900', '900+']
df['pg'] = pd.cut(df['price'], bins=bins, labels=labels, right=False)
dummies = pd.get_dummies(df['pg'], prefix='price').astype(int)
df = pd.concat([df, dummies], axis=1)

In [8]:
balanced_df['pg_balanced'], quantile_bins = pd.qcut(balanced_df['price'], 10, retbins=True)
balance_dummies = pd.get_dummies(balanced_df['pg_balanced'], prefix='price').astype(int)
quantile_ranges = [f'({quantile_bins[i]}, {quantile_bins[i+1]})' for i in range(len(quantile_bins)-1)]
balanced_df = pd.concat([balanced_df, balance_dummies], axis=1)
balanced_df
quantile_ranges

['(17.0, 88.0)',
 '(88.0, 107.0)',
 '(107.0, 133.0)',
 '(133.0, 166.0)',
 '(166.0, 206.0)',
 '(206.0, 257.0)',
 '(257.0, 324.0)',
 '(324.0, 434.0)',
 '(434.0, 745.0)',
 '(745.0, 100007.0)']

THESE ONE HOT ENCODED VALUES INTO A NUMPY VECTOR

In [9]:
one_hot_columns = df.columns[df.columns.str.startswith('price_')]
df['UNBALANCED_VECTOR'] = df[one_hot_columns].astype(int).to_numpy().tolist()
df

Unnamed: 0,title,image,price,pg,price_0-100,price_100-200,price_200-300,price_300-400,price_400-500,price_500-600,...,price_100-200.1,price_200-300.1,price_300-400.1,price_400-500.1,price_500-600.1,price_600-700,price_700-800,price_800-900,price_900+,UNBALANCED_VECTOR
0,Forum 84 Low 'Off White Brown',https://image.goat.com/transform/v1/attachment...,107.0,100-200,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Forum Low 'White Black',https://image.goat.com/transform/v1/attachment...,56.0,0-100,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Forum Low 'White Royal Blue',https://image.goat.com/transform/v1/attachment...,60.0,0-100,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Forum Low 'Chalk White Gum',https://image.goat.com/transform/v1/attachment...,73.0,0-100,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,M&M's x Forum '84 Low 'Brown',https://image.goat.com/transform/v1/attachment...,114.0,100-200,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16364,RTFKT x Air Force 1 Low 'Demon',https://image.goat.com/transform/v1/attachment...,932.0,900+,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16404,RTFKT x Air Force 1 Low 'Robot',https://image.goat.com/transform/v1/attachment...,1507.0,900+,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16405,Air Force 1 Low 'Wear and Tear',https://image.goat.com/transform/v1/attachment...,225.0,200-300,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16449,Air Force 1 Crater Flyknit 'Wolf Grey' Sample,https://image.goat.com/transform/v1/attachment...,313.0,300-400,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
one_hot_columns = balanced_df.columns[balanced_df.columns.str.startswith('price_')]
balanced_df['BALANCED_VECTOR'] = balanced_df[one_hot_columns].astype(int).to_numpy().tolist()
balanced_df

Unnamed: 0,title,image,price,pg,price_0-100,price_100-200,price_200-300,price_300-400,price_400-500,price_500-600,...,"price_(88.0, 107.0]","price_(107.0, 133.0]","price_(133.0, 166.0]","price_(166.0, 206.0]","price_(206.0, 257.0]","price_(257.0, 324.0]","price_(324.0, 434.0]","price_(434.0, 745.0]","price_(745.0, 100007.0]",BALANCED_VECTOR
0,Forum 84 Low 'Off White Brown',https://image.goat.com/transform/v1/attachment...,107.0,100-200,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,Forum Low 'White Black',https://image.goat.com/transform/v1/attachment...,56.0,0-100,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
2,Forum Low 'White Royal Blue',https://image.goat.com/transform/v1/attachment...,60.0,0-100,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,Forum Low 'Chalk White Gum',https://image.goat.com/transform/v1/attachment...,73.0,0-100,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
4,M&M's x Forum '84 Low 'Brown',https://image.goat.com/transform/v1/attachment...,114.0,100-200,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16364,RTFKT x Air Force 1 Low 'Demon',https://image.goat.com/transform/v1/attachment...,932.0,900+,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
16404,RTFKT x Air Force 1 Low 'Robot',https://image.goat.com/transform/v1/attachment...,1507.0,900+,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
16405,Air Force 1 Low 'Wear and Tear',https://image.goat.com/transform/v1/attachment...,225.0,200-300,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16449,Air Force 1 Crater Flyknit 'Wolf Grey' Sample,https://image.goat.com/transform/v1/attachment...,313.0,300-400,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


Create A subdf holding our contents

In [11]:
df = df.drop_duplicates(subset='image')
final_df = df[['title', 'image', 'price', 'UNBALANCED_VECTOR']]
final_df = final_df.merge(balanced_df[['image', 'BALANCED_VECTOR']], on='image', how='left')
final_df.to_csv("Final_DF.csv")
final_df

Unnamed: 0,title,image,price,UNBALANCED_VECTOR,BALANCED_VECTOR
0,Forum 84 Low 'Off White Brown',https://image.goat.com/transform/v1/attachment...,107.0,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,Forum Low 'White Black',https://image.goat.com/transform/v1/attachment...,56.0,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
2,Forum Low 'White Royal Blue',https://image.goat.com/transform/v1/attachment...,60.0,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,Forum Low 'Chalk White Gum',https://image.goat.com/transform/v1/attachment...,73.0,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
4,M&M's x Forum '84 Low 'Brown',https://image.goat.com/transform/v1/attachment...,114.0,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
...,...,...,...,...,...
12805,RTFKT x Air Force 1 Low 'Demon',https://image.goat.com/transform/v1/attachment...,932.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
12806,RTFKT x Air Force 1 Low 'Robot',https://image.goat.com/transform/v1/attachment...,1507.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
12807,Air Force 1 Low 'Wear and Tear',https://image.goat.com/transform/v1/attachment...,225.0,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12808,Air Force 1 Crater Flyknit 'Wolf Grey' Sample,https://image.goat.com/transform/v1/attachment...,313.0,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# IMAGE PREPROCESSING
    * So reminder this image is a link, and we need to turn it into a tensor of shape (224,224,3)
    * This code is going slow do the nature of downloading images, but that's okay.
    * We can apply other techniques later!
    

## Extracting to binary
I am going to put everything in a tensflow binary file, csvs do not store multi dimensional 
data well, and its going to cause us some problems in the future! These 3 dimenional numpy arrays will be stored as a string instead of a tensor with the csv
while tensorflow binary can store these images very well!

In [14]:
def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))
def _string_feature(value):
    """Returns a bytes_list from a string / byte."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode()]))
    
def serialize_example(image, title, price, unbalanced_vector, balanced_vector):
    feature = {
        'image': _bytes_feature(tf.io.encode_jpeg(image).numpy()),
        'title': _string_feature(title),
        'price': _float_feature([price]),
        'unbalanced_vector': _int64_feature(unbalanced_vector),
        'balanced_vector': _int64_feature(balanced_vector),
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [15]:
df.dtypes

title                  object
image                  object
price                 float64
pg                   category
price_0-100             int64
price_100-200           int64
price_200-300           int64
price_300-400           int64
price_400-500           int64
price_500-600           int64
price_600-700           int64
price_700-800           int64
price_800-900           int64
price_900+              int64
price_0-100             int64
price_100-200           int64
price_200-300           int64
price_300-400           int64
price_400-500           int64
price_500-600           int64
price_600-700           int64
price_700-800           int64
price_800-900           int64
price_900+              int64
UNBALANCED_VECTOR      object
dtype: object

In [16]:
i = 0
with tf.io.TFRecordWriter('Downloaded_Images_Binary.tfrecords') as writer:
    for _, row in final_df.iterrows():
        image_url = row['image']
        response = requests.get(image_url)
        image = Image.open(BytesIO(response.content))
        image = image.resize((224, 224))
        image = image.convert("RGB")
        resized_image = tf.cast(image, tf.uint8)
        title = str(row['title'])
        if title == "nan":
            continue
        price = row['price']
        unbalanced_vector = row['UNBALANCED_VECTOR']
        balanced_vector = row['BALANCED_VECTOR']
        example = serialize_example(resized_image, title, price, unbalanced_vector, balanced_vector)
        writer.write(example)
        i += 1
        if i % 100 == 0:
            print(i)
        

KeyboardInterrupt: 