## Recommendation System

In [1]:
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack
import plotly
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout

plotly.offline.init_notebook_mode(connected=True)
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_json('tops_fashion.json')

In [3]:
print('Number of data points:', data.shape[0])
print('Number of features/variables:', data.shape[1])
data.head()

Number of data points: 183138
Number of features/variables: 19


Unnamed: 0,sku,asin,product_type_name,formatted_price,author,color,brand,publisher,availability,reviews,large_image_url,availability_type,small_image_url,editorial_review,title,model,medium_image_url,manufacturer,editorial_reivew
0,,B016I2TS4W,SHIRT,,,,FNC7C,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Minions Como Superheroes Ironman Women's O Nec...,Minions Como Superheroes Ironman Long Sleeve R...,,https://images-na.ssl-images-amazon.com/images...,,
1,,B01N49AI08,SHIRT,,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Izo Tunic,,https://images-na.ssl-images-amazon.com/images...,,
2,,B01JDPCOHO,SHIRT,,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Won Top,,https://images-na.ssl-images-amazon.com/images...,,
3,,B01N19U5H5,SHIRT,,,,Focal18,,,"[True, https://www.amazon.com/reviews/iframe?a...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,100% Brand New & Fashion<br> Quantity: 1 Piece...,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,,https://images-na.ssl-images-amazon.com/images...,,
4,,B004GSI2OS,SHIRT,$26.26,,Onyx Black/ Stone,FeatherLite,,Usually ships in 6-10 business days,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,now,https://images-na.ssl-images-amazon.com/images...,,Featherlite Ladies' Long Sleeve Stain Resistan...,,https://images-na.ssl-images-amazon.com/images...,,


In [4]:
data.columns

Index(['sku', 'asin', 'product_type_name', 'formatted_price', 'author',
       'color', 'brand', 'publisher', 'availability', 'reviews',
       'large_image_url', 'availability_type', 'small_image_url',
       'editorial_review', 'title', 'model', 'medium_image_url',
       'manufacturer', 'editorial_reivew'],
      dtype='object')

In [5]:
data = data[['asin','brand','color','medium_image_url','product_type_name','title','formatted_price',]]

In [6]:
data.shape

(183138, 7)

In [7]:
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Minions Como Superheroes Ironman Long Sleeve R...,
1,B01N49AI08,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Izo Tunic,
2,B01JDPCOHO,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Won Top,
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,Featherlite Ladies' Long Sleeve Stain Resistan...,$26.26


#### Exploratory Data Analysis

In [8]:
for col in data.columns:
    print('##### Basic stats for feature:', col, '#####')
    print(data[col].describe())
    print(data[col].unique())
    product_type_count = Counter(list(data[col]))
    print(product_type_count.most_common(10))
    print()

##### Basic stats for feature: asin #####
count         183138
unique        183138
top       B074FTYJQC
freq               1
Name: asin, dtype: object
['B016I2TS4W' 'B01N49AI08' 'B01JDPCOHO' ... 'B075756PGC' 'B074L8FVTT'
 'B074FTYJQC']
[('B016I2TS4W', 1), ('B01N49AI08', 1), ('B01JDPCOHO', 1), ('B01N19U5H5', 1), ('B004GSI2OS', 1), ('B00TAEHGGS', 1), ('B012YX2ZPI', 1), ('B074GPTP3J', 1), ('B06Y2LCC5S', 1), ('B00T9W4E66', 1)]

##### Basic stats for feature: brand #####
count     182987
unique     10577
top         Zago
freq         223
Name: brand, dtype: object
['FNC7C' 'FIG Clothing' 'Focal18' ... 'Z' "Rain's Pan Jacket"
 'FFLMYUHULIU']
[('Zago', 223), ('XQS', 222), ('Yayun', 215), ('YUNY', 198), ('XiaoTianXin-women clothes', 193), ('Generic', 192), ('Boohoo', 190), ('Alion', 188), ('Abetteric', 187), ('TheMogan', 187)]

##### Basic stats for feature: color #####
count     64956
unique     7380
top       Black
freq      13207
Name: color, dtype: object
[None 'Onyx Black/ Stone' 'Grape'

In [9]:
# delete data points with null price and null color values
data = data.loc[~data['formatted_price'].isnull()]
data = data.loc[~data['color'].isnull()]
print('Number of data points after deleting those that have null price or null color:', data.shape[0])

Number of data points after deleting those that have null price or null color: 28385


### Deleting duplicates

In [10]:
# find data points with duplicate titles
print(sum(data.duplicated('title')))

2325


Delete rows with title length shorter than 4

In [11]:
data = data[data['title'].apply(lambda x: 4<len(x.split()))]
print('After removing products with short titles, we have:', data.shape[0])

After removing products with short titles, we have: 27949


Sort data points in descending alphabetical order of title

In [12]:
data.sort_values('title', inplace=True, ascending=False)
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
61973,B06Y1KZ2WB,Éclair,Black/Pink,https://images-na.ssl-images-amazon.com/images...,SHIRT,Éclair Women's Printed Thin Strap Blouse Black...,$24.99
133820,B010RV33VE,xiaoming,Pink,https://images-na.ssl-images-amazon.com/images...,SHIRT,xiaoming Womens Sleeveless Loose Long T-shirts...,$18.19
81461,B01DDSDLNS,xiaoming,White,https://images-na.ssl-images-amazon.com/images...,SHIRT,xiaoming Women's White Long Sleeve Single Brea...,$21.58
75995,B00X5LYO9Y,xiaoming,Red Anchors,https://images-na.ssl-images-amazon.com/images...,SHIRT,xiaoming Stripes Tank Patch/Bear Sleeve Anchor...,$15.91
151570,B00WPJG35K,xiaoming,White,https://images-na.ssl-images-amazon.com/images...,SHIRT,xiaoming Sleeve Sheer Loose Tassel Kimono Woma...,$14.32


Delete rows that have titles with only last 2 words being different

In [13]:
def stage1_dedupe(data):
    import itertools
    
    indices = []
    for i,row in data.iterrows():
        indices.append(i)
        
    stage1_dedupe_asins = []
    i = 0
    j = 0
    n = data.shape[0]
    while i < n and j < n:
        
        prev_i = i

        # store the list of words of ith string in a, ex: a = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']
        i_words = data['title'].loc[indices[i]].split()

        # search for the similar products sequentially 
        j = i+1
        while j < n:

            # store the list of words of jth string in b, ex: b = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'Small']
            j_words = data['title'].loc[indices[j]].split()

            # store the maximum length among the two titles at i and j
            length = max(len(i_words), len(j_words))

            # count is used to store the number of words that are matched in both strings
            count  = 0

            # itertools.zip_longest(a,b): will map the corresponding words in both strings, it will appened None in case of unequal strings
            # example: a =['a', 'b', 'c', 'd']
            # b = ['a', 'b', 'd']
            # itertools.zip_longest(a,b): will give [('a','a'), ('b','b'), ('c','d'), ('d', None)]
            for k in itertools.zip_longest(i_words,j_words): 
                if (k[0] == k[1]):
                    count += 1

            # if the number of words in which both strings differ are > 2 , we are considering it as those two apperals are different
            # if the number of words in which both strings differ are < 2 , we are considering it as those two apperals are same, hence we are ignoring them
            if (length - count) > 2: # number of words in which both sensences differ
                # if both strings are differ by more than 2 words we include the 1st string index
                stage1_dedupe_asins.append(data['asin'].loc[indices[i]])

                # if the comaprision is between i=n-2 and j=n-1 titles and they differ in more than 2 words we include both, since j will not get a chance to get included in future
                if j == n-1: stage1_dedupe_asins.append(data['asin'].loc[indices[j]])

                # start searching for similar apparrels from first mismatch index, ie j
                i = j
                break
            else:
                j += 1
        if prev_i == i:
            break
    
    data = data.loc[data['asin'].isin(stage1_dedupe_asins)]
    return data

In [14]:
data = stage1_dedupe(data)
print('Number of data points after stage 1 dedupe:', data.shape[0])

Number of data points after stage 1 dedupe: 17593


In [None]:
def stage2_dedupe(data):
    # This code snippet takes significant amount of time: O(n^2)
    import itertools

    indices = []
    for i,row in data.iterrows():
        indices.append(i)

    stage2_dedupe_asins = []
    while len(indices)!=0:
        i = indices.pop()
        stage2_dedupe_asins.append(data['asin'].loc[i])
        # consider the first apparrel's title
        i_words = data['title'].loc[i].split()
        # store the list of words of ith string in a, ex: a = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']
        
        for j in indices:
            
            j_words = data['title'].loc[j].split()
            # store the list of words of jth string in b, ex: b = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']
            
            length = max(len(i_words),len(j_words))
            
            # count is used to store the number of words that are matched in both strings
            count  = 0

            # itertools.zip_longest(a,b): will map the corresponding words in both strings, it will appened None in case of unequal strings
            # example: a =['a', 'b', 'c', 'd']
            # b = ['a', 'b', 'd']
            # itertools.zip_longest(a,b): will give [('a','a'), ('b','b'), ('c','d'), ('d', None)]
            for k in itertools.zip_longest(i_words,j_words): 
                if (k[0]==k[1]):
                    count += 1

            # if the number of words in which both strings differ are < 3 , we are considering it as those two apparrels are same, hence we are ignoring them
            if (length - count) < 3:
                indices.remove(j)
    
    data = data.loc[data['asin'].isin(stage2_dedupe_asins)]
    return data

In [16]:
data = stage2_dedupe(data)
print('Number of data points after stage 2 dedupe:', data.shape[0])

Number of data points after stage 2 dedupe: 16462


In [18]:
data.to_pickle('./pickles/stg2_dedupe')

Text Preprocessing