# Data Preprocessing and Data Wrangling

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import Counter
import math
import re
import os
import seaborn as sns
from PIL import Image
import requests
from io import BytesIO

In [2]:
data=pd.read_pickle('pickels/180k_apparel_data')

In [3]:
data.describe()

Unnamed: 0,asin,brand,color,product_type_name,medium_image_url,title,formatted_price
count,183138,182987,64956,183138,183138,183138,28395
unique,183138,10577,7380,72,170782,175985,3135
top,B0744C6H92,Zago,Black,SHIRT,https://images-na.ssl-images-amazon.com/images...,Nakoda Cotton Self Print Straight Kurti For Women,$19.99
freq,1,223,13207,167794,23,77,945


In [4]:
# consider products which have price information
# data['Column_name'].isnull() => gives the information
# about the dataframe row's which have null values price == None|Null
data = data.loc[~data['color'].isnull()]
print('Number of data points After eliminating Color=NULL :', data.shape[0])
data = data.loc[~data['brand'].isnull()]
print('Number of data points After eliminating Brand=NULL :', data.shape[0])
data = data.loc[~data['product_type_name'].isnull()]
print('Number of data points After eliminating Product_Type_Name=NULL :', data.shape[0])

Number of data points After eliminating Color=NULL : 64956
Number of data points After eliminating Brand=NULL : 64843
Number of data points After eliminating Product_Type_Name=NULL : 64843


In [5]:
data.to_pickle('pickels/64k_apparel_data')

## Removing Duplicates From Title

In [6]:
data_sorted = data[data['title'].apply(lambda x: len(x.split())>4)]
print("After removal of products with short description:", data_sorted.shape[0])

After removal of products with short description: 63263


In [7]:
data_sorted.sort_values('title',inplace=True, ascending=False)
data_sorted.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,asin,brand,color,product_type_name,medium_image_url,title,formatted_price
27547,B073W7P8KK,Nation LTD,Blue,DRESS,https://images-na.ssl-images-amazon.com/images...,❀Nation Women Stripe Blouse Long Sleeve Shirt ...,
31277,B01M0PWMZ8,Anglin,White,SHIRT,https://images-na.ssl-images-amazon.com/images...,✽ANGLIN✽ Women Striped Floral Long Sleeve Roun...,
30453,B01M02GWRG,Anglin,White,SHIRT,https://images-na.ssl-images-amazon.com/images...,✽ANGLIN✽ Women Striped Floral Long Sleeve Roun...,
32485,B01N0ADXM0,Anglin,Red,SHIRT,https://images-na.ssl-images-amazon.com/images...,✽ANGLIN✽ Women Fashion Stripe Dress Round Coll...,
26767,B01MTQAU86,Anglin,Black,SHIRT,https://images-na.ssl-images-amazon.com/images...,✽ANGLIN✽ Women Autumn Winter Christmas Printin...,


#### Some examples of dupliacte titles that differ only in the last few words.
<pre>
Titles 1:
16. woman's place is in the house and the senate shirts for Womens XXL White
17. woman's place is in the house and the senate shirts for Womens M Grey

Title 2:
25. tokidoki The Queen of Diamonds Women's Shirt X-Large
26. tokidoki The Queen of Diamonds Women's Shirt Small
27. tokidoki The Queen of Diamonds Women's Shirt Large

Title 3:
61. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt
62. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt
63. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt
64. psychedelic colorful Howling Galaxy Wolf T-shirt/Colorful Rainbow Animal Print Head Shirt for woman Neon Wolf t-shirt
</pre>

In [8]:
indices = []
for i,row in data_sorted.iterrows():
    indices.append(i)

In [9]:
%%time
import itertools
stage1_dedupe_asins = []
i = 0
j = 0
num_data_points = data_sorted.shape[0]
while i < num_data_points and j < num_data_points:
    
    previous_i = i

    # store the list of words of ith string in a, ex: a = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'X-Large']
    a = data['title'].loc[indices[i]].split()

    # search for the similar products sequentially 
    j = i+1
    while j < num_data_points:

        # store the list of words of jth string in b, ex: b = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'Small']
        b = data['title'].loc[indices[j]].split()

        # store the maximum length of two strings
        length = max(len(a), len(b))

        # count is used to store the number of words that are matched in both strings
        count  = 0

        # itertools.zip_longest(a,b): will map the corresponding words in both strings, it will appened None in case of unequal strings
        # example: a =['a', 'b', 'c', 'd']
        # b = ['a', 'b', 'd']
        # itertools.zip_longest(a,b): will give [('a','a'), ('b','b'), ('c','d'), ('d', None)]
        for k in itertools.zip_longest(a,b): 
            if (k[0] == k[1]):
                count += 1

        # if the number of words in which both strings differ are > 2 , we are considering it as those two apperals are different
        # if the number of words in which both strings differ are < 2 , we are considering it as those two apperals are same, hence we are ignoring them
        if (length - count) > 2: # number of words in which both sensences differ
            # if both strings are differ by more than 2 words we include the 1st string index
            stage1_dedupe_asins.append(data_sorted['asin'].loc[indices[i]])

            # if the comaprision between is between num_data_points, num_data_points-1 strings and they differ in more than 2 words we include both
            if j == num_data_points-1: stage1_dedupe_asins.append(data_sorted['asin'].loc[indices[j]])

            # start searching for similar apperals corresponds 2nd string
            i = j
            break
        else:
            j += 1
    if previous_i == i:
        break

Wall time: 6.63 s


In [10]:
data = data.loc[data['asin'].isin(stage1_dedupe_asins)]
print('Number of data points : ', data.shape[0])

Number of data points :  48722


#### We removed  the dupliactes which differ only at the end.

In [11]:
data.to_pickle('pickels/48k_apperal_data')

#### [5.2.3] Remove duplicates : Part 2
<pre>

In the previous cell, we sorted whole data in alphabetical order of  titles.Then, we removed titles which are adjacent and very similar title

But there are some products whose titles are not adjacent but very similar.

Examples:

Titles-1
86261.  UltraClub Women's Classic Wrinkle-Free Long Sleeve Oxford Shirt, Pink, XX-Large
115042. UltraClub Ladies Classic Wrinkle-Free Long-Sleeve Oxford Light Blue XXL

TItles-2
75004.  EVALY Women's Cool University Of UTAH 3/4 Sleeve Raglan Tee
109225. EVALY Women's Unique University Of UTAH 3/4 Sleeve Raglan Tees
120832. EVALY Women's New University Of UTAH 3/4-Sleeve Raglan Tshirt

</pre>

### Downloading Images

In [22]:
def find(A, B):
    # count
    count = {}
    no=0
    # insert in A
    for word in A.split():
        count[word] = count.get(word, 0) + 1
    print(count)
    # insert in B
    for word in B.split():
        count[word] = count.get(word, 0) + 1
        if count[word] == 2 :
            no=no+1
    print(count)
    print(no)
    # return ans
    return [word for word in count if count[word] == 2]
# main
A = "EVALY Women's Cool University Of UTAH 3/4 Sleeve Raglan Tee "
B = "EVALY EVALY EVALY EVALY EVALY Women's New University Of UTAH 3/4-Sleeve Raglan Tshirt"
print("The uncommon words in strings are:",find(A, B))

{'EVALY': 1, "Women's": 1, 'Cool': 1, 'University': 1, 'Of': 1, 'UTAH': 1, '3/4': 1, 'Sleeve': 1, 'Raglan': 1, 'Tee': 1}
{'EVALY': 6, "Women's": 2, 'Cool': 1, 'University': 2, 'Of': 2, 'UTAH': 2, '3/4': 1, 'Sleeve': 1, 'Raglan': 2, 'Tee': 1, 'New': 1, '3/4-Sleeve': 1, 'Tshirt': 1}
6
The uncommon words in strings are: ["Women's", 'University', 'Of', 'UTAH', 'Raglan']
