## Basic data cleaning & EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'
%matplotlib inline 

In [2]:
# Read in scraped data from poshmark.com
df = pd.read_csv('all_posts.csv', index_col=0)

In [3]:
df.tail(3)

Unnamed: 0,brand,description,item_id,price,status,tags,title,url
11659,eric + ani,White very light weight top. Sparkly pear grap...,5c48fdee9539f7b6b5c3e2db,"['$5', '$25']",Buy Now,"['Women', 'Tops', 'Tees - Short Sleeve']",Eric + ani pear sparkly top,https://poshmark.com/listing/Eric-ani-pear-spa...
11660,BKE,33X33 1/2,5c48fe7a409c15e95d761881,"['$30', '$45']",Buy Now,"['Women', 'Jeans', 'Boot Cut']",BKE Payton Jeans,https://poshmark.com/listing/BKE-Payton-Jeans-...
11661,Nike,NEW WOMENS NIKE AIR MAX 95’ SE GLITTER,5c48fe4003087c8aaef53fab,"['$75', '$170']",Buy Now,"['Women', 'Shoes', 'Sneakers']",NEW WOMENS NIKE AIR MAX 95’ SE GLITTER,https://poshmark.com/listing/NEW-WOMENS-NIKE-A...


In [4]:
# Check for duplicate rows
df.duplicated(subset=['item_id']).sum()

1

In [5]:
# Find which rows are duplicated
dup = df[df.duplicated(subset=['item_id']) == True].index[0]

In [6]:
# Drop any null rows
df.drop(index=dup, inplace=True)

# Now that rows have been deleted, reset the index
df.reset_index(inplace=True, drop=True)

In [7]:
# Check for nulls using item_id column
df['item_id'].isnull().sum()

1

In [8]:
# Find which rows are null
ind = df[df['item_id'].isnull()].index[0]

In [9]:
# Drop any null rows
df.drop(index=ind, inplace=True)

# Now that rows have been deleted, reset the index
df.reset_index(inplace=True, drop=True)

In [10]:
try:
    ind = df[df['price'].isnull()].index[0]
except:
    pass

In [11]:
# Drop any null rows
df.drop(index=ind, inplace=True)

# Now that rows have been deleted, reset the index
df.reset_index(inplace=True, drop=True)

#### Clean up `price` column

In [12]:
# Clean up 'price' column by removing extraneous characters in string
# Split 'price' into a list of 2 strings; original and new price
for i in range(len(df)):
    df['price'][i] = df['price'][i].replace("'", "").replace('$', '').replace('[', '').replace(']', '').replace(' ', '').split(',')
    

In [13]:
# Create empty DataFrame (df2) with identical index to main DataFrame (df)
df2 = pd.DataFrame(columns=['original_price', 'new_price'], 
                   index=df.index)

In [14]:
# Set df2 columns as new_price and original_price
for i in range(len(df)):
    df2['new_price'][i] = df['price'][i][0]
    df2['original_price'][i] = df['price'][i][1]

In [15]:
df2.head(3)

Unnamed: 0,original_price,new_price
0,50,25
1,60,25
2,0,25


In [16]:
# Concatenate df and df2
df = pd.concat([df, df2], axis=1)

In [17]:
df.head(3)

Unnamed: 0,brand,description,item_id,price,status,tags,title,url,original_price,new_price
0,,2 piece green jogger,5c7d94c7c6177723404ae99e,"[25, 50]",Buy Now,"['Women', 'Other']",Green jogger set,https://poshmark.com/listing/Green-jogger-set-...,50,25
1,Juicy Couture,"Short sleeve, teal with silver Juicy Crest and...",5c7d94202beb797535287264,"[25, 60]",Buy Now,"['Women', 'Tops', 'Tees - Short Sleeve']",Sz L Juicy Shirt Teal Silver SS Bling,https://poshmark.com/listing/Sz-L-Juicy-Shirt-...,60,25
2,Jennifer Lopez,Jennifer Lopez boyfriend jeans\n•size 6\n•new ...,5c7d944d03087c02ed82905b,"[25, 0]",Buy Now,"['Women', 'Jeans', 'Boyfriend']",Jennifer Lopez | NWOT Blue boyfriend jeans,https://poshmark.com/listing/Jennifer-Lopez-NW...,0,25


In [18]:
# Check datatypes
df.dtypes

brand             object
description       object
item_id           object
price             object
status            object
tags              object
title             object
url               object
original_price    object
new_price         object
dtype: object

In [19]:
# Set price columns as floats instead of strings
df['original_price'] = df['original_price'].astype(float)
df['new_price'] = df['new_price'].astype(float)

In [20]:
# Check datatypes again
df.dtypes

brand              object
description        object
item_id            object
price              object
status             object
tags               object
title              object
url                object
original_price    float64
new_price         float64
dtype: object

In [21]:
# Drop old 'price' column
df.drop(columns=['price'], inplace=True)

In [22]:
df.head(3)

Unnamed: 0,brand,description,item_id,status,tags,title,url,original_price,new_price
0,,2 piece green jogger,5c7d94c7c6177723404ae99e,Buy Now,"['Women', 'Other']",Green jogger set,https://poshmark.com/listing/Green-jogger-set-...,50.0,25.0
1,Juicy Couture,"Short sleeve, teal with silver Juicy Crest and...",5c7d94202beb797535287264,Buy Now,"['Women', 'Tops', 'Tees - Short Sleeve']",Sz L Juicy Shirt Teal Silver SS Bling,https://poshmark.com/listing/Sz-L-Juicy-Shirt-...,60.0,25.0
2,Jennifer Lopez,Jennifer Lopez boyfriend jeans\n•size 6\n•new ...,5c7d944d03087c02ed82905b,Buy Now,"['Women', 'Jeans', 'Boyfriend']",Jennifer Lopez | NWOT Blue boyfriend jeans,https://poshmark.com/listing/Jennifer-Lopez-NW...,0.0,25.0


In [23]:
df[['original_price', 'new_price']].describe()

Unnamed: 0,original_price,new_price
count,11659.0,11659.0
mean,401.485376,48.740973
std,13519.315561,144.630096
min,0.0,3.0
25%,0.0,15.0
50%,40.0,25.0
75%,88.5,40.0
max,1000000.0,4950.0


In [24]:
# How many 'original_price' are $0?
len(df[df['original_price'] == 0].sort_values(by='new_price'))

2975

In [25]:
# How many observations are viable when we erase all observations where `original_price` = 0?
len(df) - len(df[df['original_price'] == 0].sort_values(by='new_price'))


8684

In [26]:
# All data, minus all rows where 'original_price' == 0
no_zero = df[df['original_price'] != 0]

In [27]:
no_zero.describe()

Unnamed: 0,original_price,new_price
count,8684.0,8684.0
mean,539.027867,52.348342
std,15662.684775,149.893416
min,1.0,3.0
25%,35.0,15.0
50%,60.0,25.0
75%,110.0,45.0
max,1000000.0,4500.0


In [28]:
# Find rows where original_price > 500 and sort from highest to lowest AS
df[df['original_price'] > 500].sort_values(by='original_price',
                                        ascending=False)

Unnamed: 0,brand,description,item_id,status,tags,title,url,original_price,new_price
1923,,The length of the shirt from the top of the ba...,5c75b30a3c9844744d367de7,Buy Now,"['Women', 'Tops']",Black&White Lace Top,https://poshmark.com/listing/BlackWhite-Lace-T...,1000000.0,5.0
5956,biviel anthropologie,Good used condition,5c579d25c61777dfc8c9b9c2,Buy Now,"['Women', 'Shoes', 'Sandals']",Biviel Anthropologie tan sandals sz 40 9.5 suede,https://poshmark.com/listing/Biviel-Anthropolo...,999999.0,39.0
11268,Violet & Claire,Ruffled in the front\nLightweight,5c4a0da45a9d21191850cbb9,Buy Now,"['Women', 'Tops', 'Blouses']",Violet + Claire Lightweight Green Blouse,https://poshmark.com/listing/Violet-Claire-Lig...,123456.0,15.0
9551,Reebok,Used shoes. Still have lots of life in them. S...,5c4d1ff22e147856ab3288c1,Buy Now,"['Women', 'Shoes', 'Athletic Shoes']",Reebok Runtone Shoes and Tshirt,https://poshmark.com/listing/Reebok-Runtone-Sh...,100000.0,25.0
7942,,Happy to negotiate! \nI accept reasonable offe...,5c50c0fd3e0caa104d8727f0,Buy Now,"['Women', 'Tops']",Like it! Get it!🛍🥰,https://poshmark.com/listing/Like-it-Get-it-5c...,100000.0,1000.0
3423,Mudd,Gray and white sweater by Mudd. Size large. Sh...,5c5db381819e90c03320dae7,Buy Now,"['Women', 'Sweaters', 'Crew & Scoop Necks']",Gray & white cowl neck sweater by mudd. Size l...,https://poshmark.com/listing/Gray-white-cowl-n...,99999.0,28.0
5108,Kut from the Kloth,Gently pre-owned. No holes or stains. EUC!,5c59c254f63eea67f467e14a,Buy Now,"['Women', 'Shorts', 'Bermudas']",Kut from the Kloth Jean Shorts Bermuda Denim Med,https://poshmark.com/listing/Kut-from-the-Klot...,99999.0,14.0
1601,Staring at Stars,Gently pre-owned. No holes or stains. EUC!,5c762941c617772eef406ebc,Buy Now,"['Women', 'Tops', 'Tunics']",Staring at Stars Urban Outfitters Button Flowy...,https://poshmark.com/listing/Staring-at-Stars-...,99999.0,14.0
1194,,Razor Back Leopard Tank\nLike New Condition\n9...,5c7734ae534ef96b4fec46f0,Buy Now,"['Women', 'Tops', 'Tank Tops']",Razor Back Leopard Tank,https://poshmark.com/listing/Razor-Back-Leopar...,99999.0,15.0
1111,,Handwoven Wool panels in classic Zapotec style...,5c785205a8399e66385709dd,Buy Now,"['Women', 'Bags', 'Shoulder Bags']",Saddle Blanket Purse,https://poshmark.com/listing/Saddle-Blanket-Pu...,99999.0,45.0


#### Clean up `tags` column

In [29]:
# Look at tags column structure
df['tags'][3]

"['Women', 'Tops', 'Sweatshirts & Hoodies']"

In [30]:
# Clean up 'tags' column by removing extraneous characters
# Split 'tags' into a list of strings; each string a different tag
for i in range(len(df)):
    df['tags'][i] = df['tags'][i].replace('[', '').replace(']', '').replace(' ', '').replace("'", "").split(',')
    
    if i % 1000 == 0: # Show progress of cleaning
        print(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000


In [31]:
df['tags'][1]

['Women', 'Tops', 'Tees-ShortSleeve']

In [32]:
# Create empty DataFrame (df3) with identical index to main DataFrame (df)
df3 = pd.DataFrame(columns=['tag_1', 'tag_2', 'tag_3'], index=df.index)

In [33]:
# Split 'tags' column into three separate columns
for i in range(len(df)):
    df3['tag_1'][i] = str(df['tags'][i][0])
    
    # There isn't always a 2nd and 3rd tag, they're left null
    try:
        df3['tag_2'][i] = str(df['tags'][i][1])
        df3['tag_3'][i] = str(df['tags'][i][2])
    
    except:
        df3['tag_2'][i] = None
        df3['tag_3'][i] = None

In [34]:
# Concatenate df and df3
df = pd.concat([df, df3], axis=1)

In [35]:
# Drop old 'tags' column
df.drop(columns=['tags'], inplace=True)

In [36]:
df.head(3)

Unnamed: 0,brand,description,item_id,status,title,url,original_price,new_price,tag_1,tag_2,tag_3
0,,2 piece green jogger,5c7d94c7c6177723404ae99e,Buy Now,Green jogger set,https://poshmark.com/listing/Green-jogger-set-...,50.0,25.0,Women,,
1,Juicy Couture,"Short sleeve, teal with silver Juicy Crest and...",5c7d94202beb797535287264,Buy Now,Sz L Juicy Shirt Teal Silver SS Bling,https://poshmark.com/listing/Sz-L-Juicy-Shirt-...,60.0,25.0,Women,Tops,Tees-ShortSleeve
2,Jennifer Lopez,Jennifer Lopez boyfriend jeans\n•size 6\n•new ...,5c7d944d03087c02ed82905b,Buy Now,Jennifer Lopez | NWOT Blue boyfriend jeans,https://poshmark.com/listing/Jennifer-Lopez-NW...,0.0,25.0,Women,Jeans,Boyfriend


In [37]:
# Check dtypes again
df.dtypes

brand              object
description        object
item_id            object
status             object
title              object
url                object
original_price    float64
new_price         float64
tag_1              object
tag_2              object
tag_3              object
dtype: object

In [38]:
# # See how many posts are in each specific category by tag
# len(df[df['tag_2'] == 'Tops'])

In [39]:
# # Total # of unique tag_2 tags
# set(df['tag_2'])

In [40]:
# # Total # of unique tag_3 tags
# set(df['tag_3'])

#### Look at `brand` column

In [41]:
len(set(df['brand']))

2652

In [42]:
# Value count by brand
brand_counts = pd.DataFrame(df['brand'].value_counts())

In [43]:
# Top 10 brands based on post count
brand_counts[brand_counts['brand'] > 20].head(10)

Unnamed: 0,brand
Forever 21,203
Nike,203
lululemon athletica,190
American Eagle Outfitters,178
Free People,176
LuLaRoe,175
J. Crew,157
PINK Victoria's Secret,156
Victoria's Secret,154
Anthropologie,130


In [44]:
df.tail(3)

Unnamed: 0,brand,description,item_id,status,title,url,original_price,new_price,tag_1,tag_2,tag_3
11656,eric + ani,White very light weight top. Sparkly pear grap...,5c48fdee9539f7b6b5c3e2db,Buy Now,Eric + ani pear sparkly top,https://poshmark.com/listing/Eric-ani-pear-spa...,25.0,5.0,Women,Tops,Tees-ShortSleeve
11657,BKE,33X33 1/2,5c48fe7a409c15e95d761881,Buy Now,BKE Payton Jeans,https://poshmark.com/listing/BKE-Payton-Jeans-...,45.0,30.0,Women,Jeans,BootCut
11658,Nike,NEW WOMENS NIKE AIR MAX 95’ SE GLITTER,5c48fe4003087c8aaef53fab,Buy Now,NEW WOMENS NIKE AIR MAX 95’ SE GLITTER,https://poshmark.com/listing/NEW-WOMENS-NIKE-A...,170.0,75.0,Women,Shoes,Sneakers


In [45]:
# df.to_csv('./clean_data_1.csv')