*A document in progress showing my scraper development.*

### Import modules
*(Always do this step before running any other code chunks)*

In [1]:
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup
import certifi
import urllib3
import pandas as pd 
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

## All listings (brands = lululemon, Reformation, Sezane, Doen, Realisation Par, Iro, Ulla Johnson, Zimmermann, The Row, Rouje)

Here, I specify the URL for **sold/completed** listings of 10 brands of women's clothing. Reminder: check the [robots.txt](https://poshmark.com/robots.txt) of the site you want to scrape, so that you don't get blocked as a bot!

I'm also identifying the page for BeautifulSoup.

#### Change the object below to correspond to the brand page I want to scrape...

In [46]:
# Poshmark brand slug to scrape. Exactly one slug is active per run; to switch
# brands, replace the value below with one of the slugs used so far:
#   lululemon, reformation, sezane, doen, realisation_par,
#   iro, ulla_johnson, zimmermann, the_row, rouje
brand = "rouje"

In [47]:
# URL of the selected brand's women's listings, filtered to sold-out items.
urlpage = 'https://poshmark.com/brand/{!s}-Women?availability=sold_out'.format(brand)
print(urlpage)

https://poshmark.com/brand/rouje-Women?availability=sold_out


In [48]:
# Download the first results page once and parse it.
#
# The request goes through a urllib3 pool that verifies TLS certificates
# against certifi's CA bundle. The body of that verified response is parsed
# directly; fetching the same URL a second time with urllib.request.urlopen
# (as this cell used to do) doubled the traffic and left `r` unused.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
r = http.request('GET', urlpage)

# urllib3 does not raise on HTTP error statuses the way urlopen() does, so
# fail loudly here instead of parsing an error page.
if r.status >= 400:
    raise RuntimeError('GET {} failed with HTTP status {}'.format(urlpage, r.status))

page = r.data  # raw response body as bytes
soup = BeautifulSoup(page, 'html.parser')
# print(page)

In [49]:
# Every listing on the page is rendered as a <div class="tile"> element.
tile_attrs = {'class': 'tile'}
item_containers = soup.find_all('div', attrs=tile_attrs)

listing_count = len(item_containers)
print(listing_count)  # should be about 4 dozen
#item_containers[0]

48


### Running the scraper
On this site, there are 48 listings per page. We start scraping at page 1; for each listing I retrieve 1) the title/item details (a messy chunk of text, to be cleaned later), 2) the price, and 3) the size.

In [50]:
# Scrape the title, price and size of every listing, one results page at a time.
#
# The page number is sent as a query parameter. Calling
# `urlpage.format(page_num)` has no effect because `urlpage` contains no `{}`
# placeholder, so that approach downloads the first page on every iteration
# and yields the same 48 listings over and over.
MAX_PAGES = 48         # I checked- 48 pages, 48 items per page
PAGE_PARAM = 'max_id'  # NOTE(review): presumed name of Poshmark's page-number
                       # query parameter - confirm against the live site
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled request hangs forever

# Create lists to store the scraped information
summary = []
price = []
size = []

page_num = 1
while page_num <= MAX_PAGES:
    # requests appends the page parameter to the query string already in urlpage.
    response = requests.get(urlpage, params={PAGE_PARAM: page_num}, timeout=REQUEST_TIMEOUT)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    tiles = soup.find_all("div", {"class": "tile"})
    if not tiles:
        # No listings came back (ran out of pages, or the request was refused):
        # stop instead of requesting the remaining pages for nothing.
        break

    for d in tiles:
        link = d.a  # first <a> inside the tile; None when the tile has no link
        title = link.get("title") if link is not None else None
        pr = d.get("data-post-price")
        si = d.get("data-post-size")

        summary.append(title)
        price.append(pr)
        size.append(si)

    page_num += 1

# Check the results
print(len(summary))
print(len(price))

2304
2304


### Storing the data in a pandas dataframe

Change the text below to correspond to the brand I just scraped. Note that I'm using all lowercase to simplify PostgreSQL ingestion (it *does not like* Capital Letters).

In [51]:
# Combine the three scraped lists into one frame, one row per listing.
scraped_columns = {'summary': summary, 'price': price, 'size': size}
pm_rouje_df = pd.DataFrame(data=scraped_columns)

# .info() writes its report itself and returns None, which print() then echoes.
info_result = pm_rouje_df.info()
print(info_result)
pm_rouje_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2304 entries, 0 to 2303
Data columns (total 3 columns):
summary    2304 non-null object
price      2304 non-null object
size       2304 non-null object
dtypes: object(3)
memory usage: 54.1+ KB
None


Unnamed: 0,summary,price,size
0,NWOT Rouje gabin dress red,$150,S
1,Rouje dress,$71,00
2,NWOT rouje gabin dress navy,$120,S
3,Round Gabin Dress,$175,French 36/US 4
4,Red “Elisa” Rouje Blouse,$50,2


## Cleaning the data
Here is a list of my dfs (lululemon and Reformation brands were part of the proof-of-concept for this scraper, so lines of code corresponding to those brands are commented out).

In [None]:
# The whole cell body is a triple-quoted string, so none of the print calls
# below are executed; the cell only serves as a list of the per-brand frames.
'''
#print(pm_lulu_df.head())
#print(pm_ref_df.head())
print(pm_sezane_df.head())
print(pm_doen_df.head())
print(pm_realisationpar_df.head())
print(pm_iro_df.head())
print(pm_ullajohnson_df.head())
print(pm_zimmermann_df.head())
print(pm_therow_df.head())
print(pm_rouje_df.head())

'''

In [121]:
# .info() prints the frame summary and returns None; print() then shows that None.
sezane_info = pm_sezane_df.info()
print(sezane_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2304 entries, 0 to 2303
Data columns (total 7 columns):
summary      2304 non-null object
price        2304 non-null int64
size         2304 non-null object
brand        2304 non-null object
condition    2304 non-null object
category     2304 non-null object
site         2304 non-null object
dtypes: int64(1), object(6)
memory usage: 126.1+ KB
None


### Adding new columns: brand, condition, category

In [64]:
# Label every scraped frame with its brand and create empty 'condition' and
# 'category' columns that later cells fill in.
# (The lululemon and Reformation proof-of-concept frames are not included.)
frames_by_label = {
    'sezane': pm_sezane_df,
    'doen': pm_doen_df,
    'realisationpar': pm_realisationpar_df,
    'iro': pm_iro_df,
    'ullajohnson': pm_ullajohnson_df,
    'zimmermann': pm_zimmermann_df,
    'therow': pm_therow_df,
    'rouje': pm_rouje_df,
}

for label, frame in frames_by_label.items():
    frame['brand'] = label

# Placeholders are added one column at a time across all frames.
for column in ('condition', 'category'):
    for frame in frames_by_label.values():
        frame[column] = ''

#### Convert all 'summary' strings to lowercase

In [65]:
# Lowercase each listing title so the keyword matching in later cells does not
# depend on capitalisation.
# (The lululemon and Reformation proof-of-concept frames are not included.)
frames_to_lowercase = (
    pm_sezane_df,
    pm_doen_df,
    pm_realisationpar_df,
    pm_iro_df,
    pm_ullajohnson_df,
    pm_zimmermann_df,
    pm_therow_df,
    pm_rouje_df,
)

for frame in frames_to_lowercase:
    lowered = frame['summary'].str.lower()
    frame['summary'] = lowered

#### Assigning the items to categories based on keywords pulled from the summary description

In [134]:
# Keyword lists used to bucket a listing by garment type. Every entry is a
# single lowercase word; simple plurals are handled by the matcher below.
tops = ['top', 'shirt', 'tee', 'tank']
bottoms = ['skirt', 'pants', 'pant', 'short', 'crop', 'crops', 'shorts', 'leggings', 'tights', 'jeans']
dresses = ['dress']
#jumpsuits = ['jumpsuit']
bras = ['bra', 'bras']
outerwear = ['jacket', 'hoodie', 'sweater', 'sweatshirt', 'rainjacket', 'coat', 'parka', 'turtleneck']
#accessories = ['headband', 'bag', 'beanie', 'hat', 'toque', 'scarf', 'gloves', 'socks']
#bag = ['backpack', 'tote', 'duffel']


def _summary_words(text):
    """Return the set of lowercase alphanumeric words contained in ``text``."""
    # Anything that is not a letter or digit (spaces, hyphens, quotes, ...)
    # acts as a word separator, so "t-shirt" yields the words "t" and "shirt".
    spaced = ''.join(ch if ch.isalnum() else ' ' for ch in text.lower())
    return set(spaced.split())


def _keyword_forms(keywords):
    """Return every keyword together with its simple ``s`` / ``es`` plural."""
    forms = set()
    for keyword in keywords:
        forms.update((keyword, keyword + 's', keyword + 'es'))
    return forms


def fun(a):
    """Return the garment category for one listing summary.

    The summary is split into words and compared against the keyword lists
    in a fixed order of precedence: tops, bottoms, bras, dresses, outerwear.
    The first list with a matching word decides the category.

    Keywords are matched against whole words (plus a plain plural), not as
    substrings. Substring matching filed "brand new dress" under bras
    ("bra" in "brand"), "sweatshirt" under tops ("shirt") and "laptop" under
    tops ("top"). One consequence is that a keyword glued to another word
    (e.g. "tshirt" written without a hyphen) is not recognised.

    Parameters
    ----------
    a : str
        Listing summary. Any non-string value (e.g. None or NaN for a listing
        without a title) is categorised as "other" instead of raising.

    Returns
    -------
    str
        One of "tops", "bottoms", "bras", "dresses", "outerwear", "other".
    """
    if not isinstance(a, str):
        return "other"

    words = _summary_words(a)
    # The keyword lists are looked up at call time so that edits made to them
    # in the notebook take effect without redefining this function.
    ordered_groups = (
        ("tops", tops),
        ("bottoms", bottoms),
        ("bras", bras),
        ("dresses", dresses),
        ("outerwear", outerwear),
    )
    for label, keywords in ordered_groups:
        if words & _keyword_forms(keywords):
            return label
    return "other"


# Fill the 'category' column of every frame from its 'summary' column using
# the `fun` classifier defined just above in this cell.
#
# Item assignment (frame['category'] = ...) is used instead of attribute
# assignment (frame.category = ...): attribute assignment only updates a
# column that already exists and otherwise just sets a plain Python attribute
# on the object, leaving the frame without a 'category' column.
# (The lululemon and Reformation proof-of-concept frames are not included.)
frames_to_categorise = (
    pm_sezane_df,
    pm_doen_df,
    pm_realisationpar_df,
    pm_iro_df,
    pm_ullajohnson_df,
    pm_zimmermann_df,
    pm_therow_df,
    pm_rouje_df,
)

for frame in frames_to_categorise:
    frame['category'] = frame['summary'].apply(fun)

#### Assign condition

In [94]:
# Substrings that mark a listing as new (e.g. "nwt" = new with tags).
new = ['new', 'nwt', 'nwot', 'never']

def fun(b):
    """Return the condition ("new" or "preowned") for one listing summary.

    A summary counts as new when it contains any of the markers in ``new``.
    Matching is by substring on purpose, so compound abbreviations such as
    "bnwt" are still recognised through "nwt".

    Parameters
    ----------
    b : str
        Listing summary. The markers are lowercase, so the summary is
        expected to be lowercased already. Any non-string value (e.g. None or
        NaN for a listing without a title) is reported as "preowned" instead
        of raising a TypeError.

    Returns
    -------
    str
        "new" if a marker is found, otherwise "preowned".
    """
    if not isinstance(b, str):
        return "preowned"
    if any(marker in b for marker in new):
        return "new"
    return "preowned"

# Fill the 'condition' column of every frame from its 'summary' column using
# the `fun` classifier defined just above in this cell.
#
# Item assignment (frame['condition'] = ...) is used instead of attribute
# assignment (frame.condition = ...): attribute assignment only updates a
# column that already exists and otherwise just sets a plain Python attribute
# on the object, leaving the frame without a 'condition' column.
# (The lululemon and Reformation proof-of-concept frames are not included.)
frames_to_grade = (
    pm_sezane_df,
    pm_doen_df,
    pm_realisationpar_df,
    pm_iro_df,
    pm_ullajohnson_df,
    pm_zimmermann_df,
    pm_therow_df,
    pm_rouje_df,
)

for frame in frames_to_grade:
    frame['condition'] = frame['summary'].apply(fun)

#### Strip dollar sign from price

In [89]:
def _price_to_int(prices):
    """Convert a price column such as '$150' into whole-dollar integers.

    Parameters
    ----------
    prices : pandas.Series
        Raw price strings, or integers if the column was already converted.

    Returns
    -------
    pandas.Series
        The prices as integers.

    Raises
    ------
    ValueError
        If a value is missing or is not a whole number once '$' and ','
        have been removed.
    """
    cleaned = (
        prices.astype(str)                     # lets the cell be re-run on an int column
        .str.replace('$', '', regex=False)     # literal '$', never a regex end-anchor
        .str.replace(',', '', regex=False)     # thousands separator, e.g. '$1,200'
        .str.strip()
    )
    return cleaned.astype(int)


# Strip the currency formatting from every frame's 'price' column.
# (The lululemon and Reformation proof-of-concept frames are not included.)
frames_to_price = (
    pm_sezane_df,
    pm_doen_df,
    pm_realisationpar_df,
    pm_iro_df,
    pm_ullajohnson_df,
    pm_zimmermann_df,
    pm_therow_df,
    pm_rouje_df,
)

for frame in frames_to_price:
    frame['price'] = _price_to_int(frame['price'])

"\npm_sezane_df['price'] = pm_sezane_df['price'].str.replace('$', '')\npm_sezane_df['price'] = pm_sezane_df['price'].astype(int)\npm_doen_df['price'] = pm_doen_df['price'].str.replace('$', '')\npm_doen_df['price'] = pm_doen_df['price'].astype(int)\npm_realisationpar_df['price'] = pm_realisationpar_df['price'].str.replace('$', '')\npm_realisationpar_df['price'] = pm_realisationpar_df['price'].astype(int)\npm_iro_df['price'] = pm_iro_df['price'].str.replace('$', '')\npm_iro_df['price'] = pm_iro_df['price'].astype(int)\npm_ullajohnson_df['price'] = pm_ullajohnson_df['price'].str.replace('$', '')\npm_ullajohnson_df['price'] = pm_ullajohnson_df['price'].astype(int)\npm_zimmermann_df['price'] = pm_zimmermann_df['price'].str.replace('$', '')\npm_zimmermann_df['price'] = pm_zimmermann_df['price'].astype(int)\npm_therow_df['price'] = pm_therow_df['price'].str.replace('$', '')\npm_therow_df['price'] = pm_therow_df['price'].astype(int)\npm_rouje_df['price'] = pm_rouje_df['price'].str.replace('$',

In [100]:
# Show the first five rows of the Rouje frame.
pm_rouje_df.head(n=5)

Unnamed: 0,summary,price,size,brand,condition,category,site
0,nwot rouje gabin dress red,150,S,rouje,new,,poshmark
1,rouje dress,71,00,rouje,preowned,,poshmark
2,nwot rouje gabin dress navy,120,S,rouje,new,,poshmark
3,round gabin dress,175,French 36/US 4,rouje,preowned,,poshmark
4,red “elisa” rouje blouse,50,2,rouje,preowned,,poshmark


#### Finally, add a site column and save as CSV for now

In [93]:
# Record which marketplace each listing was scraped from.
# (The lululemon and Reformation proof-of-concept frames are not included.)
frames_from_poshmark = (
    pm_sezane_df,
    pm_doen_df,
    pm_realisationpar_df,
    pm_iro_df,
    pm_ullajohnson_df,
    pm_zimmermann_df,
    pm_therow_df,
    pm_rouje_df,
)

for frame in frames_from_poshmark:
    frame['site'] = 'poshmark'

In [155]:
# Write each brand's frame to its own CSV file in the working directory.
csv_exports = [
    (pm_sezane_df, 'pm_sezane_df.csv'),
    (pm_doen_df, 'pm_doen_df.csv'),
    (pm_realisationpar_df, 'pm_realisationpar_df.csv'),
    (pm_iro_df, 'pm_iro_df.csv'),
    (pm_ullajohnson_df, 'pm_ullajohnson_df.csv'),
    (pm_zimmermann_df, 'pm_zimmermann_df.csv'),
    (pm_therow_df, 'pm_therow_df.csv'),
    (pm_rouje_df, 'pm_rouje_df.csv'),
]

for frame, csv_path in csv_exports:
    frame.to_csv(csv_path)