*A document in progress showing my scraper development.*

### Import modules
*(Always do this step before running any other code chunks)*

In [1]:
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup
import certifi
import urllib3
import pandas as pd 
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

## All listings (brand = lululemon)

Here, I specify the URL for **sold/completed** listings of **lululemon** brand clothing (the brand and the sold-out filter are both encoded in the link). Reminder: check the [robots.txt](https://poshmark.com/robots.txt) of any site you want to scrape, so you don't get blocked as a bot!

I'm also downloading that page and handing it to BeautifulSoup for parsing.

In [2]:
# Sold-out lululemon (women's) listings on Poshmark; the filter lives in the query string.
urlpage = 'https://poshmark.com/brand/lululemon_athletica-Women?availability=sold_out'

# Download the page once through a pool that verifies TLS certificates against
# the certifi CA bundle, then parse that same response. Previously the verified
# response `r` was discarded and the page was downloaded a second time with
# urllib.request.urlopen.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
r = http.request('GET', urlpage)
page = r.data  # raw response body (bytes)
soup = BeautifulSoup(page, 'html.parser')

I'm using the chunk below for testing snippets of code. Ignore it, or use it as a scratchpad.

In [51]:
# Scratch cell: gather every listing tile (<div class="tile">) from the parsed page.
item_containers = soup.find_all('div', class_='tile')
# Uncomment to inspect the result; the count should be about 4 dozen.
# print(len(item_containers))
# item_containers[0]


48

### Running the scraper
On this site, there are 48 listings per page. We start scraping at page 1. I originally wanted to scrape 300 pages to start and see how that does, but the loop below stops after page 48. For each listing I'm retrieving 1) the title/item details as a messy string, to be cleaned later; 2) the price; and 3) the size.

In [53]:
page_num = 1

# Parallel lists: index i of each list describes the same listing.
summary = []
price = []
size = []

while page_num <= 48:  # I checked- 48 pages, 48 items per page
    # `urlpage` contains no `{}` placeholder, so `urlpage.format(page_num)`
    # returned the identical URL on every pass and the same page was scraped
    # 48 times. Send the page number as a query parameter instead; requests
    # appends it to the query string already present in `urlpage`.
    # NOTE(review): `max_id` is the pagination parameter Poshmark listing pages
    # appear to use -- confirm against the site's "next page" link.
    response = requests.get(urlpage, params={'max_id': page_num}, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    for d in soup.find_all("div", {"class": "tile"}):
        # A tile without an <a> child would make `d.a` None; record a missing
        # title rather than raising AttributeError, so the three lists stay aligned.
        anchor = d.a
        title = anchor.get("title") if anchor is not None else None
        pr = d.get("data-post-price")
        si = d.get("data-post-size")

        summary.append(title)
        price.append(pr)
        size.append(si)

    page_num = page_num + 1

# Check the results: both counts should match (one entry per scraped tile).
print(len(summary))
print(len(price))

2304
2304


### Storing the data in a pandas dataframe

In [54]:
# Assemble the three parallel lists into one frame, one row per listing.
pm_lululemon_df = pd.DataFrame({'Summary': summary, 'Price': price, 'Size': size})
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
pm_lululemon_df.info()
pm_lululemon_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2304 entries, 0 to 2303
Data columns (total 3 columns):
Summary    2304 non-null object
Price      2304 non-null object
Size       2304 non-null object
dtypes: object(3)
memory usage: 54.1+ KB
None


Unnamed: 0,Summary,Price,Size
0,Lululemon pace rival crop 22”,$60,6
1,lulu lemon backpack,$20,OS
2,Lululemon Scuba Sparkle III Lavender Hoodie,$10,4
3,Maroon lululemon shorts,$32,6
4,Lululemon sports bra,$15,6


## All listings (brand = Reformation)

In this instance, I am applying the scraper to retrieve **sold/completed** listings of **Reformation** brand clothing. 

In [56]:
# Reformation listings via Poshmark search. `availability=sold_out` is added so
# the results are limited to sold/completed listings, as this section describes
# and as the lululemon URL earlier in the notebook already does.
# NOTE(review): confirm the search endpoint honors `availability=sold_out`.
urlpage = 'https://poshmark.com/search?brand%5B%5D=Reformation&department=All&availability=sold_out'

# Download the page once through a pool that verifies TLS certificates against
# the certifi CA bundle, then parse that same response. Previously the verified
# response `r` was discarded and the page was downloaded a second time with
# urllib.request.urlopen.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
r = http.request('GET', urlpage)
page = r.data  # raw response body (bytes)
soup = BeautifulSoup(page, 'html.parser')

In [57]:
page_num = 1

# Parallel lists: index i of each list describes the same listing.
summary = []
price = []
size = []

while page_num <= 48:  # I checked- 48 pages, 48 items per page
    # `urlpage` contains no `{}` placeholder, so `urlpage.format(page_num)`
    # returned the identical URL on every pass and the same page was scraped
    # 48 times. Send the page number as a query parameter instead; requests
    # appends it to the query string already present in `urlpage`.
    # NOTE(review): `max_id` is the pagination parameter Poshmark listing pages
    # appear to use -- confirm against the site's "next page" link.
    response = requests.get(urlpage, params={'max_id': page_num}, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    for d in soup.find_all("div", {"class": "tile"}):
        # A tile without an <a> child would make `d.a` None; record a missing
        # title rather than raising AttributeError, so the three lists stay aligned.
        anchor = d.a
        title = anchor.get("title") if anchor is not None else None
        pr = d.get("data-post-price")
        si = d.get("data-post-size")

        summary.append(title)
        price.append(pr)
        size.append(si)

    page_num = page_num + 1

# Check the results: both counts should match (one entry per scraped tile).
print(len(summary))
print(len(price))

2304
2304


In [59]:
# Assemble the three parallel lists into one frame, one row per listing.
pm_ref_df = pd.DataFrame({'Summary': summary, 'Price': price, 'Size': size})
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
pm_ref_df.info()
pm_ref_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2304 entries, 0 to 2303
Data columns (total 3 columns):
Summary    2304 non-null object
Price      2304 non-null object
Size       2304 non-null object
dtypes: object(3)
memory usage: 54.1+ KB
None


Unnamed: 0,Summary,Price,Size
0,Reformation Off-Shoulder Long Sleeve,$30,XS
1,Reformation Linen Set,$98,XS
2,NWT Reformation Britta Dress,$180,4
3,Reformation Gilda Green Jumpsuit,$135,10
4,REFORMATION serena melbourne destroyed jeans,$61,29


#### Adding new columns: Brand, Condition, Category

In [63]:
# Label every row with its brand and add empty placeholder columns that the
# following cells fill in. Per frame the new columns are added in the order
# Brand, Condition, Category.
for frame, brand_name in ((pm_lululemon_df, 'lululemon'), (pm_ref_df, 'Reformation')):
    frame['Brand'] = brand_name
    frame['Condition'] = ''
    frame['Category'] = ''

#### Convert all Summary strings to lowercase

In [118]:
# Lowercase the listing titles so the keyword lookups in later cells can use
# lowercase keywords only.
for listings in (pm_ref_df, pm_lululemon_df):
    listings['Summary'] = listings['Summary'].str.lower()

#### Assigning the items to categories based on keywords pulled from the Summary description

In [137]:
# Keyword lists, one per category. Keywords are lowercase because the Summary
# column is lowercased before this cell runs.
tops = ['top', 'shirt', 'tee', 'tank']
bottoms = ['skirt', 'pants', 'pant', 'short', 'crop', 'crops', 'shorts', 'leggings', 'tights', 'jeans']
dresses = ['dress']
jumpsuits = ['jumpsuit']
bras = ['bra', 'bras']
outerwear = ['jacket', 'hoodie', 'sweater', 'sweatshirt', 'rainjacket', 'coat', 'parka', 'turtleneck']
accessories = ['headband', 'bag', 'beanie', 'hat', 'toque', 'scarf', 'gloves', 'socks']
bag = ['backpack', 'tote', 'duffel']

def fun(a):
    """Return the category label for one listing summary.

    The keyword lists are tried in a fixed order -- tops, bottoms, bras,
    dresses, outerwear, accessories, bag, jumpsuits -- and the first list with
    a keyword contained in ``a`` decides the label.

    Parameters
    ----------
    a : str
        Listing summary text. Anything that is not a string (for example the
        None/NaN left behind by a listing without a title) is labelled
        "other" instead of raising ``TypeError`` on the ``in`` test.

    Returns
    -------
    str
        One of "tops", "bottoms", "bras", "dresses", "outerwear",
        "accessories", "bag", "jumpsuits" or "other".

    Notes
    -----
    Matching is plain substring containment, so a keyword also matches inside
    longer words: 'bra' matches "brand", 'shirt' matches "sweatshirt" (which
    is therefore labelled "tops", since tops are tried before outerwear), and
    'bag' matches "tote bag" before the bag list is reached.
    NOTE(review): a later cell defines another function that is also called
    ``fun``; re-running cells out of order will call the wrong one.
    """
    if not isinstance(a, str):
        return "other"

    # Order matters: the first matching group wins.
    ordered_groups = (
        ("tops", tops),
        ("bottoms", bottoms),
        ("bras", bras),
        ("dresses", dresses),
        ("outerwear", outerwear),
        ("accessories", accessories),
        ("bag", bag),
        ("jumpsuits", jumpsuits),
    )
    for label, keywords in ordered_groups:
        if any(keyword in a for keyword in keywords):
            return label
    return "other"

# Fill the Category column by running the keyword lookup over every summary.
pm_lululemon_df['Category'] = pm_lululemon_df['Summary'].apply(fun)
pm_ref_df['Category'] = pm_ref_df['Summary'].apply(fun)

#pm_ref_df.tail(20)

#### Assign condition

In [141]:
# Lowercase markers that flag a listing as new.
New = ['new', 'nwt', 'nwot', 'never']

def fun(b):
    """Return the condition label for one listing summary.

    Parameters
    ----------
    b : str
        Listing summary text. Anything that is not a string (for example the
        None/NaN left behind by a listing without a title) falls through to
        "PreOwned" instead of raising ``TypeError`` on the ``in`` test.

    Returns
    -------
    str
        "New" when any marker from ``New`` is contained in ``b``, otherwise
        "PreOwned".

    Notes
    -----
    Matching is plain substring containment, so 'new' also matches inside
    longer words such as "renewed".
    NOTE(review): this reuses the name ``fun`` from the category cell and
    replaces that function; re-running the category cell's ``apply`` lines
    after this cell would write conditions into the Category column.
    """
    if not isinstance(b, str):
        return "PreOwned"
    if any(marker in b for marker in New):
        return "New"
    return "PreOwned"

# Fill the Condition column by running the new/pre-owned lookup over every summary.
pm_lululemon_df['Condition'] = pm_lululemon_df['Summary'].apply(fun)
pm_ref_df['Condition'] = pm_ref_df['Summary'].apply(fun)

#pm_lululemon_df.head(10)

#### Strip dollar sign from Price

In [148]:
# Turn the scraped price strings (e.g. "$60") into integers.
# - astype(str) first, so re-running this cell on an already-converted integer
#   column still works instead of failing on the .str accessor.
# - regex=False makes '$' a literal dollar sign; as a regular expression '$' is
#   the end-of-string anchor, and the default of `regex` depends on the pandas
#   version.
# - thousands separators are dropped so a price like "$1,200" converts too.
pm_lululemon_df['Price'] = (
    pm_lululemon_df['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)
pm_ref_df['Price'] = (
    pm_ref_df['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

Unnamed: 0,Summary,Price,Size,Brand,Condition,Category
0,lululemon pace rival crop 22”,60,6,lululemon,PreOwned,bottoms
1,lulu lemon backpack,20,OS,lululemon,PreOwned,bag
2,lululemon scuba sparkle iii lavender hoodie,10,4,lululemon,PreOwned,outerwear
3,maroon lululemon shorts,32,6,lululemon,PreOwned,bottoms
4,lululemon sports bra,15,6,lululemon,PreOwned,bras


#### Finally, add a Site column and save as CSV for now

In [149]:
# Pair each frame with the CSV file it is exported to.
exports = (
    (pm_lululemon_df, 'pm_lululemon_df.csv'),
    (pm_ref_df, 'pm_ref_df.csv'),
)

# Record the marketplace the rows came from.
for frame, _ in exports:
    frame['Site'] = 'Poshmark'

# Write each frame to disk with the default to_csv options. Can also save to .json
for frame, csv_name in exports:
    frame.to_csv(csv_name)