In [1]:
import requests
import time
import random
import sqlite3
import re
import psycopg2


from bs4 import BeautifulSoup
from sqlalchemy import create_engine, text

## Create the Database (SQLite)

In [39]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: database file
    :return: Connection object or None
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as e:
        print(e)

    return conn


def select_all_tasks(conn, table_name):
    """
    Query all rows in the tasks table
    :param conn: the Connection object
    :return:
    """
    cur = conn.cursor()
    cur.execute(f"SELECT * FROM {table_name}")

    print_query_result(cur)
    
def print_query_result(cursor):
    
    rows = cursor.fetchall()

    for row in rows:
        print(row)

## Create the Database (PostgreSQL)

In [2]:
db_address = 'localhost'
db_port = 5432
db_user = 'joconnor'
db_password = 'james_password'
db_name = 'resale_app'
postgres_str = (f'postgresql://{db_user}:{db_password}@{db_address}:{db_port}/{db_name}')
# Create the connection

cnx = create_engine(postgres_str)

In [3]:
def request_get_html(url):
    html = requests.get(url)
    return html.text


def format_string_result(string):
    result = ' '.join(string.split())
    return result.upper()


def calculate_wait_time(lower_limit, upper_limit):
    return random.uniform(lower_limit, upper_limit)


def get_value_from_html(html, tag_type, class_name, tag_attrib=None, str_replace=None):
    try:
        if tag_attrib is not None:
            value = html.find(tag_type, {"class": class_name})[tag_attrib]
        
        else: 
            value = html.find(tag_type, {"class": class_name}).text
    
    except:
        return "N/A"
    
    if str_replace:
        for repl in str_replace:
            value = value.replace(repl, "")
    
    return format_string_result(value)

## Get the latest sold items 

### Available Categories
- Jackets & Coats
- Jeans
- Pants & Jumpsuits
- Short 
- Tops



### Available Genders/ Groups
- Women
- Men
- Kids
- Home
- Pets


The idea here is (ethically) retrieve data about sold items from the Poshmark ressellers website. 

If there are any breaches of user agreements here please get in contact with me! 

In [4]:
# URL Variables
base_url = "https://poshmark.com"
brand = 'Luis_Vuitton'
gender = "Women"
category = "Bags"
query_filter = "?availability=sold_out"
page_limit = 1

In [10]:
for i in range(page_limit):
        
    url = base_url + "/brand/" + brand + "-" + gender + "-" + category + query_filter + "&max_id=" + str(i+1)
    soup = BeautifulSoup(request_get_html(url), 'html.parser')
    
#   Wait to pull the next page for a few seconds   
    time.sleep(calculate_wait_time(5, 10))
    
    for html_details in soup.find_all("div", {"class": "card card--small"}):
        try:
            #   Get Item Name 
            item_name = get_value_from_html(html_details, "img", "ovf--h", tag_attrib="alt")

            #   Get Item Link
            link = base_url + get_value_from_html(html_details, "a", "tile__title tc--b", tag_attrib="href").lower()

            #   Get the property listing ID     
            item_id = get_value_from_html(html_details, "a", "tile__title tc--b", tag_attrib="data-et-prop-listing_id")

            #   Get the date added (need to regex a cloudfront url)
            date_added = get_value_from_html(html_details, "img", "ovf--h", tag_attrib="src")
            print(date_added)

            if date_added == 'N/A':
                date_added = get_value_from_html(html_details, "img", "ovf--h", tag_attrib="data-src")

            date_added = re.search('\d{4}\/\d{2}\/\d{2}', date_added)
            date_added = date_added.group(0)

             #   Get List Price
            list_price = get_value_from_html(html_details, "span", "p--l--1 tc--lg td--lt", str_replace=["$", ","])  

            #   Get Sale Price
            sale_price = get_value_from_html(html_details, "span", "p--t--1 fw--bold", str_replace=["$", ","])        

            #   Get Condition - Not always known
            condition = get_value_from_html(html_details, "span", "condition-tag all-caps tr--uppercase condition-tag--small")

            #   Get Size - Not always known
            size = get_value_from_html(html_details, "a", "tile__details__pipe__size ellipses", str_replace=["Size: "])

            #   Get Brand - Not always known
            brand = get_value_from_html(html_details, "a", "tile__details__pipe__brand ellipses")

            #   Get the amount of likes the picture recieved 
            likes_div = html_details.find('div', {"class": 'social-action-bar tile__social-actions'})

            
            try:
                likes = likes_div.find('span').text
            except:
                likes = 0

            #   Get the amount of comments the picture recieved 
            comments_div = html_details.find('div', {"class": 'd--fl ai--c jc--sb'})

            try:
                comments = comments_div.find('span').text
            except:
                comments = 0
               
            with cnx.connect() as conn:
                conn.execute(
                text("""INSERT INTO sold_items_women (item_id,gender,category,size,name,list_price,sale_price,condition,link,date_added, likes, comments, brand_name) \ 
                      VALUES (:item_id,:gender,:category,:size,:name,:list_price,:sale_price,:condition,:link,:date_added,:likes,:comments,:brand_name)")""",
                    [{"item_id": item_id, 
                      "gender": gender, 
                      "category": category,
                      "size": size, 
                      "name": item_name,
                      "list_price": list_price, 
                      "sale_price": sale_price,
                      "condition": condition, 
                      "link": link,
                      "date_added": date_added
                     }]
                    )
                )
                conn.commit()
                    
        except Exception as e: 
            print(e)
            continue

HTTPS://DI2PONV0V5OTW.CLOUDFRONT.NET/POSTS/2021/10/29/617C9C9312D880150DE93C04/S_617C9CB067BD9178A714932D.JPG
name 'name' is not defined
HTTPS://DI2PONV0V5OTW.CLOUDFRONT.NET/POSTS/2021/09/30/6156528BC936AF089A798888/S_6156529F93649FA111D5BD9D.JPG
name 'name' is not defined
HTTPS://DI2PONV0V5OTW.CLOUDFRONT.NET/POSTS/2021/10/20/617092BF6F6C91AD9AAB52BC/S_6170997EC32EE6BB620894FF.JPG
name 'name' is not defined
HTTPS://DI2PONV0V5OTW.CLOUDFRONT.NET/POSTS/2021/10/22/61734CBB4FD23ADC022CFE6E/S_61734D35BCDB2FE90FF2B225.JPG
name 'name' is not defined
HTTPS://DI2PONV0V5OTW.CLOUDFRONT.NET/POSTS/2021/10/29/617C938F691412E77799A49B/S_617C9424CE1E879C07270A87.JPG
name 'name' is not defined
HTTPS://DI2PONV0V5OTW.CLOUDFRONT.NET/POSTS/2020/09/13/5F5E76C3163DF40DBE4DAC4F/S_5F5E7DA367BD919D71907678.JPG
name 'name' is not defined
HTTPS://DI2PONV0V5OTW.CLOUDFRONT.NET/POSTS/2021/09/27/6152679CE107BBB9D3EB4BBA/S_6152679DE107BBB9D3EB4BC0.JPG
name 'name' is not defined
HTTPS://DI2PONV0V5OTW.CLOUDFRONT.NET/POST

## 2. Insert the data into the database