In [38]:
import requests
import time
import random
import sqlite3
import re

from bs4 import BeautifulSoup

## Create the Database

In [39]:
def create_connection(db_file):
    """Open a connection to the SQLite database stored in ``db_file``.

    :param db_file: path of the database file (or ":memory:")
    :return: an open ``sqlite3.Connection``, or ``None`` if the database
             could not be opened (the error is printed)
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # sqlite3.Error is the base class of every error the module raises;
        # the bare name ``Error`` is not defined in this notebook.
        print(e)

    return conn


def select_all_tasks(conn, table_name):
    """
    Print every row of the given table.

    The table name is interpolated directly into the SQL text, so it must
    come from trusted code rather than user input.

    :param conn: the Connection object
    :param table_name: name of the table to read
    :return: None
    """
    query = f"SELECT * FROM {table_name}"
    cursor = conn.cursor()
    cursor.execute(query)

    print_query_result(cursor)
    
def print_query_result(cursor):
    """
    Print each remaining row of an executed cursor, one row per line.

    :param cursor: a cursor on which a query has already been executed
    :return: None
    """
    for record in cursor.fetchall():
        print(record)

        
# # Create the table if it does not exist
# conn = create_connection('db/sold_items.db')
# conn.execute('''CREATE TABLE IF NOT EXISTS sold_items_women
#          (ID TEXT PRIMARY KEY     NOT NULL,
#          gender            TEXT    NOT NULL,
#          category          TEXT     NOT NULL,
#          name              CHAR(100),
#          list_price        CHAR(10),
#          sale_price        CHAR(10),
#          condition         CHAR(10),
#          link              CHAR(100),
#          likes             CHAR(4),
#          comments          CHAR(4),
#          date_added        CHAR(10))
#          ;''')
# conn.close()

## Helper Functions

In [40]:
def request_get_html(url, timeout=30):
    """Download ``url`` and return the response body as text.

    :param url: address to fetch
    :param timeout: seconds to wait for the server before giving up; without
                    a timeout ``requests.get`` can block indefinitely
    :return: the decoded response body
    """
    response = requests.get(url, timeout=timeout)
    return response.text


def format_string_result(string):
    """Collapse whitespace runs to single spaces, trim both ends and upper-case.

    :param string: text to normalise
    :return: the normalised, upper-cased text
    """
    words = string.split()
    collapsed = ' '.join(words)
    return collapsed.upper()


def calculate_wait_time(lower_limit, upper_limit):
    """Draw a random float between the two limits via ``random.uniform``.

    :param lower_limit: one end of the range
    :param upper_limit: the other end of the range
    :return: the randomly drawn value
    """
    wait_time = random.uniform(lower_limit, upper_limit)
    return wait_time


def get_value_from_html(html, tag_type, class_name, tag_attrib=None, str_replace=None):
    """Extract one value from the first matching tag and normalise it.

    :param html: object exposing ``find(tag, attrs)`` (the notebook passes
                 BeautifulSoup elements)
    :param tag_type: tag name to look for
    :param class_name: value of the tag's ``class`` attribute
    :param tag_attrib: attribute to read; when ``None`` the tag's text is used
    :param str_replace: optional iterable of substrings to strip from the value
    :return: the value passed through ``format_string_result``, or the
             literal ``"N/A"`` when the tag or attribute is missing
    """
    try:
        element = html.find(tag_type, {"class": class_name})
        value = element.text if tag_attrib is None else element[tag_attrib]
    except (AttributeError, KeyError, TypeError):
        # Missing tag (find returned None) or missing attribute. Only lookup
        # failures are swallowed so Ctrl-C / SystemExit still propagate.
        return "N/A"

    for repl in str_replace or ():
        value = value.replace(repl, "")

    return format_string_result(value)

## Get the latest sold items 

### Available Categories
- Jackets & Coats
- Jeans
- Pants & Jumpsuits
- Shorts
- Tops



### Available Genders/ Groups
- Women
- Men
- Kids
- Home
- Pets


The idea here is to (ethically) retrieve data about sold items from the Poshmark reseller website.

If there are any breaches of user agreements here please get in contact with me! 

In [41]:
# URL Variables
# The scraping loop builds each results-page URL as:
#   base_url + "/brand/" + brand + "-" + gender + "-" + category + query_filter + "&max_id=<page number>"
# gender and category are also written into every row the loop inserts.
base_url = "https://poshmark.com"  # also prefixed to each item's relative link
brand = 'Nike'
gender = "Women"
category = "Shoes"
query_filter = "?availability=sold_out"  # query string appended after the category
page_limit = 1  # number of result pages the scraping loop requests

In [42]:
# Scrape every results page and insert each sold listing into the database.
# Values are bound through "?" placeholders so quotes in scraped text cannot
# break (or inject into) the INSERT statement.
INSERT_SOLD_ITEM_SQL = '''
    INSERT INTO sold_items_women (ID,gender,category,size,name,list_price,sale_price,condition,link,date_added, likes, comments, brand_name)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''

conn = create_connection('db/sold_items.db')
if conn is None:
    # create_connection reports failure by returning None
    raise RuntimeError("Could not open db/sold_items.db")

try:
    for i in range(page_limit):

        url = base_url + "/brand/" + brand + "-" + gender + "-" + category + query_filter + "&max_id=" + str(i+1)
        soup = BeautifulSoup(request_get_html(url), 'html.parser')

        # Wait a few seconds before pulling the next page
        time.sleep(calculate_wait_time(5, 10))

        for html_details in soup.find_all("div", {"class": "card card--small"}):
            try:
                # Item name
                item_name = get_value_from_html(html_details, "img", "ovf--h", tag_attrib="alt")

                # Item link
                link = base_url + get_value_from_html(html_details, "a", "tile__title tc--b", tag_attrib="href").lower()

                # Listing ID
                item_id = get_value_from_html(html_details, "a", "tile__title tc--b", tag_attrib="data-et-prop-listing_id")

                # Date added: taken from a yyyy/mm/dd fragment of the image URL,
                # which is held in either "src" or "data-src"
                image_url = get_value_from_html(html_details, "img", "ovf--h", tag_attrib="src")
                if image_url == 'N/A':
                    image_url = get_value_from_html(html_details, "img", "ovf--h", tag_attrib="data-src")

                date_match = re.search(r'\d{4}/\d{2}/\d{2}', image_url)
                if date_match is None:
                    # Same outcome as before (the listing is skipped by the
                    # handler below) but with a readable message
                    raise ValueError(f"No date found in image URL for item {item_id}")
                date_added = date_match.group(0)

                # List price
                list_price = get_value_from_html(html_details, "span", "p--l--1 tc--lg td--lt", str_replace=["$", ","])

                # Sale price
                sale_price = get_value_from_html(html_details, "span", "p--t--1 fw--bold", str_replace=["$", ","])

                # Condition - not always known
                condition = get_value_from_html(html_details, "span", "condition-tag all-caps tr--uppercase condition-tag--small")

                # Size - not always known
                size = get_value_from_html(html_details, "a", "tile__details__pipe__size ellipses", str_replace=["Size: "])

                # Brand - not always known. Kept in its own name so the
                # module-level ``brand`` used to build the page URL is not
                # overwritten for later pages or re-runs.
                item_brand = get_value_from_html(html_details, "a", "tile__details__pipe__brand ellipses")

                # Number of likes the listing received
                likes_div = html_details.find('div', {"class": 'social-action-bar tile__social-actions'})
                try:
                    likes = likes_div.find('span').text
                except AttributeError:
                    # div or span missing; stored as the text "0" as before
                    likes = "0"

                # Number of comments the listing received
                comments_div = html_details.find('div', {"class": 'd--fl ai--c jc--sb'})
                try:
                    comments = comments_div.find('span').text
                except AttributeError:
                    comments = "0"

                # Database insertion, committed per row
                conn.execute(
                    INSERT_SOLD_ITEM_SQL,
                    (
                        item_id, gender, category, size, item_name,
                        list_price, sale_price, condition, link, date_added,
                        likes, comments, item_brand,
                    ),
                )
                conn.commit()

            except Exception as e:
                # Best effort: report the problem and move on to the next listing
                print(e)
finally:
    # Close the connection even if a request or parsing step fails
    conn.close()

SyntaxError: invalid syntax (3826672748.py, line 44)

## 2. Insert the data into the database

In [35]:
# Count the stored rows whose sale_price casts to a non-zero integer
# (the bare cast in the WHERE clause is evaluated as a truth value).
conn = create_connection('db/sold_items.db')
if conn is None:
    # create_connection reports failure by returning None
    raise RuntimeError("Could not open db/sold_items.db")

try:
    cur = conn.cursor()
    cur.execute('''Select count(1) from sold_items_women where cast(sale_price as int)''')
    print_query_result(cur)
finally:
    # Release the database handle; it was previously left open
    conn.close()

(1410,)


- User inputs
    - Brand
    - Size 
    - Type

 
- This is going to be used to estimate the resale price

What we need to do is take this input, retrieve the matching data from the database, and calculate the average sale value for:
    - This particular size
    - Other sizes
    - What they normally list for versus what they sell for
    