In [None]:
pip install beautifulsoup4




In [None]:
# this block of code will search ebay for whatever shoe you want and save the sales history as html files that we can work with

import requests
import os
import shutil

# function that will automatically search ebay for whatever shoe you want and save the sold and completed listings as html files that we can work with
# params: search_term: whatever you'd put in the search bar of ebay
# num_pages: how many pages of results you want to save. Change this based off of how popular the shoe is. A niche shoe might only need 2 pages, but panda dunks might need a lot more
# output_dir: the folder where we save all the html files

# example of an ebay url: https://www.ebay.com/sch/i.html?_from=R40&_nkw=jordan+5+retro+oreo&rt=nc&LH_Sold=1&LH_Complete=1&_pgn=2
def fetch_and_save_ebay_pages(search_term, num_pages, output_dir):
    base_url = "https://www.ebay.com/sch/i.html"
    params = {
        '_from': 'R40',
        '_nkw': search_term,
        'rt': 'nc',
        'LH_Sold': 1,
        'LH_Complete': 1
    }

    # if you rerun this code, you don't want to mix the html files of your old shoe with your new shoe. This will delete the old folder.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
        print(f"Deleted existing directory: {output_dir}")

    # create the folder where we are going to store the html files.
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

    # loop through every page on ebay and store the shoes as html files
    for page in range(1, num_pages + 1):
        params['_pgn'] = page # we now care about the page number so add it as a parameter to the url
        response = requests.get(base_url, params=params) # GET request that actually goes to ebay

        if response.status_code == 200:
          # if it worked, then we write the file to our folder
            file_path = os.path.join(output_dir, f'{search_term.replace(" ", "_")}_page_{page}.html')
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f'Saved: {file_path}')
        else:
            # if it didn't, print out the status code
            print(f'Failed to fetch page {page}: Status code {response.status_code}')


# Call the function here!
fetch_and_save_ebay_pages('air jordan 5 retro oreo', 15, 'ebay_pages')


Deleted existing directory: ebay_pages
Created directory: ebay_pages
Saved: ebay_pages/air_jordan_5_retro_oreo_page_1.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_2.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_3.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_4.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_5.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_6.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_7.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_8.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_9.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_10.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_11.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_12.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_13.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_14.html
Saved: ebay_pages/air_jordan_5_retro_oreo_page_15.html


In [None]:
# this block of code parses the html files that we just saved and turns it into a pandas dataframe

from bs4 import BeautifulSoup
import csv
import pandas as pd
import os

html_dir = '/content/ebay_pages'
csv_file_path = '/content/output.csv'

# this will hold every shoe entry
data_list = []

# loop through every file that we saved from ebay and add the shoes to our data_list
for filename in os.listdir(html_dir):
    if filename.endswith('.html'):  # safeguard to make sure we don't read any junk
        file_path = os.path.join(html_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        # parse html file
        soup = BeautifulSoup(html_content, 'html.parser')

        # find all shoes in the html file
        ebayShoes = soup.find_all('li', class_='s-item s-item__pl-on-bottom')

        # extract and append data for each shoe. If we can't find a value for some shoe, put 'N/A' instead
        for item in ebayShoes:
            # Sold Date
            sold_date_element = item.find('span', class_='POSITIVE')
            sold_date = sold_date_element.text.strip() if sold_date_element else 'N/A'

            # Title
            title_element = item.find('span', role='heading')
            title = title_element.text.strip() if title_element else 'N/A'

            # Condition
            condition_element = item.find('span', class_='SECONDARY_INFO')
            condition = condition_element.text.strip() if condition_element else 'N/A'

            # Brand
            brand_element = item.find(class_= "s-item__subtitle")
            brand = brand_element.text.strip() if brand_element else 'N/A'

            # Ratings
            ratings_element = item.find('span', class_='s-item__reviews-count')
            ratings = ratings_element.text.strip() if ratings_element else 'N/A'

            # Price
            price_element = item.find('span', class_='s-item__price')
            price = price_element.text.strip() if price_element else 'N/A'

            # Bids
            bids_element = item.find('span', class_='s-item__bids s-item__bidCount')
            bids = bids_element.text.strip() if bids_element else 'N/A'

            # Shipping
            shipping_element = item.find('span', class_='s-item__shipping s-item__logisticsCost')
            shipping = shipping_element.text.strip() if shipping_element else 'N/A'

            data_list.append({
                'Title': title,
                'Sold Date': sold_date,
                'Condition': condition,
                'Brand': brand,
                'Ratings': ratings,
                'Price': price,
                'Bids': bids,
                'Shipping': shipping
            })


data = pd.DataFrame(data_list) # switch to a data frame to make it easier to work with later on

# find duplicates and print them
duplicates = data[data.duplicated(keep=False)]
if not duplicates.empty:
    print("Number of duplicate entries:", duplicates.shape[0])

data_cleaned = data.drop_duplicates() # get rid of duplicates
data_cleaned = data_cleaned.iloc[1:].reset_index(drop=True) # the first line is always junk, so remove that
data_cleaned.to_csv(csv_file_path, index=False) # save it to a csv file
data_cleaned  # print


Number of duplicate entries: 487


Unnamed: 0,Title,Sold Date,Condition,Brand,Ratings,Price,Bids,Shipping
0,Nike Air Jordan 5 Retro Oreo Sz 10.5 2013,"Sold Mar 16, 2024",Pre-Owned,Pre-Owned · Jordan · Air Jordan,,$119.99,,+$14.95 shipping
1,Size 9 - Jordan 5 Retro oreo 2013 136027-035 B...,"Sold Mar 16, 2024",Pre-Owned,Pre-Owned · Jordan · Air Jordan,,$135.00,,+$14.95 shipping
2,Size 10 - Nike Air Jordan 5 V Retro OG Black W...,"Sold Mar 15, 2024",Pre-Owned,Pre-Owned · Nike · Air Jordan,,$100.00,,+$14.95 shipping
3,Size 8 - Jordan 5 Retro oreo 2013,"Sold Mar 15, 2024",Pre-Owned,Pre-Owned · Jordan · Air Jordan,,$27.00,1 bid,+$22.01 shipping
4,Size 11 - Jordan 5 Retro Mid Oreo,"Sold Mar 15, 2024",Pre-Owned,Pre-Owned · Jordan · Air Jordan,,$120.00,,+$14.95 shipping
...,...,...,...,...,...,...,...,...
468,Jordan 5 Retro Oreo/moonlight 2021 Size 12 Og All,"Sold Feb 16, 2024",Pre-Owned,Pre-Owned · Jordan · Air Jordan,10 product ratings - Jordan 5 Retro Oreo/moonl...,$225.00,,+$14.95 shipping
469,Size 11 - Air Jordan 5 V Retro Nike Mid Moonli...,"Sold Feb 16, 2024",Brand New,Brand New · Jordan · Air Jordan,12 product ratings - Size 11 - Air Jordan 5 V ...,$289.95,,+$14.95 shipping
470,Size 11 - Jordan 5 Oreos (will take offers).,"Sold Feb 16, 2024",Pre-Owned,Pre-Owned · Jordan · Air Jordan,12 product ratings - Size 11 - Jordan 5 Oreos ...,$120.00,,+$14.95 shipping
471,Size 10.5 - Jordan 5 Retro Mid Oreo,"Sold Feb 16, 2024",Brand New,Brand New · Jordan · Air Jordan,7 product ratings - Size 10.5 - Jordan 5 Retro...,$210.00,,+$14.95 shipping


In [None]:
# this block of code is where we do our data cleaning

import pandas as pd
import re

df = pd.DataFrame(data_cleaned)

# first, we need to convert the dates, which are strings to datetime, then we'll convert it to ordinal time
df['Sold Date'] = pd.to_datetime(df['Sold Date'].str.replace('Sold ', ''), format=' %b %d, %Y')
df['Sold Date'] = df['Sold Date'].apply(lambda x: x.toordinal())

# our models won't know what pre-owned and brand new is, so we encode pre-owned as 1 and brand new as 2 and New (Other) as 3
df['Condition'] = df['Condition'].replace({'Pre-Owned': 1, 'Brand New': 2, 'New (Other)': 3})

# if a product doesn't have any ratings, just make its entry 0, and if it does, get rid of the "x product ratings blah blah blah" so that we just have the number x
df['Ratings'] = df['Ratings'].apply(lambda x: 0 if x == 'N/A' else int(x.split()[0]))

# do the same with bids
df['Bids'] = df['Bids'].apply(lambda x: 0 if x == 'N/A' else int(x.split()[0]))

# ok now for the price, sometimes the price will be a range, so we just want to take the average of the range
def handle_price(price_str):
    price_str = price_str.replace('$', '').replace(',', '') # Remove both dollar signs and commas
    if ' to ' in price_str:
        low, high = price_str.split(' to ')
        return (float(low) + float(high)) / 2
    else:
        return float(price_str)

# for every entry in the price column, turn it into a float
df['Price'] = df['Price'].apply(handle_price)

# similarly, since the shipping column has a bunch of junk and different cases, we want a function to handle this
def handle_shipping(shipping_str):
    if 'Free' in shipping_str:
        return 0
    elif 'N/A' in shipping_str:
        return 0
    return float(shipping_str.replace('+$', '').replace(' shipping', ''))

df['Shipping'] = df['Shipping'].apply(handle_shipping)

# function to extract size from title
def extract_size(title_str):
    match = re.search(r'size\s+(\d+\.?\d*)', title_str, re.IGNORECASE) # filter only for strings that have Size then a space then a number
    if match:
        return float(match.group(1))
    return 0.0

# look through every title and apply our function and then add the result to a new column.
df['Shoe Size'] = df['Title'].apply(extract_size)
df

Unnamed: 0,Title,Sold Date,Condition,Brand,Ratings,Price,Bids,Shipping,Shoe Size
0,Nike Air Jordan 5 Retro Oreo Sz 10.5 2013,738961,1,Pre-Owned · Jordan · Air Jordan,0,119.99,0,14.95,0.0
1,Size 9 - Jordan 5 Retro oreo 2013 136027-035 B...,738961,1,Pre-Owned · Jordan · Air Jordan,0,135.00,0,14.95,9.0
2,Size 10 - Nike Air Jordan 5 V Retro OG Black W...,738960,1,Pre-Owned · Nike · Air Jordan,0,100.00,0,14.95,10.0
3,Size 8 - Jordan 5 Retro oreo 2013,738960,1,Pre-Owned · Jordan · Air Jordan,0,27.00,1,22.01,8.0
4,Size 11 - Jordan 5 Retro Mid Oreo,738960,1,Pre-Owned · Jordan · Air Jordan,0,120.00,0,14.95,11.0
...,...,...,...,...,...,...,...,...,...
468,Jordan 5 Retro Oreo/moonlight 2021 Size 12 Og All,738932,1,Pre-Owned · Jordan · Air Jordan,10,225.00,0,14.95,12.0
469,Size 11 - Air Jordan 5 V Retro Nike Mid Moonli...,738932,2,Brand New · Jordan · Air Jordan,12,289.95,0,14.95,11.0
470,Size 11 - Jordan 5 Oreos (will take offers).,738932,1,Pre-Owned · Jordan · Air Jordan,12,120.00,0,14.95,11.0
471,Size 10.5 - Jordan 5 Retro Mid Oreo,738932,2,Brand New · Jordan · Air Jordan,7,210.00,0,14.95,10.5


In [131]:
# this block of code is where we train the model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

X = df.drop(['Price', 'Title', 'Brand'], axis=1)  # Features (all columns except 'Price')
y = df['Price']  # Target (the 'Price' column)

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create model and train it
model = LinearRegression()
model.fit(X_train, y_train)

# predict it
y_pred = model.predict(X_test)

# grade the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

r2 = r2_score(y_test, y_pred)
print(f"R^2 Score: {r2}")

# now we want it to predict what the price should be for our shoe that we want to buy or sell
# the parameters we have to fill are: date, condition, ratings, bids, shipping, size
new_shoe_features = np.array([[738992, 1, 0, 5, 10.00, 9.5]])
predicted_price = model.predict(new_shoe_features.reshape(1, -1))
print(f"The predicted price of the new shoe is: ${predicted_price[0]:.2f}")

Mean Squared Error: 3638.7272873559054
R^2 Score: 0.2979291081432921
The predicted price of the new shoe is: $92.85


