In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
from fake_useragent import UserAgent
import time
import pandas as pd

In [2]:
def get_page(url):
    response = requests.get(url,headers={'User-agent': UserAgent().random}) 
    
    if not response.ok:
        print('Server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
    return soup

In [3]:
df = pd.read_pickle('data/clean_products_df.pkl')

In [5]:
# Visit each product website, extract number of items sold, and store it as a new column
# Need to convert to int later. Converting is not available here yet.
# Note: Takes a long time depending on the number of rows in the data frame.

solds = []
print(f"Done scraping product...")
for i, link in enumerate(df.link):    
    try: 
        solds.append(get_page(link).find('span', attrs = {'class': 'vi-qty-pur-lnk'}).find('a').text)
        time.sleep(2)
    except: 
        solds.append(0)
        print(f"Exception happened at {i+1}th link", end = '   ')
    finally: print(f"{i+1} ", end = '   ')

df['updated_sold'] = solds
df.head(10)

Done scraping product...
Exception happened at 1th link   1    2    3    4    5    6    7    8    9    Exception happened at 10th link   10    Exception happened at 11th link   11    12    13    Exception happened at 14th link   14    15    16    17    18    Exception happened at 19th link   19    20    21    22    23    24    25    26    27    28    29    30    31    32    33    34    35    36    37    38    39    40    41    42    43    44    45    46    47    48    49    50    Server responded: 404
Exception happened at 51th link   51    52    53    54    55    56    57    58    59    60    61    Server responded: 404
Exception happened at 62th link   62    63    64    65    66    67    68    69    70    71    72    73    74    75    Exception happened at 76th link   76    77    78    79    80    81    82    83    84    85    86    87    88    89    90    91    92    93    Exception happened at 94th link   94    95    96    97    98    99    100    101    102    103    104    105   

Unnamed: 0,epid,name,link,price,rating,num_ratings,watcher,shipping,free_return,open_box,pre_owned,refurbished,benefits_charity,last_one,sold,updated_sold
0,24033837546.0,GK806 Wire Gaming Keyboard and Mouse Combo Bac...,https://www.ebay.com/itm/GK806-Wire-Gaming-Key...,29.98,4.5,4,0,0.0,0,0.0,0.0,0.0,0,0,238,0
1,,iClever 2.4G Portable Wireless Keyboard and M...,https://www.ebay.com/itm/iClever-2-4G-Portable...,20.99,,0,0,0.0,0,0.0,0.0,0.0,0,0,533,550 sold
2,,3-Colors Backlit USB Wired Gaming Keyboard Mul...,https://www.ebay.com/itm/3-Colors-Backlit-USB-...,16.99,,0,0,0.0,1,0.0,0.0,0.0,0,0,217,219 sold
3,25026719178.0,Apple Magic Keyboard 2 (MLA22LL/A) Rechargeabl...,https://www.ebay.com/itm/Apple-Magic-Keyboard-...,62.99,5.0,430,57,0.0,1,0.0,0.0,1.0,0,0,43,53 sold
4,,Mini Wireless Keyboard And Mouse Set Waterproo...,https://www.ebay.com/itm/Mini-Wireless-Keyboar...,21.98,,0,0,0.0,1,0.0,0.0,0.0,0,0,1646,"1,670 sold"
5,,Computer Desktop Gaming Keyboard and Mouse Mec...,https://www.ebay.com/itm/Computer-Desktop-Gami...,28.52,,0,0,0.0,0,0.0,0.0,0.0,0,0,819,840 sold
6,22031090242.0,HP USB Slim Keyboard,https://www.ebay.com/itm/HP-USB-Slim-Keyboard/...,14.5,4.5,20,0,0.0,1,1.0,0.0,0.0,0,0,21,30 sold
7,16030518455.0,Rosewill NEON K51 Hybrid Mechanical RGB Gaming...,https://www.ebay.com/itm/Rosewill-NEON-K51-Hyb...,34.99,4.5,20,0,0.0,1,0.0,0.0,0.0,0,0,81,88 sold
8,,Led Light Backlit Computer Desktop Gaming Wire...,https://www.ebay.com/itm/Led-Light-Backlit-Com...,24.99,,0,22,0.0,0,0.0,0.0,0.0,0,0,77,87 sold
9,,Genuine Apple Aluminum Wired USB Slim Keyboard...,https://www.ebay.com/itm/Genuine-Apple-Aluminu...,24.99,,0,0,0.0,1,0.0,1.0,0.0,0,0,0,0


In [4]:
# Remove sold == 0 when there is at least one rating or average rating is not NA.
df = df.loc[~((df.sold == 0) & (df.updated_sold == 0) & ((df.num_ratings > 0) | (~df.rating.isna())))]

# Clean updated_sold
mask = df.updated_sold.map(lambda x: "sold" in x if type(x) != int else False)
df.loc[mask, "updated_sold"] = df.loc[mask, "updated_sold"].map(lambda x: int(x.split(' sold')[0].strip().replace(',','')))
df.loc[:,"updated_sold"] = pd.to_numeric(df.updated_sold)

df['early_sold'] = df['sold']
df['later_sold'] = df['updated_sold']

# If both are non-zero, select the higher one
mask = df.early_sold >= df.later_sold
df.loc[mask, "sold"] = df.loc[mask, "early_sold"]
df.loc[~mask, "sold"] = df.loc[~mask, "later_sold"]

df = df.drop(columns = 'updated_sold')

# pd.to_pickle(df, 'data/clean_products_df.pkl')

In [5]:
# df.to_csv('clean_products_df.csv')
# Manually deleted keyboards that contains 'music', 'yamaha', 'accordion', 'New Listing' due to time
df = pd.read_csv('clean_products_df.csv')

If we want just brand new products (To reduze zero-sold items)

In [6]:
df = df.loc[(df.pre_owned == 0) & (df.open_box == 0) & (df.refurbished == 0)] # Just the brand new products
df = df.drop(columns = ['pre_owned', 'open_box', 'refurbished'])

# pd.to_pickle(df, 'data/brand_new_products.pkl')

If we want to remove rows with null epid when sold is zero (to reduce zero-sold items)

In [7]:
df = df.loc[~((df.sold == 0)&(df.epid == ''))]

  res_values = method(rvalues)


In [8]:
# pd.to_pickle(df, 'data/zero_reduced_df.pkl')