In [9]:
import numpy as np
import matplotlib.pyplot as plt # for plotting price data
import pandas as pd
from pandas.errors import EmptyDataError # error produced if empty csv if parsed

from bs4 import BeautifulSoup
import requests # fetches html content of a website, instead of urllib2 previously
from urllib.request import HTTPError # for catching timeout for website response

import time # for sleep function
from datetime import datetime # for timestamp

import os # for creation of directories

In [10]:
class AmazonPriceTracker:
    
    def __init__(self, tracker_name="tracker"):
        self.items = {"nicknames": [], "names": [], "asins": [], "urls": []}
        self.name = tracker_name
        self.PATH = "./" + self.name + "/"
        try:
            os.mkdir(str(self.name))
        except FileExistsError:
            print("This tracker already exists. Using the existing one instead.")
        
        self.price_history = {}
        self.__retrieve_items()
        
        DateTime = ["year", "month", "day", "hour", "minute"]
        self.price_history = pd.DataFrame(columns=DateTime+self.items["nicknames"])
        
        self.__retrieve_price_hist()
        self.latest_prices = self.price_history.tail(1)
        
    def __webpage2html(self, URL, parser="html.parser"):
        headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        }
        res = requests.get(URL, headers=headers)

        soup = BeautifulSoup(res.text, 'html.parser')
        return soup

        
    def add_item(self, URL, nickname):
        if "amazon" not in URL:
            print("This is not a valid amazon url.")
        else:
            ASIN = URL.split("/")[4]
            URL = "/".join(URL.split("/")[:5])
            if ASIN not in self.items["asins"]:
                print("Adding item to list of tracked items.")
                try:
                    soup = self.__webpage2html(URL)

                    # extract name
                    for element in soup.find_all("span"):
                        if "productTitle" in str(element):
                            title_containing_str = str(element)
                            break
                    title_containing_str_start = title_containing_str.find(">")+1
                    title_containing_str_end = title_containing_str.find("</")
                    title_raw = title_containing_str[title_containing_str_start:title_containing_str_end]
                    title = title_raw.replace("\n", "").replace("  ", "")

                    # save title and URL to txt
                    f = open(self.PATH + "tracked_items.txt","a", newline="\n")
                    if title not in self.items["names"]:
                        f.write(nickname + " : " + title + " : " + URL + " : " + ASIN + "\n")
                    f.close()

                    # save title and URL to dict
                    self.items["names"].append(title)
                    self.items["urls"].append(URL)
                    self.items["nicknames"].append(nickname)
                    self.items["asins"].append(ASIN)
                    print("{} was succesfully added to list of tracked items.".format(nickname))

                except HTTPError:
                    print("HTTP 503 Error, try to add item again later.")
            else:
                print("This item is already being tracked.")
            
            
    def __retrieve_items(self):
        # retrieve tracked items
        try:
            f = open(self.PATH + "tracked_items.txt", "r")
            if f.read() == "":
                print("No items are being tracked so far. \
                Please add an item to be tracked using .add_item().")
                f.close()
            else:
                f = open(self.PATH + "tracked_items.txt", "r")
                lines = f.readlines()
                for line in lines:
                    nickname, title, url, asin = line.split(" : ")
                    if asin[:-1] not in self.items["asins"]:
                        self.items["names"].append(title)
                        self.items["urls"].append(url)
                        self.items["nicknames"].append(nickname)
                        self.items["asins"].append(asin[:-1])
            f.close()
        except FileNotFoundError:
            open(self.PATH + "tracked_items.txt", "x")
    
    
    def __retrieve_price_hist(self):
        try:
            self.price_history = pd.read_csv(self.PATH + "price_history.csv")
        except FileNotFoundError:
            open(self.PATH + "price_history.csv", "x")
        except EmptyDataError:
            if len(self.items["names"]) > 0:
                print("The price history is empty so far. \
                Please fetch prices using .fetch_prices() first.")
            else:
                pass
        
        
    def wipe_database(self):
        # delete contents of files
        items = open(self.PATH + "tracked_items.txt", "w")
        items.write("")
        items.close()
        
        hist = open(self.PATH + "price_history.csv", "w")
        hist.write("")
        hist.close()
        
        
    def fetch_prices(self, URLs=None):  
        # extract price
        if URLs is None:
            URLs = self.items["urls"]
        error_status = False
        delay = 1 # delay between fetching items in s
        DateTime = ["year", "month", "day", "hour", "minute"]
        new_row = pd.DataFrame(columns=DateTime+self.items["nicknames"])
        if len(self.items["names"]) > 0:
            for n, URL in enumerate(URLs):
                try:
                    print("Fetching price for {}.".format(self.items["nicknames"][n]))
                    soup = self.__webpage2html(URL, "lxml")
                    time.sleep(delay)

                    price_str = soup.select("#priceblock_ourprice")[0].text.replace(",",".")
                    price = float(price_str[:price_str.index(".")+3])
                    item_name = self.items["nicknames"][n]
                    new_row[item_name] = [price]
                    
                except HTTPError:
                    item_name = self.items["nicknames"][n]
                    new_row[item_name] = [np.NaN]
                    print("\n A price for {} could not be fetched.".format(item_name))
                    error_status = True
        else:
            print("There is no items to fetch a price for. Please add items using .add_item() first.")

        now = datetime.now()
        datetime_vec = now.timetuple()[0:5]
        new_row[DateTime] = datetime_vec
        new_row.index = range(self.price_history.shape[0],self.price_history.shape[0]+1)
        self.price_history = self.price_history.append(new_row, sort=False, ignore_index=True)
        self.latest_prices = self.price_history.tail(1)

        # save price history
        self.price_history.to_csv(self.PATH + "price_history.csv", index_label=False, index=False)
        return error_status
                
        
    def remove_item(self):
        print("The items currently being tracked are: \n")
        for i in range(len(self.items["nicknames"])):
            print("[" + str(i) + "] --> " + self.items["nicknames"][i])
        Input = input("\n To remove an item from tracking enter the corresponding number.\
        \n To cancel, press 'Enter'. ")
        if Input.isdigit():
            item2delete_idx = int(Input)
            if item2delete_idx < len(self.items["nicknames"]):
                item_name = self.items["nicknames"][item2delete_idx]

                # remove from hist
                self.price_history = self.price_history.drop(item_name, axis=1)
                
                # remove from tracked items
                self.items["names"].pop(item2delete_idx)
                self.items["nicknames"].pop(item2delete_idx)
                self.items["urls"].pop(item2delete_idx)
                self.items["asins"].pop(item2delete_idx)
                
                # remove from corresponding .txt and .csv
                f_read = open(self.PATH + "tracked_items.txt", "r")
                lines = f_read.readlines()
                lines.pop(item2delete_idx)
                f_write = open(self.PATH + "tracked_items.txt", "w")
                f_write.write("".join(lines))
                f_read.close()
                f_write.close()
                
                self.price_history.to_csv(self.PATH + "price_history.csv", index_label=False)
                
                print("Item was removed.")
            else:
                print("The input does not correspond to an item.")
        elif Input == "":
            print("The action has been canceled.")
        else:
            print("The input is not valid.")
        

    def plot_prices(self, timescale="day"):
        fig = plt.figure(figsize=(10,6))
        time_axis = self.price_history[timescale]
        tracked_items = list(tracker.price_history.columns)[5:]
        for item in tracked_items:
            plt.plot(time_axis,tracker.price_history[item], "-o" , label=item)
        
        plt.legend()
        plt.grid()
        plt.xlabel(timescale + "s")
        plt.ylabel("Price in €")
        plt.show()
        
        
    def current_prices(self):
        self.fetch_prices()
        current_price = self.latest_prices
        return current_price

        
    def deploy(self):
        while True:
            _time = datetime.now().timetuple()[2:5]
            today = _time[0]
            hour = _time[1]
            minute = _time[2]
            try:
                prev_year, prev_month, prev_day, *_ = np.loadtxt(self.PATH + "price_history.csv", skiprows=1, delimiter=",")[-1]
            except TypeError:
                prev_year, prev_month, prev_day, *_ = np.loadtxt(self.PATH + "price_history.csv", skiprows=1, delimiter=",")
            except StopIteration:
                prev_year, prev_month, prev_day = -1, -1, -1
            print("Checking time...")
            if hour == 0 and (minute < 59 and minute > 0):
                if prev_day != today:
                    attempt = 1
                    URLs = np.array(self.items["urls"])
                    while attempt < 10:
                        try:
                            print("Attempt {} to fetch prices.".format(attempt))
                            status = self.fetch_prices(URLs)
                            if status == 0:
                                print("Fetching was a success!")
                                print("...waiting for next fetch.")
                                break
                            else:
                                latest_prices = self.price_history.iloc[-1,5:]
                                fails = np.array(latest_prices.isna())
                                URLs = URLs[fails]
                                attempt += 1
                                nicknames_of_fails = np.array(self.items["nicknames"])[fails]
                                print("Encountered an error while fetching prices for {}. Trying again in 10 min.".format(list(nicknames_of_fails)))
                                time.sleep(10*60)
                        except HTTPError:
                            print("HTTP 503 Error, trying again in 10 minutes.")
                            attempt += 1
                            time.sleep(10*60)
                else:
                    print("Item prices have already been updated today.")
            time.sleep(59*60)

In [3]:
#     def notify(self, email):
#             send email with price plot

#     def request_update(self.):
#         update prices and send email with current prices and history of them at the request
  
# def check_connectivity(self):
    
# add functionality to see how many items are left in stock if possible!!!
# add functionality to recieve an email every day with plot of price developement and if anything as changed
# add functionality to compare prices to other vendors
# e.g. open with urllib https://www.amazon.de/gp/offer-listing/ (ASIN --> B07SXMZLPK) /ref=dp_olp_new_mbc?ie=UTF8&condition=new
# and scrape webpage for all the prices in soup.select("#olpOfferList")[0].div.div

In [4]:
tracker = AmazonPriceTracker()

This tracker already exists. Using the existing one instead.


In [5]:
URL1 = "https://www.amazon.de/dp/B07H289S79/ref=sr_1_1_sspa?crid=10NNT7QDBOWO4&keywords=4+TB+Seagate&qid=1576506626&sprefix=seagate+%2Caps%2C182&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzMFZLNUFTSk5VQ0xFJmVuY3J5cHRlZElkPUEwNTEyOTI2M01OWldDVDI4SkYxMCZlbmNyeXB0ZWRBZElkPUEwNDkxOTQ0MVM0UVZGR0hKWkZUUSZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU="
URL2 = "https://www.amazon.de/dp/B07SXMZLPK/ref=sr_1_4?keywords=ryzen+3700x&qid=1577491082&sr=8-4"
URL3 = "https://www.amazon.de/dp/B07MFBLN7K/ref=sr_1_5?keywords=samsung+500gb+ssd&qid=1577491138&sr=8-5"
tracker.add_item(URL1, "Seagate 4TB HDD")
tracker.add_item(URL2, "Ryzen 3700x")
tracker.add_item(URL3, "500 GB Samsung SSD")

This item is already being tracked.
This item is already being tracked.
This item is already being tracked.


In [12]:
# tracker.deploy()

In [8]:
tracker.fetch_prices()

Fetching price for Seagate 4TB HDD.


ConnectionError: HTTPSConnectionPool(host='www.amazon.de', port=443): Max retries exceeded with url: /dp/B07H289S79 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f9c625a84a8>: Failed to establish a new connection: [Errno -2] Name or service not known'))

In [7]:
tracker.price_history

Unnamed: 0,year,month,day,hour,minute,Seagate 4TB HDD,Ryzen 3700x,500 GB Samsung SSD
0,2019,12,28,15,29,119.99,330.48,100.33
1,2019,12,28,17,8,119.99,330.48,100.33
2,2020,1,6,16,30,119.99,329.42,109.87
3,2020,1,28,23,17,119.99,339.0,111.8
4,2020,1,28,23,27,119.99,339.0,111.8
5,2020,1,28,23,28,119.99,339.0,111.8
