In [1]:
from bs4 import BeautifulSoup
import urllib.request
from urllib.request import HTTPError # for catching timeout for website response
import numpy as np
import pandas as pd
from pandas.errors import EmptyDataError # error produced if empty csv if parsed
import time # for sleep function
from datetime import datetime # for timestamp
import matplotlib.pyplot as plt # for plotting price data

In [6]:
class Tracker:
    """Track the prices of web-shop items and persist them between sessions.

    State is kept in two files in the current working directory:

    * ``tracked_items.txt`` -- one item per line, formatted as
      ``nickname : title : url``.
    * ``price_history.csv`` -- one row per call to :meth:`fetch_prices`, with
      the columns ``year, month, day, hour, minute`` followed by one column
      per item nickname.

    Attributes:
        items: dict with the parallel lists ``"nicknames"``, ``"names"``
            (scraped page titles) and ``"urls"``.
        price_history: ``pandas.DataFrame`` holding the recorded prices.

    The scraping looks for ``<span>`` elements whose markup contains
    ``productTitle`` (title) or ``a-color-price price`` (price).
    """

    # Columns that store the timestamp of every price_history row.
    _DATETIME_COLUMNS = ("year", "month", "day", "hour", "minute")
    _ITEMS_FILE = "tracked_items.txt"
    _HISTORY_FILE = "price_history.csv"
    # Separator between nickname, title and URL in tracked_items.txt.
    _FIELD_SEPARATOR = " : "

    def __init__(self):
        """Load tracked items and the price history, creating missing files."""
        self.items = {"nicknames": [], "names": [], "urls": []}
        self.__retrieve_items()

        # Start from an empty history that already has one column per item;
        # it is replaced below if a non-empty CSV exists.
        self.price_history = pd.DataFrame(
            columns=list(self._DATETIME_COLUMNS) + self.items["nicknames"]
        )
        self.__retrieve_price_hist()

    def add_item(self, URL, nickname):
        """Start tracking the product at ``URL`` under the name ``nickname``.

        The page is downloaded once to scrape its title. The item is appended
        to ``tracked_items.txt`` and to ``self.items``. Nothing is added (and a
        message is printed) when the URL, the nickname or the scraped title is
        already tracked, when the page answers with an HTTP error, or when no
        title can be found on the page.

        Args:
            URL: address of the product page.
            nickname: short label; used as the column name in the history.
        """
        if URL in self.items["urls"] or nickname in self.items["nicknames"]:
            print("This item is already being tracked.")
            return

        # A separator or line break inside the nickname would make the line
        # in tracked_items.txt impossible to parse back.
        if self._FIELD_SEPARATOR in nickname or "\n" in nickname:
            print("The nickname must not contain ' : ' or line breaks.")
            return

        try:
            with urllib.request.urlopen(URL) as response:
                html_doc = response.read()
        except HTTPError:
            print("HTTP 503 Error, try to add item again later.")
            return

        title = self._extract_title(BeautifulSoup(html_doc, 'html.parser'))
        if title is None:
            print("No product title was found on the page. The item was not added.")
            return
        if title in self.items["names"]:
            print("This item is already being tracked.")
            return

        # Persist first, then mirror the new entry in memory.
        with open(self._ITEMS_FILE, "a", newline="\n") as items_file:
            items_file.write(self._FIELD_SEPARATOR.join((nickname, title, URL)) + "\n")

        self.items["names"].append(title)
        self.items["urls"].append(URL)
        self.items["nicknames"].append(nickname)

    @staticmethod
    def _extract_title(soup):
        """Return the text of the first span mentioning ``productTitle``, or None."""
        for element in soup.find_all("span"):
            markup = str(element)
            if "productTitle" in markup:
                start = markup.find(">") + 1
                end = markup.find("</")
                return markup[start:end].replace("\n", "").replace("  ", "")
        return None

    @staticmethod
    def _parse_price(markup):
        """Turn the digits found in ``markup`` into a price with two decimals.

        All digit characters are concatenated and the last two are treated as
        the fractional part (``"109,99"`` -> ``109.99``). Returns None when
        the markup holds no usable digits.
        """
        digits = "".join(ch for ch in markup if ch.isdigit())
        if not digits:
            return None
        try:
            return float(digits[:-2] + "." + digits[-2:])
        except ValueError:
            # str.isdigit() accepts characters (e.g. superscripts) float() rejects.
            return None

    @classmethod
    def _extract_price(cls, soup):
        """Return the price from the first ``a-color-price price`` span, or None."""
        for element in soup.find_all("span"):
            markup = str(element)
            if "a-color-price price" in markup:
                return cls._parse_price(markup)
        return None

    def _save_items(self):
        """Rewrite ``tracked_items.txt`` from the in-memory item lists."""
        with open(self._ITEMS_FILE, "w", newline="\n") as items_file:
            for nickname, title, url in zip(
                self.items["nicknames"], self.items["names"], self.items["urls"]
            ):
                items_file.write(self._FIELD_SEPARATOR.join((nickname, title, url)) + "\n")

    def __retrieve_items(self):
        """Fill ``self.items`` from ``tracked_items.txt`` (created if missing)."""
        try:
            with open(self._ITEMS_FILE, "r") as items_file:
                # Strip the line break so the stored URL matches what
                # add_item() receives.
                lines = [line.rstrip("\r\n") for line in items_file]
        except FileNotFoundError:
            with open(self._ITEMS_FILE, "x"):
                pass
            return

        entries = [line for line in lines if line.strip()]
        if not entries:
            print("No items are being tracked so far. Please add an item to be tracked using .add_item().")
            return

        for line in entries:
            # The nickname is the first field and the URL the last one, so a
            # title that itself contains " : " is still parsed correctly.
            nickname, first_sep, rest = line.partition(self._FIELD_SEPARATOR)
            title, last_sep, url = rest.rpartition(self._FIELD_SEPARATOR)
            if not first_sep or not last_sep:
                print("Skipping malformed line in tracked_items.txt: " + line)
                continue
            if title not in self.items["names"]:
                self.items["names"].append(title)
                self.items["urls"].append(url)
                self.items["nicknames"].append(nickname)

    def __retrieve_price_hist(self):
        """Load ``price_history.csv`` into ``self.price_history``.

        A missing file is created empty; an empty file leaves the history
        prepared by ``__init__`` untouched.
        """
        try:
            self.price_history = pd.read_csv(self._HISTORY_FILE)
        except FileNotFoundError:
            with open(self._HISTORY_FILE, "x"):
                pass
        except EmptyDataError:
            if self.items["names"]:
                print("The price history is empty so far. Please fetch prices using .fetch_prices() first.")

    def wipe_database(self):
        """Truncate ``tracked_items.txt`` and ``price_history.csv``.

        Only the files are emptied; ``self.items`` and ``self.price_history``
        keep their current in-memory content.
        """
        for path in (self._ITEMS_FILE, self._HISTORY_FILE):
            with open(path, "w"):
                pass

    def fetch_prices(self):
        """Scrape the current price of every tracked item and record one row.

        Each item is fetched once. An item whose page answers with an HTTP
        error, or on which no price can be found, is recorded as NaN. The new
        row is stamped with the current local time, appended to
        ``self.price_history`` and the history is written to
        ``price_history.csv``.
        """
        if not self.items["names"]:
            print("There is no items to fetch a price for. Please add items using .add_item() first.")
            return

        delay = 1  # pause after each downloaded page, in seconds
        prices = {}
        for nickname, url in zip(self.items["nicknames"], self.items["urls"]):
            try:
                with urllib.request.urlopen(url) as response:
                    html_doc = response.read()
            except HTTPError:
                prices[nickname] = np.nan
                continue

            soup = BeautifulSoup(html_doc, 'html.parser')
            time.sleep(delay)

            price = self._extract_price(soup)
            prices[nickname] = np.nan if price is None else price

        # Timestamp columns first, then one column per item.
        row = dict(zip(self._DATETIME_COLUMNS, datetime.now().timetuple()[0:5]))
        row.update(prices)

        # DataFrame.append no longer exists in pandas >= 2.0; concat is the
        # replacement. ignore_index gives the new row the next integer label.
        self.price_history = pd.concat(
            [self.price_history, pd.DataFrame([row])], ignore_index=True, sort=False
        )

        # save price history
        self.price_history.to_csv(self._HISTORY_FILE, index_label=False)

    def remove_item(self):
        """Interactively stop tracking one item.

        Lists the tracked nicknames, asks for the number of the item to
        remove and then deletes it from ``self.items``, from
        ``self.price_history`` and from both files. An empty answer cancels.
        """
        print("The items currently being tracked are: \n")
        for position, nickname in enumerate(self.items["nicknames"]):
            print("[" + str(position) + "] --> " + nickname)
        Input = input("\nTo remove an item from tracking enter the corresponding number.\nTo cancel, press 'Enter' ")

        if Input == "":
            print("The action has been canceled.")
            return
        # isdecimal() only accepts characters that int() can convert.
        if not Input.isdecimal():
            print("The input is not valid.")
            return

        item2delete_idx = int(Input)
        if item2delete_idx >= len(self.items["nicknames"]):
            print("The input does not correspond to an item.")
            return

        item_name = self.items["nicknames"][item2delete_idx]

        # An item that was added but never fetched has no column yet.
        self.price_history = self.price_history.drop(columns=[item_name], errors="ignore")

        for key in ("names", "nicknames", "urls"):
            self.items[key].pop(item2delete_idx)

        # Rewrite both files from the in-memory state so they cannot drift
        # apart from it.
        self._save_items()
        self.price_history.to_csv(self._HISTORY_FILE, index_label=False)

        print("Item was removed.")

    def plot_prices(self, timescale="day"):
        """Plot the recorded price of every item against a timestamp column.

        Args:
            timescale: name of the ``price_history`` column used as x axis
                (one of ``year``, ``month``, ``day``, ``hour``, ``minute``).
        """
        _, ax = plt.subplots(figsize=(10, 6))
        time_axis = self.price_history[timescale]
        tracked_items = [
            column for column in self.price_history.columns
            if column not in self._DATETIME_COLUMNS
        ]
        for item in tracked_items:
            ax.plot(time_axis, self.price_history[item], "-o", label=item)

        ax.legend()
        ax.grid()
        ax.set_xlabel(timescale + "s")
        ax.set_ylabel("Price in €")
        plt.show()

#     def deploy(self):
#         TODO: loop once every day; if not looping, ask for items to add
        
    
# TODO: add functionality to see how many items are left in stock, if possible.

In [7]:
# Create the tracker. The constructor reads tracked_items.txt and
# price_history.csv from the working directory and creates them if missing.
tracker = Tracker()

The price history is empty so far. Please fetch prices using .fetch_prices() first.


In [4]:
# amazon.de product pages to track; each is registered below under a short
# nickname, which becomes the item's column name in tracker.price_history.
URL1 = "https://www.amazon.de/dp/B07H289S79/ref=sr_1_1_sspa?crid=10NNT7QDBOWO4&keywords=4+TB+Seagate&qid=1576506626&sprefix=seagate+%2Caps%2C182&sr=8-1-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEzMFZLNUFTSk5VQ0xFJmVuY3J5cHRlZElkPUEwNTEyOTI2M01OWldDVDI4SkYxMCZlbmNyeXB0ZWRBZElkPUEwNDkxOTQ0MVM0UVZGR0hKWkZUUSZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU="
URL2 = "https://www.amazon.de/gp/product/B07SXMZLPK/ref=ox_sc_saved_title_3?smid=A133RA3ZUAU4I7&psc=1"
URL3 = "https://www.amazon.de/gp/product/B07MFBLN7K/ref=ox_sc_saved_title_4?smid=A1NW99WOC7UQ45&psc=1"
# add_item() downloads each page once to scrape its title.
tracker.add_item(URL1, "Seagate 4TB HDD")
tracker.add_item(URL2, "Ryzen 3700x")
tracker.add_item(URL3, "500 GB Samsung SSD")

In [12]:
# Uncomment the next line to scrape the current prices and append them as a
# new row before the history is displayed.
# tracker.fetch_prices()
print(tracker.price_history)

   year month day hour minute  Seagate 4TB HDD  500 GB Samsung SSD
0  2019    12  16   23     53           109.99              117.63
