diff --git a/api.py b/api.py index 73f2237..6370541 100644 --- a/api.py +++ b/api.py @@ -97,7 +97,11 @@ def data_fetch_loop(): for webpage in webpages: print("Processing",webpage.id) try: - price = next(filter(lambda ext: ext.is_valid_url(webpage.url),extractors)).extract_data(webpage.url) + extractor = next(filter(lambda ext: ext.is_valid_url(webpage.url),extractors)) + if extractor is None: + print("No extractor found for", webpage.url) + continue + price = extractor.extract_data(webpage.url) obj = { "webpage_id": webpage.id, "url": webpage.url, @@ -114,7 +118,7 @@ def data_fetch_loop(): is_fetching = False -job = scheduler.add_job(data_fetch_loop, 'interval', seconds=180) +job = scheduler.add_job(data_fetch_loop, 'interval', seconds=60*5) scheduler.start() @@ -131,7 +135,7 @@ def wrapper(*args, **kwargs): else: return jsonify({"error": "Not authorized"}), 401 else: - func(*args, **kwargs) + return func(*args, **kwargs) return wrapper @api.before_request @@ -145,6 +149,12 @@ def syncThreads(): db_session.commit() # when your browser wants the data for the the site item + +@api.route('/force_refetch') +def force_resync(): + data_fetch_loop() + return "Gotcha!" + @api.route('/all_webpages') def everything(): return jsonify(db_session.query(Webpage).all()) diff --git a/database_sample_demo.db b/database_sample_demo.db new file mode 100644 index 0000000..a190bb4 Binary files /dev/null and b/database_sample_demo.db differ diff --git a/extractors/BestBuyExtractor.py b/extractors/BestBuyExtractor.py new file mode 100644 index 0000000..d64f12c --- /dev/null +++ b/extractors/BestBuyExtractor.py @@ -0,0 +1,18 @@ +from extractors.ExtractorBase import ExtractorBase + +from bs4 import BeautifulSoup + +class BestBuyExtractor(ExtractorBase): + def __init__(self): + super(BestBuyExtractor, self).__init__() + def extract_data(self,url): + ExtractorBase.extract_data(self,url) # superclass call + content = self.driver.page_source + soup = BeautifulSoup(content, 'lxml') + priceElem = next(soup.find('div', { + 'class': ['priceView-hero-price','priceView-customer-price'] + }).children) + return float(priceElem.getText().replace("$","")) + + def is_valid_url(self, url: str): + return url.startswith('https://www.bestbuy.com') or url.startswith('https://bestbuy.com') \ No newline at end of file diff --git a/extractors/ExtractorBase.py b/extractors/ExtractorBase.py index b8842f8..43f15c6 100644 --- a/extractors/ExtractorBase.py +++ b/extractors/ExtractorBase.py @@ -4,12 +4,36 @@ import os class ExtractorBase: + globalDriver = None + def __init__(self): + if ExtractorBase.globalDriver is not None: + self.driver = ExtractorBase.globalDriver + return if os.getenv('PHANTOMJS_LOCATION') is None: - self.driver = webdriver.Chrome(os.getenv('CHROMEDRIVER_LOCATION','/home/raymond/chromedriver')) + options = webdriver.ChromeOptions() + # options.add_argument("start-maximized") + options.add_experimental_option("excludeSwitches", ["enable-automation"]) + options.add_experimental_option('useAutomationExtension', False) + self.driver = webdriver.Chrome(os.getenv('CHROMEDRIVER_LOCATION','/home/raymond/chromedriver'), options = options) + try: + from selenium_stealth import stealth + stealth(self.driver, + languages=["en-US", "en"], + vendor="Google Inc.", + platform="Win32", + webgl_vendor="Google Inc. (NVIDIA Corporation)", + renderer="ANGLE (NVIDIA Corporation, NVIDIA GeForce GTX 1650/PCIe/SSE2, OpenGL 4.5.0 NVIDIA 470.74)", + fix_hairline=True + ) + except: + print("Selenium Stealth not found, this may increase your changes of being detected as a bot") + else: - self.driver = webdriver.PhantomJS(os.getenv('PHANTOMJS_LOCATION')) + self.driver = webdriver.PhantomJS(executable_path=os.getenv('PHANTOMJS_LOCATION')) self.driver.set_window_size(1920, 960) + # self.driver.set_page_load_timeout(60) + ExtractorBase.globalDriver = self.driver def extract_data(self, url): # return the relevant data for a url self.driver.get(url) diff --git a/extractors/GoogleShoppingExtractor..py b/extractors/GoogleShoppingExtractor..py deleted file mode 100644 index 2016f33..0000000 --- a/extractors/GoogleShoppingExtractor..py +++ /dev/null @@ -1,9 +0,0 @@ -from extractors.ExtractorBase import ExtractorBase - - -class GoogleShoppingExtractor(ExtractorBase): - def __init__(self): - pass - - def is_valid_url(self, url: str): - return url.startswith("https://google.com/") or url.startswith("https://www.google.com/") or url.startswith("https://shopping.google.com/") \ No newline at end of file diff --git a/extractors/Multiextractor.py b/extractors/Multiextractor.py index 8f97d97..dcccb65 100644 --- a/extractors/Multiextractor.py +++ b/extractors/Multiextractor.py @@ -8,7 +8,7 @@ def __init__(self): def extract_data(self,url): ExtractorBase.extract_data(self,url) # superclass call content = self.driver.page_source - soup = BeautifulSoup(content, 'html.parser') + soup = BeautifulSoup(content, 'lxml') if "amazon" in url: #for amazon # pw = soup.find('span', 'a-price-whole') @@ -19,7 +19,7 @@ def extract_data(self,url): }).children) #returns price print(priceElem.getText()) - return priceElem.getText().replace("$","")#pw.get_text() + pf.get_text() + return float(priceElem.getText().replace("$",""))#pw.get_text() + pf.get_text() elif "target" in url: #for Target @@ -28,7 +28,7 @@ def extract_data(self,url): }) #returns price only print(pw,pw.getText()) - return pw.getText().replace("$","") + return float(pw.getText().replace("$","")) elif "google" in url: #for google shopping # div class for airpod, mac, brands -> aULzUe IuHnof diff --git a/extractors/WalmartExtractor.py b/extractors/WalmartExtractor.py new file mode 100644 index 0000000..c6c6498 --- /dev/null +++ b/extractors/WalmartExtractor.py @@ -0,0 +1,18 @@ +from extractors.ExtractorBase import ExtractorBase + +from bs4 import BeautifulSoup + +class WalmartExtractor(ExtractorBase): + def __init__(self): + super(WalmartExtractor, self).__init__() + def extract_data(self,url): + ExtractorBase.extract_data(self,url) # superclass call + content = self.driver.page_source + soup = BeautifulSoup(content, 'lxml') + priceElem = soup.find('div', { + 'itemprop': "price" + }) + return float(priceElem.getText().replace("$","")) + + def is_valid_url(self, url: str): + return url.startswith('https://www.walmart.com') or url.startswith('https://walmart.com') \ No newline at end of file diff --git a/extractors/all_extractors.py b/extractors/all_extractors.py index 83741a7..3294205 100644 --- a/extractors/all_extractors.py +++ b/extractors/all_extractors.py @@ -1,5 +1,7 @@ from .Multiextractor import MultiExtractor -extractors = [MultiExtractor] +from .BestBuyExtractor import BestBuyExtractor +from .WalmartExtractor import WalmartExtractor +extractors = [MultiExtractor,BestBuyExtractor,WalmartExtractor] def create_instances(): instances = [] diff --git a/requirements.txt b/requirements.txt index 4250a45..27d703c 100755 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,8 @@ Flask-Login python-dotenv snowflake-id selenium -apscheduler \ No newline at end of file +apscheduler +flask_admin +BeautifulSoup4 +cchardet +lxml \ No newline at end of file diff --git a/static/js/all_products.js b/static/js/all_products.js index b1f037b..919b2cb 100644 --- a/static/js/all_products.js +++ b/static/js/all_products.js @@ -1,7 +1,7 @@ function migrateIfNeeded(data) { if(!data.price){ data.price = "-1"; - }else if(data.price.includes("$")){ + }else if(typeof data.price === "string" && data.price.includes("$")){ data.price = data.price.replace("$", ""); } if(typeof data.price === "string"){ @@ -66,9 +66,14 @@ docReady(() => { prodLink.href = webpageInfo.url; prodLink.innerText = webpageInfo.url; dataDiv.appendChild(prodLink); - + let priceDiv = document.createElement("div"); priceDiv.className = "product-last-price"; + if(collectedWebpageData.length == 0){ + priceDiv.innerText = "No data collected yet. "; + dataDiv.appendChild(priceDiv); + return; + } priceDiv.innerText = "Last price: " + collectedWebpageData[0].price; dataDiv.appendChild(priceDiv); @@ -77,7 +82,7 @@ docReady(() => { priceGraph.id = "price-graph-" + product.id + "-" + webpage.id; priceGraph.width = "800"; priceGraph.height = "600"; - + // maxmin let arrMax = collectedWebpageData.map(s => s.price).reduce((a,b) => Math.max(a,b)); let arrMin = collectedWebpageData.map(s => s.price).reduce((a,b) => Math.min(a,b)); @@ -137,9 +142,11 @@ docReady(() => { dataDiv.className = "product-all-data"; })); }catch(ex){ + console.log(ex); set_alert_danger("Error" + ex); } }).catch(err => { + console.log(err); set_alert_danger("Error" + err); }); }); \ No newline at end of file diff --git a/templates/layouts/main.html b/templates/layouts/main.html index c5a2ad8..b2e95d8 100644 --- a/templates/layouts/main.html +++ b/templates/layouts/main.html @@ -48,6 +48,9 @@