16 changes: 13 additions & 3 deletions api.py
@@ -97,7 +97,11 @@ def data_fetch_loop():
for webpage in webpages:
print("Processing",webpage.id)
try:
price = next(filter(lambda ext: ext.is_valid_url(webpage.url),extractors)).extract_data(webpage.url)
extractor = next(filter(lambda ext: ext.is_valid_url(webpage.url), extractors), None)  # None default, so a non-matching URL doesn't raise StopIteration
if extractor is None:
print("No extractor found for", webpage.url)
continue
price = extractor.extract_data(webpage.url)
obj = {
"webpage_id": webpage.id,
"url": webpage.url,
@@ -114,7 +118,7 @@ def data_fetch_loop():

is_fetching = False

job = scheduler.add_job(data_fetch_loop, 'interval', seconds=180)
job = scheduler.add_job(data_fetch_loop, 'interval', seconds=60*5)

scheduler.start()

@@ -131,7 +135,7 @@ def wrapper(*args, **kwargs):
else:
return jsonify({"error": "Not authorized"}), 401
else:
func(*args, **kwargs)
return func(*args, **kwargs)
return wrapper

@api.before_request
@@ -145,6 +149,12 @@ def syncThreads():
db_session.commit()

# when your browser wants the data for the site item

@api.route('/force_refetch')
def force_resync():
data_fetch_loop()
return "Gotcha!"

@api.route('/all_webpages')
def everything():
return jsonify(db_session.query(Webpage).all())
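Note on the data_fetch_loop hunk above: the None check only works because next() is given a None default; a bare next(filter(...)) raises StopIteration when no extractor matches the URL. A minimal sketch of the guarded-lookup pattern, with a stand-in extractor class and URL rather than the app's real ones:

class DummyExtractor:
    def is_valid_url(self, url):
        return url.startswith("https://example.com")

    def extract_data(self, url):
        return 9.99

extractors = [DummyExtractor()]

def find_extractor(url):
    # The None default keeps next() from raising StopIteration
    # when no extractor claims the URL.
    return next((ext for ext in extractors if ext.is_valid_url(url)), None)

extractor = find_extractor("https://example.com/item/1")
if extractor is None:
    print("No extractor found for that URL")
else:
    print(extractor.extract_data("https://example.com/item/1"))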
Binary file added database_sample_demo.db
Binary file not shown.
18 changes: 18 additions & 0 deletions extractors/BestBuyExtractor.py
@@ -0,0 +1,18 @@
from extractors.ExtractorBase import ExtractorBase

from bs4 import BeautifulSoup

class BestBuyExtractor(ExtractorBase):
def __init__(self):
super(BestBuyExtractor, self).__init__()
def extract_data(self,url):
ExtractorBase.extract_data(self,url) # superclass call
content = self.driver.page_source
soup = BeautifulSoup(content, 'lxml')
priceElem = next(soup.find('div', {
'class': ['priceView-hero-price','priceView-customer-price']
}).children)
return float(priceElem.getText().replace("$",""))

def is_valid_url(self, url: str):
return url.startswith('https://www.bestbuy.com') or url.startswith('https://bestbuy.com')
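BestBuyExtractor assumes the price container is always present; soup.find() returns None when Best Buy changes its markup or serves a bot-detection interstitial, which would raise AttributeError here. A hedged sketch of a defensive variant (parse_bestbuy_price is illustrative, not part of the PR):

from bs4 import BeautifulSoup

def parse_bestbuy_price(html):
    soup = BeautifulSoup(html, "lxml")
    container = soup.find("div", {
        "class": ["priceView-hero-price", "priceView-customer-price"]
    })
    if container is None:
        # Markup changed, or a bot-detection page came back.
        return None
    price_elem = next(container.children, None)
    if price_elem is None:
        return None
    return float(price_elem.get_text().replace("$", "").replace(",", ""))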
28 changes: 26 additions & 2 deletions extractors/ExtractorBase.py
@@ -4,12 +4,36 @@

import os
class ExtractorBase:
globalDriver = None

def __init__(self):
if ExtractorBase.globalDriver is not None:
self.driver = ExtractorBase.globalDriver
return
if os.getenv('PHANTOMJS_LOCATION') is None:
self.driver = webdriver.Chrome(os.getenv('CHROMEDRIVER_LOCATION','/home/raymond/chromedriver'))
options = webdriver.ChromeOptions()
# options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
self.driver = webdriver.Chrome(os.getenv('CHROMEDRIVER_LOCATION','/home/raymond/chromedriver'), options = options)
try:
from selenium_stealth import stealth
stealth(self.driver,
languages=["en-US", "en"],
vendor="Google Inc.",
platform="Win32",
webgl_vendor="Google Inc. (NVIDIA Corporation)",
renderer="ANGLE (NVIDIA Corporation, NVIDIA GeForce GTX 1650/PCIe/SSE2, OpenGL 4.5.0 NVIDIA 470.74)",
fix_hairline=True
)
except ImportError:
print("Selenium Stealth not found; this may increase your chances of being detected as a bot")

else:
self.driver = webdriver.PhantomJS(os.getenv('PHANTOMJS_LOCATION'))
self.driver = webdriver.PhantomJS(executable_path=os.getenv('PHANTOMJS_LOCATION'))
self.driver.set_window_size(1920, 960)
# self.driver.set_page_load_timeout(60)
ExtractorBase.globalDriver = self.driver
def extract_data(self, url):
# return the relevant data for a url
self.driver.get(url)
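The ExtractorBase change caches one WebDriver on the class, so every extractor subclass shares a single browser session instead of each launching its own. A minimal sketch of that class-level cache, with a fake driver standing in for Selenium:

class FakeDriver:
    def get(self, url):
        print("fetching", url)

class ExtractorBase:
    globalDriver = None  # shared by all instances and subclasses

    def __init__(self):
        if ExtractorBase.globalDriver is None:
            # The first instantiation pays the browser start-up cost...
            ExtractorBase.globalDriver = FakeDriver()
        # ...and every later extractor reuses the same session.
        self.driver = ExtractorBase.globalDriver

a, b = ExtractorBase(), ExtractorBase()
assert a.driver is b.driver  # one browser for the whole process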
9 changes: 0 additions & 9 deletions extractors/GoogleShoppingExtractor..py

This file was deleted.

6 changes: 3 additions & 3 deletions extractors/Multiextractor.py
@@ -8,7 +8,7 @@ def __init__(self):
def extract_data(self,url):
ExtractorBase.extract_data(self,url) # superclass call
content = self.driver.page_source
soup = BeautifulSoup(content, 'html.parser')
soup = BeautifulSoup(content, 'lxml')
if "amazon" in url:
#for amazon
# pw = soup.find('span', 'a-price-whole')
@@ -19,7 +19,7 @@ def extract_data(self,url):
}).children)
#returns price
print(priceElem.getText())
return priceElem.getText().replace("$","")#pw.get_text() + pf.get_text()
return float(priceElem.getText().replace("$",""))#pw.get_text() + pf.get_text()

elif "target" in url:
#for Target
@@ -28,7 +28,7 @@
})
#returns price only
print(pw,pw.getText())
return pw.getText().replace("$","")
return float(pw.getText().replace("$",""))
elif "google" in url:
#for google shopping
# div class for airpod, mac, brands -> aULzUe IuHnof
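Both MultiExtractor branches now coerce the scraped text to float the same way, and the new per-site extractors repeat that parsing; a shared helper would keep it in one place. A small sketch of that idea (parse_price is illustrative, not in the PR):

def parse_price(text):
    # Strip currency symbol, thousands separators, and stray whitespace.
    cleaned = text.strip().replace("$", "").replace(",", "")
    return float(cleaned)

assert parse_price("$1,299.99") == 1299.99
assert parse_price(" $49.00 ") == 49.0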
18 changes: 18 additions & 0 deletions extractors/WalmartExtractor.py
@@ -0,0 +1,18 @@
from extractors.ExtractorBase import ExtractorBase

from bs4 import BeautifulSoup

class WalmartExtractor(ExtractorBase):
def __init__(self):
super(WalmartExtractor, self).__init__()
def extract_data(self,url):
ExtractorBase.extract_data(self,url) # superclass call
content = self.driver.page_source
soup = BeautifulSoup(content, 'lxml')
priceElem = soup.find('div', {
'itemprop': "price"
})
return float(priceElem.getText().replace("$",""))

def is_valid_url(self, url: str):
return url.startswith('https://www.walmart.com') or url.startswith('https://walmart.com')
4 changes: 3 additions & 1 deletion extractors/all_extractors.py
@@ -1,5 +1,7 @@
from .Multiextractor import MultiExtractor
extractors = [MultiExtractor]
from .BestBuyExtractor import BestBuyExtractor
from .WalmartExtractor import WalmartExtractor
extractors = [MultiExtractor, BestBuyExtractor, WalmartExtractor]

def create_instances():
instances = []
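all_extractors.py keeps a registry of extractor classes, and the truncated create_instances() appears to instantiate each one. A sketch of that registry pattern under that assumption (the stub classes stand in for the real extractors):

class MultiExtractor: ...
class BestBuyExtractor: ...
class WalmartExtractor: ...

extractors = [MultiExtractor, BestBuyExtractor, WalmartExtractor]

def create_instances():
    # One instance per registered class; in the real code all of them
    # would share the class-level driver from ExtractorBase.
    return [cls() for cls in extractors]

print([type(inst).__name__ for inst in create_instances()])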
6 changes: 5 additions & 1 deletion requirements.txt
@@ -10,4 +10,8 @@ Flask-Login
python-dotenv
snowflake-id
selenium
apscheduler
apscheduler
flask_admin
BeautifulSoup4
cchardet
lxml
13 changes: 10 additions & 3 deletions static/js/all_products.js
@@ -1,7 +1,7 @@
function migrateIfNeeded(data) {
if(!data.price){
data.price = "-1";
}else if(data.price.includes("$")){
}else if(typeof data.price === "string" && data.price.includes("$")){
data.price = data.price.replace("$", "");
}
if(typeof data.price === "string"){
@@ -66,9 +66,14 @@ docReady(() => {
prodLink.href = webpageInfo.url;
prodLink.innerText = webpageInfo.url;
dataDiv.appendChild(prodLink);

let priceDiv = document.createElement("div");
priceDiv.className = "product-last-price";
if(collectedWebpageData.length == 0){
priceDiv.innerText = "No data collected yet. ";
dataDiv.appendChild(priceDiv);
return;
}
priceDiv.innerText = "Last price: " + collectedWebpageData[0].price;
dataDiv.appendChild(priceDiv);

@@ -77,7 +82,7 @@
priceGraph.id = "price-graph-" + product.id + "-" + webpage.id;
priceGraph.width = "800";
priceGraph.height = "600";

// maxmin
let arrMax = collectedWebpageData.map(s => s.price).reduce((a,b) => Math.max(a,b));
let arrMin = collectedWebpageData.map(s => s.price).reduce((a,b) => Math.min(a,b));
@@ -137,9 +142,11 @@ docReady(() => {
dataDiv.className = "product-all-data";
}));
}catch(ex){
console.log(ex);
set_alert_danger("Error" + ex);
}
}).catch(err => {
console.log(err);
set_alert_danger("Error" + err);
});
});
4 changes: 4 additions & 0 deletions templates/layouts/main.html
@@ -48,6 +48,9 @@
<li class="nav-item active">
<a class="nav-link" href="/">Home</a>
</li>
<li class="nav-item">
<a class="nav-link" href={{ url_for('products') }}>Product Listing</a>
</li>
<li class="nav-item">
<a class="nav-link" href={{ url_for('create_product') }}>Create Product</a>
</li>
@@ -57,6 +60,7 @@
</a>
<div class="dropdown-menu" aria-labelledby="navbarDropdown">
<a class="dropdown-item" href={{ url_for('about' )}}>About</a>
<a class="dropdown-item" href={{ url_for('add_webpage' )}}>Add Webpage</a>
</div>
</li>
</ul>
2 changes: 1 addition & 1 deletion templates/pages/products.html
@@ -1,5 +1,5 @@
{% extends 'layouts/main.html' %}
{% block title %}Add Webpage{% endblock %}
{% block title %}Product Status{% endblock %}
{% block content %}
<script src="https://cdn.jsdelivr.net/npm/chart.js@3.6.0/dist/chart.min.js" integrity="sha256-7lWo7cjrrponRJcS6bc8isfsPDwSKoaYfGIHgSheQkk=" crossorigin="anonymous"></script>
<script src="/static/js/all_products.js"></script>