You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
def scrape_website_content(self, website_url, failed_sites=None):
    """Fetch *website_url* and return its visible text, capped at 5000 words.

    Parameters:
        website_url: URL to fetch.
        failed_sites: optional list that accumulates URLs which failed.
            A fresh list is created per call when omitted (the original
            used a mutable default, which is shared across calls).

    Returns:
        A 3-tuple ``(result, extra, ok)``:
          - on success: ``({"source": url, "content": text}, "N/A", True)``
          - on failure: ``({"source": url, "content": <message>}, failed_sites, False)``
    """
    if failed_sites is None:
        failed_sites = []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://www.google.com/',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Accept-Encoding': 'gzip, deflate, br',
    }

    def is_garbled(text):
        # Treat text as mojibake when >20% of its characters fall outside
        # string.printable. Empty text cannot be garbled — test explicitly
        # instead of catching ZeroDivisionError as control flow.
        if not text:
            return False
        non_printable = sum(1 for ch in text if ch not in string.printable)
        return non_printable / len(text) > 0.2

    try:
        # GET with a hard timeout; raise_for_status turns 4xx/5xx into
        # requests.HTTPError so all failures funnel into one handler.
        response = requests.get(website_url, headers=headers, timeout=15)
        response.raise_for_status()

        # chardet may return {'encoding': None}; guard against assigning
        # None as the response encoding (fall back to UTF-8 either way).
        detected = chardet.detect(response.content)
        if detected['encoding'] and detected['confidence'] > 0.5:
            response.encoding = detected['encoding']
        else:
            response.encoding = 'utf-8'

        try:
            content = response.text
        except UnicodeDecodeError:
            # Last-resort decode: keep going with replacement characters.
            content = response.content.decode('utf-8', errors='replace')

        # Strip markup and collapse blank/whitespace-only lines.
        soup = BeautifulSoup(content, 'html.parser')
        text = soup.get_text(separator='\n')
        clean_text = '\n'.join(
            line.strip() for line in text.splitlines() if line.strip()
        )
        # Cap the payload at the first 5000 whitespace-separated words.
        clean_text_5k = ' '.join(clean_text.split()[:5000])

        if is_garbled(clean_text):
            print(f"Failed to retrieve content from {website_url} due to garbled text.")
            failed = {"source": website_url, "content": "Failed to retrieve content due to garbled text"}
            failed_sites.append(website_url)
            return failed, failed_sites, False

        return {"source": website_url, "content": clean_text_5k}, "N/A", True
    except requests.RequestException as exc:
        # The pasted original had a bare `try:` with no except clause
        # (a SyntaxError); this handler completes the failure contract
        # for timeouts, connection errors, and HTTP error statuses.
        print(f"Failed to retrieve content from {website_url} due to an error: {exc}")
        failed = {"source": website_url, "content": f"Failed to retrieve content due to an error: {exc}"}
        failed_sites.append(website_url)
        return failed, failed_sites, False
My modification:
def scrape_site_jina(self, website_url, failed_sites=None):
    """Fetch readable page text for *website_url* via the Jina AI Reader
    proxy (https://r.jina.ai/), capped at 20,000 characters.

    Parameters:
        website_url: URL to fetch (prefixed with the reader endpoint).
        failed_sites: optional list accumulating failed URLs; a fresh
            list is created per call when omitted (avoids the shared
            mutable-default pitfall of the original).

    Returns:
        A 3-tuple ``(result, extra, ok)`` matching scrape_website_content:
          - success: ``({"source": url, "content": text}, "N/A", True)``
          - failure: ``({"source": url, "content": <message>}, failed_sites, False)``
    """
    if failed_sites is None:
        failed_sites = []

    reader_prefix = "https://r.jina.ai/"
    try:
        # Timeout added: the original request could hang indefinitely.
        response = requests.get(reader_prefix + website_url, timeout=15)
    except requests.RequestException as exc:
        # Connection/DNS/timeout errors previously propagated; route them
        # through the same failure contract as a bad status code.
        print('Failed to retrieve the webpage. Error:', exc)
        failed = {"source": website_url, "content": "Failed to retrieve content due to an error: "}
        failed_sites.append(website_url)
        return failed, failed_sites, False

    if response.status_code == 200:
        # NOTE: dropped the original `print(response.text)` — it dumped the
        # entire page body to stdout on every successful fetch.
        return {"source": website_url, "content": response.text[0:20 * 1000]}, "N/A", True

    print('Failed to retrieve the webpage. Status code:', response.status_code)
    failed = {"source": website_url, "content": "Failed to retrieve content due to an error: "}
    failed_sites.append(website_url)
    return failed, failed_sites, False
Current code :
My modification:
Additional links for the Jina AI Reader API:
https://jina.ai/reader/#demo
The text was updated successfully, but these errors were encountered: