Skip to content

Commit

Permalink
adjust extract_photo_link to make it possible to extract more than 4 …
Browse files Browse the repository at this point in the history
…images
  • Loading branch information
neon-ninja committed Apr 29, 2021
1 parent 5eeb8ee commit 7f2305e
Showing 1 changed file with 36 additions and 19 deletions.
55 changes: 36 additions & 19 deletions facebook_scraper/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from . import utils
from .constants import FB_BASE_URL, FB_MOBILE_BASE_URL
from .fb_types import Options, Post, RawPost, RequestFunction
from .fb_types import Options, Post, RawPost, RequestFunction, Response, URL


try:
Expand Down Expand Up @@ -354,30 +354,47 @@ def extract_shares(self) -> PartialPost:
or 0,
}

def extract_photo_link_HQ(self, response: Response) -> URL:
    """Extract the full-size image URL from a photo page response.

    Searches ``response.text`` with ``self.image_regex`` and takes the first
    capture group as the image URL. Relative URLs are joined onto
    ``FB_MOBILE_BASE_URL``. If the result points at the
    ``/photo/view_full_size/`` endpoint, one extra request is made and the
    ``href`` of the first ``<a>`` on the returned page is used instead,
    unless that request ended up on ``login.php``.

    Args:
        response: The already-fetched photo page.

    Returns:
        The image URL, or ``None`` when ``self.image_regex`` does not match.
    """
    match = self.image_regex.search(response.text)
    if not match:
        return None

    # The URL is lifted from raw HTML, so ampersands arrive entity-encoded.
    url = match.groups()[0].replace("&amp;", "&")
    if not url.startswith("http"):
        url = utils.urljoin(FB_MOBILE_BASE_URL, url)

    if url.startswith(utils.urljoin(FB_MOBILE_BASE_URL, "/photo/view_full_size/")):
        # Try resolve redirect
        logger.debug(f"Fetching {url}")
        redirect_response = self.request(url)
        if not redirect_response.url.startswith(utils.urljoin(FB_MOBILE_BASE_URL, "login.php")):
            # Resolution is best-effort: keep the unresolved URL when the
            # page has no anchor or the anchor has no href, instead of
            # raising AttributeError.
            anchor = redirect_response.html.find("a", first=True)
            href = anchor.attrs.get("href") if anchor is not None else None
            if href:
                url = href.replace("&amp;", "&")
    return url

def extract_photo_link(self) -> PartialPost:
# NOTE(review): this span is a flattened diff view, not a runnable body.
# Indentation is lost and the removed (pre-commit) and added (post-commit)
# lines are interleaved without +/- markers. The section comments below mark
# which is which; this split matches the "36 additions and 19 deletions"
# reported in the commit header, but confirm against the repository.
#
# Purpose (post-commit version): collect image URLs for every photo linked
# from the post element and return them as {"image": first-or-None,
# "images": [...]}.

# Do nothing when the "allow_extra_requests" option is falsy (default True).
if not self.options.get("allow_extra_requests", True):
return None
images = []
# --- removed by this commit (presumably): photo links found via the
# --- self.photo_link / self.photo_link_2 regexes over the element's HTML.
matches = list(self.photo_link.finditer(self.element.html))
if not matches:
matches = self.photo_link_2.finditer(self.element.html)

for match in matches:
url = utils.urljoin(FB_MOBILE_BASE_URL, match.groups()[0])
# --- added by this commit (presumably): photo links found via a CSS
# --- selector for anchors whose href contains 'photo.php' or '/photos/'.
photo_links = self.element.find("a[href*='photo.php'],a[href*='/photos/']")
total_photos_in_gallery = len(photo_links)
# When exactly four links are found and the last one has text, that text is
# stripped of "+" and parsed as an int, then added to 4 (presumably a "+N"
# overlay on the fourth thumbnail - confirm).
# NOTE(review): int() raises ValueError if the text is not numeric.
if len(photo_links) == 4 and photo_links[-1].text:
total_photos_in_gallery = 4 + int(photo_links[-1].text.strip("+"))
logger.debug(f"{total_photos_in_gallery} total photos in gallery")

# This gets up to 4 images in gallery
for link in photo_links:
url = utils.urljoin(FB_MOBILE_BASE_URL, link.attrs["href"])
# --- unchanged context: fetch the photo page.
logger.debug(f"Fetching {url}")
response = self.request(url)
# --- added by this commit (presumably): the per-page extraction is delegated
# --- to extract_photo_link_HQ, which returns None when its regex does not
# --- match, so `images` may contain None entries.
images.append(self.extract_photo_link_HQ(response))

# Keep requesting pages until `images` has as many entries as the computed
# gallery total.
# NOTE(review): `response` is only bound inside the for loop above, and the
# find(..., first=True) result is dereferenced without a None check.
while len(images) < total_photos_in_gallery:
# More photos to fetch. Follow the right arrow link of the last image we were on
url = response.html.find('a.touchable[data-gt=\'{"tn":"+="}\']', first=True).attrs["href"]
if not url.startswith("http"):
url = utils.urljoin(FB_MOBILE_BASE_URL, url)
logger.debug(f"Fetching {url}")
response = self.request(url)
# --- removed by this commit (presumably): the inline image extraction that
# --- was moved into extract_photo_link_HQ.
html = response.text
match = self.image_regex.search(html)
if match:
url = match.groups()[0].replace("&amp;", "&")
if not url.startswith("http"):
url = utils.urljoin(FB_MOBILE_BASE_URL, url)
if url.startswith(utils.urljoin(FB_MOBILE_BASE_URL, "/photo/view_full_size/")):
# Try resolve redirect
response = self.request(url)
if not response.url.startswith(utils.urljoin(FB_MOBILE_BASE_URL, "login.php")):
url = response.html.find("a", first=True).attrs.get("href").replace("&amp;", "&")
images.append(url)
# --- added by this commit (presumably): replaces the removed inline block.
images.append(self.extract_photo_link_HQ(response))
# --- unchanged context: "image" is the first collected URL, or None when
# --- nothing was collected; "images" is the full list.
image = images[0] if images else None
return {"image": image, "images": images}

Expand Down

1 comment on commit 7f2305e

@gaoyunzhi
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@neon-ninja Thank you!

Please sign in to comment.