From 15c3c64ad8c0c92f07b8e736334ca88c0a3e9d32 Mon Sep 17 00:00:00 2001 From: TheMulti0 Date: Thu, 5 Nov 2020 07:45:34 +0200 Subject: [PATCH 01/10] Added fallback time extraction engine --- facebook_scraper/extractors.py | 14 +++++++++++++- requirements-dev.txt | 1 + requirements.txt | 4 ++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/facebook_scraper/extractors.py b/facebook_scraper/extractors.py index 69a90050..7a32ba87 100644 --- a/facebook_scraper/extractors.py +++ b/facebook_scraper/extractors.py @@ -6,6 +6,8 @@ from json import JSONDecodeError from typing import Any, Dict, Optional +import dateparser + from . import utils from .constants import FB_BASE_URL, FB_MOBILE_BASE_URL from .fb_types import RawPost, Options, Post, RequestFunction @@ -38,6 +40,7 @@ class PostExtractor: comments_regex = re.compile(r'cmt_def[^>]*>([0-9,.]+)') shares_regex = re.compile(r'([0-9,.]+)\s+Shares', re.IGNORECASE) link_regex = re.compile(r"href=\"https:\/\/lm\.facebook\.com\/l\.php\?u=(.+?)\&h=") + time_regex = re.compile(r"(\w+ \d{2} at \d{2}:\d{2} (AM|PM))|(\d{1,2} \w+)") photo_link = re.compile(r'href=\"(/[^\"]+/photos/[^\"]+?)\"') image_regex = re.compile( @@ -203,7 +206,16 @@ def extract_time(self) -> PartialPost: except (KeyError, ValueError): continue - return None + try: + time_match = self.time_regex.search(self.element.full_text) + if time_match: + time = time_match.group(0) + return { + # 'time': datetime.strptime(time, '%B %d at %I:%M %p').replace(year=datetime.now().year), + 'time': dateparser.parse(time) + } + except: + return None def extract_user_id(self) -> PartialPost: return {'user_id': self.data_ft['content_owner_id_new']} diff --git a/requirements-dev.txt b/requirements-dev.txt index 9fb770ed..0541b2df 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -223,3 +223,4 @@ yarl==1.4.2; python_version >= "3.6" \ zipp==3.1.0; python_version < "3.8" \ --hash=sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b \ --hash=sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96 +dateparser~=1.0.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8d7492fd..60339a34 100644 --- a/requirements.txt +++ b/requirements.txt @@ -92,3 +92,7 @@ websockets==8.0.2 \ --hash=sha256:f5cb2683367e32da6a256b60929a3af9c29c212b5091cf5bace9358d03011bf5 \ --hash=sha256:049e694abe33f8a1d99969fee7bfc0ae6761f7fd5f297c58ea933b27dd6805f2 \ --hash=sha256:882a7266fa867a2ebb2c0baaa0f9159cabf131cf18c1b4270d79ad42f9208dc5 + +html2text~=2020.1.16 +requests~=2.24.0 +dateparser~=1.0.0 \ No newline at end of file From b04c5ac15502d7dfb1bde76db08b0601ac7e1788 Mon Sep 17 00:00:00 2001 From: TheMulti0 Date: Thu, 5 Nov 2020 07:51:18 +0200 Subject: [PATCH 02/10] Return none without an exception failure in extract_time --- facebook_scraper/extractors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/facebook_scraper/extractors.py b/facebook_scraper/extractors.py index 7a32ba87..62061b34 100644 --- a/facebook_scraper/extractors.py +++ b/facebook_scraper/extractors.py @@ -211,12 +211,13 @@ def extract_time(self) -> PartialPost: if time_match: time = time_match.group(0) return { - # 'time': datetime.strptime(time, '%B %d at %I:%M %p').replace(year=datetime.now().year), 'time': dateparser.parse(time) } except: return None + return None + def extract_user_id(self) -> PartialPost: return {'user_id': self.data_ft['content_owner_id_new']} From 72373efdcb2bf9b9c88623851efadf001e3fc8e3 Mon Sep 17 00:00:00 2001 From: TheMulti0 Date: Fri, 6 Nov 2020 11:37:19 +0200 Subject: [PATCH 03/10] Integrated more comprehensive datetime regex --- facebook_scraper/extractors.py | 10 +++------- facebook_scraper/utils.py | 35 ++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/facebook_scraper/extractors.py b/facebook_scraper/extractors.py index 62061b34..2c6df43f 100644 --- a/facebook_scraper/extractors.py +++ b/facebook_scraper/extractors.py @@ -6,8 +6,6 @@ from json import JSONDecodeError from typing import Any, Dict, Optional -import dateparser - from . import utils from .constants import FB_BASE_URL, FB_MOBILE_BASE_URL from .fb_types import RawPost, Options, Post, RequestFunction @@ -40,7 +38,6 @@ class PostExtractor: comments_regex = re.compile(r'cmt_def[^>]*>([0-9,.]+)') shares_regex = re.compile(r'([0-9,.]+)\s+Shares', re.IGNORECASE) link_regex = re.compile(r"href=\"https:\/\/lm\.facebook\.com\/l\.php\?u=(.+?)\&h=") - time_regex = re.compile(r"(\w+ \d{2} at \d{2}:\d{2} (AM|PM))|(\d{1,2} \w+)") photo_link = re.compile(r'href=\"(/[^\"]+/photos/[^\"]+?)\"') image_regex = re.compile( @@ -207,11 +204,10 @@ def extract_time(self) -> PartialPost: continue try: - time_match = self.time_regex.search(self.element.full_text) - if time_match: - time = time_match.group(0) + date = utils.parse_date(element_full_text=self.element.full_text) + if date: return { - 'time': dateparser.parse(time) + 'time': date } except: return None diff --git a/facebook_scraper/utils.py b/facebook_scraper/utils.py index f13f3e3d..d779a020 100644 --- a/facebook_scraper/utils.py +++ b/facebook_scraper/utils.py @@ -1,7 +1,10 @@ import codecs import re +from datetime import datetime +from typing import Optional from urllib.parse import parse_qsl, unquote, urlencode, urljoin, urlparse, urlunparse +import dateparser from html2text import html2text as _html2text from requests_html import DEFAULT_URL, Element, PyQuery @@ -43,3 +46,35 @@ def make_html_element(html: str, url=DEFAULT_URL) -> Element: def html2text(html: str) -> str: return _html2text(html) + + +date = r"Jan(?:uary)?|" \ + r"Feb(?:ruary)?|" \ + r"Mar(?:ch)?|" \ + r"Apr(?:il)?|" \ + r"May|" \ + r"Jun(?:e)?|" \ + r"Jul(?:y)?|" \ + r"Aug(?:ust)?|" \ + r"Sep(?:tember)?|" \ + r"Oct(?:ober)?|" \ + r"Nov(?:ember)?|" \ + r"Dec(?:ember)?|" \ + r"Yesterday|" \ + r"Today" +hour = r"\d{1,2}" +minute = r"\d{2}" +period = r"AM|PM" +exact_time = fr"({date}) at {hour}:{minute} ({period})" +relative_time = r"\d{1,2} \w+" + +time_regex = re.compile(fr"({exact_time}|{relative_time})") + + +def parse_date(element_full_text: str) -> Optional[datetime]: + time_match = time_regex.search(element_full_text) + if time_match: + time = time_match.group(0) + return dateparser.parse(time) + else: + return None From 6b12045fd2c4044a172552dedb9a27e029f0e211 Mon Sep 17 00:00:00 2001 From: TheMulti0 Date: Sat, 7 Nov 2020 21:42:11 +0200 Subject: [PATCH 04/10] Update facebook_scraper/extractors.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kevin Zúñiga --- facebook_scraper/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/facebook_scraper/extractors.py b/facebook_scraper/extractors.py index 2c6df43f..8c633e5a 100644 --- a/facebook_scraper/extractors.py +++ b/facebook_scraper/extractors.py @@ -209,7 +209,7 @@ def extract_time(self) -> PartialPost: return { 'time': date } - except: + except Exception: return None return None From ed936cf73b677a8cb5c60cabe80cc1e5c4633495 Mon Sep 17 00:00:00 2001 From: TheMulti0 Date: Sat, 7 Nov 2020 22:01:57 +0200 Subject: [PATCH 05/10] Added day digit to month regex --- facebook_scraper/utils.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/facebook_scraper/utils.py b/facebook_scraper/utils.py index d779a020..e5067cfb 100644 --- a/facebook_scraper/utils.py +++ b/facebook_scraper/utils.py @@ -48,24 +48,25 @@ def html2text(html: str) -> str: return _html2text(html) -date = r"Jan(?:uary)?|" \ - r"Feb(?:ruary)?|" \ - r"Mar(?:ch)?|" \ - r"Apr(?:il)?|" \ - r"May|" \ - r"Jun(?:e)?|" \ - r"Jul(?:y)?|" \ - r"Aug(?:ust)?|" \ - r"Sep(?:tember)?|" \ - r"Oct(?:ober)?|" \ - r"Nov(?:ember)?|" \ - r"Dec(?:ember)?|" \ - r"Yesterday|" \ - r"Today" +month = r"Jan(?:uary)?|" \ + r"Feb(?:ruary)?|" \ + r"Mar(?:ch)?|" \ + r"Apr(?:il)?|" \ + r"May|" \ + r"Jun(?:e)?|" \ + r"Jul(?:y)?|" \ + r"Aug(?:ust)?|" \ + r"Sep(?:tember)?|" \ + r"Oct(?:ober)?|" \ + r"Nov(?:ember)?|" \ + r"Dec(?:ember)?|" \ + r"Yesterday|" \ + r"Today" +date = f"({month}) " + r"\d{1,2}" hour = r"\d{1,2}" minute = r"\d{2}" period = r"AM|PM" -exact_time = fr"({date}) at {hour}:{minute} ({period})" +exact_time = f"({date}) at {hour}:{minute} ({period})" relative_time = r"\d{1,2} \w+" time_regex = re.compile(fr"({exact_time}|{relative_time})") From c67238ab3a5548c785cb5e87996b8f260d553299 Mon Sep 17 00:00:00 2001 From: TheMulti0 Date: Sat, 7 Nov 2020 22:03:59 +0200 Subject: [PATCH 06/10] Disable date parsing catch (caught in extract_post) --- facebook_scraper/extractors.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/facebook_scraper/extractors.py b/facebook_scraper/extractors.py index 2c6df43f..2e0e5686 100644 --- a/facebook_scraper/extractors.py +++ b/facebook_scraper/extractors.py @@ -203,14 +203,11 @@ def extract_time(self) -> PartialPost: except (KeyError, ValueError): continue - try: - date = utils.parse_date(element_full_text=self.element.full_text) - if date: - return { - 'time': date - } - except: - return None + date = utils.parse_date(element_full_text=self.element.full_text) + if date: + return { + 'time': date + } return None From f866e378fe096bd34cd1a71908a8ae90a4d9e907 Mon Sep 17 00:00:00 2001 From: TheMulti0 Date: Sat, 7 Nov 2020 22:04:25 +0200 Subject: [PATCH 07/10] time -> datetime --- facebook_scraper/extractors.py | 2 +- facebook_scraper/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/facebook_scraper/extractors.py b/facebook_scraper/extractors.py index 2e0e5686..c2a23d93 100644 --- a/facebook_scraper/extractors.py +++ b/facebook_scraper/extractors.py @@ -203,7 +203,7 @@ def extract_time(self) -> PartialPost: except (KeyError, ValueError): continue - date = utils.parse_date(element_full_text=self.element.full_text) + date = utils.parse_datetime(element_full_text=self.element.full_text) if date: return { 'time': date diff --git a/facebook_scraper/utils.py b/facebook_scraper/utils.py index e5067cfb..8bbea3ae 100644 --- a/facebook_scraper/utils.py +++ b/facebook_scraper/utils.py @@ -69,11 +69,11 @@ def html2text(html: str) -> str: exact_time = f"({date}) at {hour}:{minute} ({period})" relative_time = r"\d{1,2} \w+" -time_regex = re.compile(fr"({exact_time}|{relative_time})") +datetime_regex = re.compile(fr"({exact_time}|{relative_time})") -def parse_date(element_full_text: str) -> Optional[datetime]: - time_match = time_regex.search(element_full_text) +def parse_datetime(element_full_text: str) -> Optional[datetime]: + time_match = datetime_regex.search(element_full_text) if time_match: time = time_match.group(0) return dateparser.parse(time) From 2b6e7c13bdc027c513099225aa9f65180ce03be6 Mon Sep 17 00:00:00 2001 From: TheMulti0 Date: Sun, 8 Nov 2020 21:54:58 +0200 Subject: [PATCH 08/10] Update facebook_scraper/utils.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kevin Zúñiga --- facebook_scraper/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/facebook_scraper/utils.py b/facebook_scraper/utils.py index 8bbea3ae..7f5b3480 100644 --- a/facebook_scraper/utils.py +++ b/facebook_scraper/utils.py @@ -67,7 +67,7 @@ def html2text(html: str) -> str: minute = r"\d{2}" period = r"AM|PM" exact_time = f"({date}) at {hour}:{minute} ({period})" -relative_time = r"\d{1,2} \w+" +relative_time = r"\b\d{1,2}(?:h| hrs)" datetime_regex = re.compile(fr"({exact_time}|{relative_time})") From a25206583f289c759b035b3676e05e4193f3a432 Mon Sep 17 00:00:00 2001 From: TheMulti0 Date: Sun, 8 Nov 2020 21:55:51 +0200 Subject: [PATCH 09/10] Month capture is optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kevin Zúñiga --- facebook_scraper/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/facebook_scraper/utils.py b/facebook_scraper/utils.py index 7f5b3480..cfbd04df 100644 --- a/facebook_scraper/utils.py +++ b/facebook_scraper/utils.py @@ -62,7 +62,7 @@ def html2text(html: str) -> str: r"Dec(?:ember)?|" \ r"Yesterday|" \ r"Today" -date = f"({month}) " + r"\d{1,2}" +date = f"(?:{month}) " + r"\d{1,2}" + r"(?:, \d{4})?" hour = r"\d{1,2}" minute = r"\d{2}" period = r"AM|PM" From 35e6eac471e188c557a3fa9c5a1ebd106e39e650 Mon Sep 17 00:00:00 2001 From: TheMulti0 Date: Sun, 8 Nov 2020 21:56:05 +0200 Subject: [PATCH 10/10] Date and period capture are optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kevin Zúñiga --- facebook_scraper/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/facebook_scraper/utils.py b/facebook_scraper/utils.py index cfbd04df..a677e4d8 100644 --- a/facebook_scraper/utils.py +++ b/facebook_scraper/utils.py @@ -66,7 +66,7 @@ def html2text(html: str) -> str: hour = r"\d{1,2}" minute = r"\d{2}" period = r"AM|PM" -exact_time = f"({date}) at {hour}:{minute} ({period})" +exact_time = f"(?:{date}) at {hour}:{minute} (?:{period})" relative_time = r"\b\d{1,2}(?:h| hrs)" datetime_regex = re.compile(fr"({exact_time}|{relative_time})")