Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added fallback time extraction engine #135

Merged
merged 11 commits into from
Nov 8, 2020
6 changes: 6 additions & 0 deletions facebook_scraper/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,12 @@ def extract_time(self) -> PartialPost:
except (KeyError, ValueError):
continue

date = utils.parse_datetime(element_full_text=self.element.full_text)
kevinzg marked this conversation as resolved.
Show resolved Hide resolved
if date:
return {
'time': date
}

return None

def extract_user_id(self) -> PartialPost:
Expand Down
36 changes: 36 additions & 0 deletions facebook_scraper/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import codecs
import re
from datetime import datetime
from typing import Optional
from urllib.parse import parse_qsl, unquote, urlencode, urljoin, urlparse, urlunparse

import dateparser
from html2text import html2text as _html2text
from requests_html import DEFAULT_URL, Element, PyQuery

Expand Down Expand Up @@ -43,3 +46,36 @@ def make_html_element(html: str, url=DEFAULT_URL) -> Element:

def html2text(html: str) -> str:
    """Convert *html* to text by delegating to the imported ``_html2text``."""
    text = _html2text(html)
    return text


# Building blocks for the timestamp pattern searched for in a post's text.

# English month names, either abbreviated or written out in full.
month = (
    r"Jan(?:uary)?|"
    r"Feb(?:ruary)?|"
    r"Mar(?:ch)?|"
    r"Apr(?:il)?|"
    r"May|"
    r"Jun(?:e)?|"
    r"Jul(?:y)?|"
    r"Aug(?:ust)?|"
    r"Sep(?:tember)?|"
    r"Oct(?:ober)?|"
    r"Nov(?:ember)?|"
    r"Dec(?:ember)?"
)
# A date is either "<month> <day>" with an optional ", <year>", or one of the
# relative day words. "Yesterday" and "Today" are standalone alternatives: they
# are not followed by a day number, so listing them as months would make
# strings such as "Yesterday at 1:00 PM" impossible to match.
date = f"(?:{month}) " + r"\d{1,2}" + r"(?:, \d{4})?" + r"|Yesterday|Today"
hour = r"\d{1,2}"
minute = r"\d{2}"
period = r"AM|PM"
# e.g. "November 8, 2020 at 9:15 PM", "Oct 3 at 7:05 AM", "Yesterday at 1:00 PM".
exact_time = f"(?:{date}) at {hour}:{minute} (?:{period})"
# e.g. "3h", "1 hr" or "12 hrs" (the trailing "s" is optional so that the
# singular form is recognised too).
relative_time = r"\b\d{1,2}(?:h| hrs?)"

# The whole timestamp is the match itself (group 0); group 1 is identical.
datetime_regex = re.compile(fr"({exact_time}|{relative_time})")


def parse_datetime(element_full_text: str) -> Optional[datetime]:
    """Find the first timestamp in *element_full_text* and parse it.

    The text is searched with ``datetime_regex``. When nothing matches,
    ``None`` is returned; otherwise the matched substring is handed to
    ``dateparser.parse`` and its result is returned unchanged.
    """
    found = datetime_regex.search(element_full_text)
    if found is None:
        return None
    return dateparser.parse(found.group(0))
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,4 @@ yarl==1.4.2; python_version >= "3.6" \
zipp==3.1.0; python_version < "3.8" \
--hash=sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b \
--hash=sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96
dateparser~=1.0.0
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,7 @@ websockets==8.0.2 \
--hash=sha256:f5cb2683367e32da6a256b60929a3af9c29c212b5091cf5bace9358d03011bf5 \
--hash=sha256:049e694abe33f8a1d99969fee7bfc0ae6761f7fd5f297c58ea933b27dd6805f2 \
--hash=sha256:882a7266fa867a2ebb2c0baaa0f9159cabf131cf18c1b4270d79ad42f9208dc5

html2text~=2020.1.16
requests~=2.24.0
dateparser~=1.0.0
kevinzg marked this conversation as resolved.
Show resolved Hide resolved