Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new source for Viber Desktop local sqlite db #204

Merged
merged 21 commits into from
Feb 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
ea19236
FEAT: +Viber source from local sqlite db
ankostis Feb 22, 2021
44b3bfe
refact(viber) mv viber source to .._desktop
ankostis Feb 22, 2021
661f0c6
enh(viber) resolve homedir on db-path
ankostis Feb 22, 2021
837e687
doc(viber) sample-cfg & src-section
ankostis Feb 22, 2021
498e5e2
upd(viber) rename back to plain `viber` ...
ankostis Feb 23, 2021
4c2c5d6
doc(viber): minor sql & python comments
ankostis Feb 24, 2021
854dffd
style(viber) black-format
ankostis Feb 24, 2021
f4553ed
fix(viber) urls in sqlite are not quoted
ankostis Feb 24, 2021
a8c6dd0
fix(viber) don'unwrap not needed from sqlite, ...
ankostis Feb 24, 2021
14c6239
fix(viber) locator-uri includes resolved sqlite-path
ankostis Feb 24, 2021
5fa10aa
refact(viber) msgs-query in own func,
ankostis Feb 24, 2021
c45d4d7
fix(viber) open db in context-manager (old TODO)
ankostis Feb 24, 2021
df948c3
ENH(viber) hard-link db-->.sqlite for locator-urls
ankostis Feb 24, 2021
d5681df
doc: sample cfg use module, not module.index
ankostis Feb 24, 2021
0882d57
refact(viber) uneeded closure
ankostis Feb 24, 2021
03ba716
refact(viber) un-closureify handle_row()
ankostis Feb 24, 2021
7f46da7
refact(viber) privatize open_db(); doc
ankostis Feb 24, 2021
aea22ff
enh(viber) log db hard-link creation
ankostis Feb 24, 2021
ab9004b
refact(viber) org imports, drop dead code
ankostis Feb 24, 2021
2d5c3c4
ENH(viber) support multiple GLOBbed DB-paths,
ankostis Feb 24, 2021
264f543
drop(viber) hard-link hack from .vb --> .sqlite,
ankostis Feb 26, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/SOURCES.org
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,11 @@ for importer, name, ispkg in sorted(pkgutil.walk_packages(

Uses [[https://github.com/fabianonline/telegram_backup#readme][telegram_backup]] database for messages data

- [[file:../src/promnesia/sources/viber.py][promnesia.sources.viber]]

Uses all local SQLite files found in your Viber Desktop configurations,
by default those matching =~/.ViberPC/*/viber.db= (one directory for each telephone number).

- [[file:../src/promnesia/sources/twitter.py][promnesia.sources.twitter]]

Uses [[https://github.com/karlicoss/HPI][HPI]] for Twitter data.
Expand Down
21 changes: 19 additions & 2 deletions doc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,17 @@

# this is an incomplete list, just the (perhaps) most interesting ones
from promnesia.sources import telegram
from promnesia.sources import takeout, instapaper, pocket, fbmessenger, twitter, roamresearch, hypothesis, rss
from promnesia.sources import (
fbmessenger,
hypothesis,
instapaper,
pocket,
roamresearch,
rss,
takeout,
twitter,
viber,
)


# NOTE: at the moment try to avoid using complex sources names
Expand Down Expand Up @@ -65,10 +75,17 @@
# Uses the output of telegram_backup tool: https://github.com/fabianonline/telegram_backup#usage
# name will be set to 'telegram' by default
Source(
telegram.index,
telegram,
'/data/telegram/database.sqlite',
),

# Uses all local SQLite files found in your Viber Desktop configurations
# (one directory for each telephone number); by default those matching:
# ~/.ViberPC/*/viber.db
#
# To index other files, pass a path (glob patterns allowed) as the 2nd ``Source()`` argument.
viber,

# NOTE: to configure the following modules you need to set up HPI package (https://github.com/karlicoss/HPI#whats-inside)
# see HPI setup guide : https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org
# and HPI usage examples': https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org#usage-examples
Expand Down
179 changes: 179 additions & 0 deletions src/promnesia/sources/viber.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
"""
Adapted from `telegram.py` to read from `~/.ViberPC/XYZ123/viber.db`
"""

import json
import logging
import textwrap
from os import PathLike
from pathlib import Path
from typing import Iterable, Optional

from ..common import Loc, PathIsh, Results, Visit, extract_urls, from_epoch

logger = logging.getLogger(__name__)


# TODO move to common?
def _dataset_readonly(db: Path):
    """
    Open the SQLite file `db` through `dataset` on an immutable connection.

    The underlying connection is created by `sqlite3` from a `file:` URI
    carrying `immutable=1`, and handed to `dataset` through the `creator`
    engine option;
    see https://github.com/pudo/dataset/issues/136#issuecomment-128693122

    :param db: filesystem path of the SQLite database to open
    :return: the object returned by `dataset.connect()`
    """
    import sqlite3
    from urllib.parse import quote

    import dataset  # type: ignore

    # Percent-encode the path, so that characters with a meaning in URIs
    # (`?`, `#`, `%`) are not taken for query/fragment separators or escapes.
    db_uri = f"file:{quote(str(db))}?immutable=1"

    def creator():
        return sqlite3.connect(db_uri, uri=True)

    return dataset.connect("sqlite:///", engine_kwargs={"creator": creator})


def messages_query() -> str:
    """
    Build the SQL text that selects one row per message containing 'http'.

    The result columns are: mid, time, chatname, sender, text, infojson, tags.

    Deliberately kept public, so that the query can be experimented with.
    """
    query = """
        /*
        Establish group-names by concatenating:
        - groups explicitely named,
        - multi-groups having a group-leader (PGRole=2), with
        - all the rest groups that (must) have just 2 members,
        me(ContactId=1) & "other+" contacts, so use "other"" as group-name.
        */
        WITH G0 AS (
            SELECT
                CR.ChatId,
                CR.ContactID,
                coalesce(CI.Name, C.Name, C.ClientName) as chatname,
                CR.PGRole,
                CI.PGTags
            FROM ChatRelation as CR
            JOIN Contact AS C ON CR.ContactID = C.ContactID
            JOIN ChatInfo as CI ON CR.ChatId = CI.ChatId
        ), G1 AS (
            SELECT * FROM G0 WHERE PGRole = 2
        ), G2 AS (
            SELECT * FROM G0 WHERE
                ContactID <> 1 AND
                ChatId NOT IN (SELECT ChatId FROM G1)
        ), Groups AS (
            SELECT ChatId, chatname, PGTags FROM G1
            UNION
            SELECT ChatId, chatname, PGTags FROM G2
        )
        SELECT
            M.EventId AS mid,
            E.TimeStamp AS time,
            G.chatname AS chatname,
            coalesce(
                S.Name,
                S.ClientName,
                '(' || S.Number || ')' /* contacts have one xor the other, but failsafe */
            ) AS sender,
            coalesce(M.Subject, M.Body) /* didn't see any msg with both */
                AS text,
            M.info AS infojson, /* to harvested titles from embedded urls */
            G.PGTags AS tags
        FROM messages AS M
        LEFT JOIN Events AS E
            ON M.EventId = E.EventId
        LEFT JOIN Contact AS S
            ON E.ContactId = S.ContactId
        LEFT JOIN Groups AS G
            ON E.ChatId = G.ChatId
        WHERE
            text LIKE '%http%'
        ORDER BY time;
        """
    return textwrap.dedent(query)


def _parse_json_title(js) -> str:
if js and js.strip():
js = json.loads(js)
if isinstance(js, dict):
return js.get("Title")


def _handle_row(row: dict, db_path: PathLike) -> Results:
    """
    Yield one `Visit` for each URL extracted from the text of a message row.

    :param row: a row of the messages query, read by its keys `text`, `time`,
        `mid`, `sender`, `chatname`, `tags` and `infojson`
    :param db_path: path of the database the row came from;
        it is embedded in the locator link of each visit
    :return: a generator of visits; it yields nothing when no URL
        is extracted from the text
    :raises AssertionError: if any of `text`, `mid`, `sender`, `chatname`
        is empty
    """
    text = row["text"]
    urls = extract_urls(text)
    if not urls:
        return

    # The stored timestamp is divided by 1000 before conversion
    # (presumably it is kept in milliseconds -- TODO confirm against the db).
    dt = from_epoch(row["time"] // 1000)
    mid: str = row["mid"]
    # TODO perhaps we could be defensive with null sender/chat etc and still emit the Visit
    sender: str = row["sender"]
    chatname: str = row["chatname"]
    tags: str = row["tags"]
    infojson: str = row["infojson"]

    assert (
        text and mid and sender and chatname
    ), f"sql-query should eliminate messages without 'http' or missing ids: {row}"

    # Append the whitespace-separated tags as hashtags below the message text.
    if tags and tags.strip():
        tags = "".join(f"#{t}" for t in tags.split())
        text = f"{text}\n\n{tags}"

    # Prepend the title found in the info-json (if any) above the message text.
    url_title = _parse_json_title(infojson)
    if url_title:
        text = f"title: {url_title}\n\n{text}"

    for u in urls:
        yield Visit(
            url=u,  # URLs in Viber's SQLite are not quoted
            dt=dt,
            context=text,
            locator=Loc.make(
                title=f"chat({mid}) from {sender}@{chatname}",
                href=f"file://{db_path}#!Messages.EventId={mid}",
            ),
        )


def _get_files(path: PathIsh) -> Iterable[Path]:
    """
    Expand homedir(`~`) and return glob paths matched.

    `Path.glob()` accepts relative patterns only, so the expanded path is
    split into its anchor (drive and/or root, empty for relative paths),
    which becomes the directory to glob from, and the remaining parts,
    which become the pattern.

    Expansion code adapted from https://stackoverflow.com/a/51108375/548792

    :param path: a path, possibly starting with `~` and containing glob wildcards
    :return: an iterable of the paths matched
    """
    path = Path(path).expanduser()
    # Whenever an anchor exists it is `parts[0]`; using the whole anchor
    # (not just the root) preserves a Windows drive letter.
    anchor = path.anchor
    parts = path.parts[1:] if anchor else path.parts
    return Path(anchor).glob(str(Path("").joinpath(*parts)))


def _harvest_db(db_path: PathIsh, msgs_query: str):
    """
    Run `msgs_query` against a single database and yield the visits of its rows.

    A row that fails to be handled is logged as a warning and skipped,
    so it does not abort harvesting the remaining rows.

    :param db_path: location of the SQLite file (string or `Path`)
    :param msgs_query: the SQL text to execute, see `messages_query()`
    :return: a generator of whatever `_handle_row()` yields for each row
    """
    is_debug = logger.isEnabledFor(logging.DEBUG)

    # Note: for displaying maybe better not to expand/absolute,
    # but it's safer for debugging resolved.
    # `Path()` wrapping is needed because the argument may be a plain string.
    db_path = Path(db_path).resolve()

    with _dataset_readonly(db_path) as db:
        for row in db.query(msgs_query):
            try:
                yield from _handle_row(row, db_path)
            except Exception as ex:
                # TODO: also insert errors in db
                logger.warning(
                    "Cannot extract row: %s, due to: %s(%s)",
                    row,
                    type(ex).__name__,
                    ex,
                    exc_info=is_debug,  # tracebacks only when debugging
                )


def index(db_path: PathIsh = "~/.ViberPC/*/viber.db") -> Results:
    """
    Yield the visits harvested from every database file matching `db_path`.

    :param db_path: path of the Viber Desktop SQLite file(s);
        a leading `~` is expanded and glob wildcards are supported
    :return: a generator of the visits of all the files matched, one file after another
    :raises AssertionError: if nothing matches `db_path`,
        or if a match is not a regular file
    """
    glob_paths = list(_get_files(db_path))
    logger.debug("Expanded path(s): %s", glob_paths)
    assert glob_paths, f"No Viber-desktop sqlite found: {db_path}"

    msgs_query = messages_query()

    # Walk the matches already collected, instead of globbing a second time,
    # and without rebinding the `db_path` argument.
    for db_file in glob_paths:
        assert db_file.is_file(), f"Is it a (Viber-desktop sqlite) file? {db_file}"
        yield from _harvest_db(db_file, msgs_query)