Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
my.google.maps: initial module for extracting place data from Andro…
…id app
- Loading branch information
Showing
4 changed files
with
320 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
from my.core import __NOT_HPI_MODULE__ | ||
|
||
# NOTE: this tool was quite useful https://github.com/aj3423/aproto | ||
|
||
from google.protobuf import descriptor_pool, descriptor_pb2, message_factory | ||
|
||
# Short aliases for the protobuf field types used by the descriptors in this module.
TYPE_STRING = descriptor_pb2.FieldDescriptorProto.TYPE_STRING
TYPE_BYTES = descriptor_pb2.FieldDescriptorProto.TYPE_BYTES
TYPE_UINT64 = descriptor_pb2.FieldDescriptorProto.TYPE_UINT64
TYPE_MESSAGE = descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE

# Short aliases for the protobuf field labels used by the descriptors in this module.
OPTIONAL = descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL
REQUIRED = descriptor_pb2.FieldDescriptorProto.LABEL_REQUIRED
|
||
|
||
def get_place_protos():
    """
    Build the message descriptors used to decode the blob of a saved place.

    Returns a list: the top-level 'PlaceParser' message first, followed by the
    helper messages it refers to ('Timestamp', 'xf1', 'xf2', 'xf3').
    """
    new_message = descriptor_pb2.DescriptorProto

    timestamp = new_message(name='Timestamp')
    for field_name, field_number in (('seconds', 1), ('nanos', 2)):
        timestamp.field.add(name=field_name, number=field_number, type=TYPE_UINT64, label=REQUIRED)

    details = new_message(name='xf1')
    # TODO 2 -> 5 is address? 2 -> 6 is a pair of coordinates
    details.field.add(name='title', number=3, type=TYPE_STRING, label=REQUIRED)
    details.field.add(name='note', number=4, type=TYPE_STRING, label=OPTIONAL)
    # TODO what's the difference between required and optional? doesn't impact decoding?
    for field_name, field_number in (('created', 10), ('updated', 11)):
        details.field.add(
            name=field_name,
            number=field_number,
            type=TYPE_MESSAGE,
            label=REQUIRED,
            type_name=timestamp.name,
        )

    place_info = new_message(name='xf2')
    # NOTE: this also contains place ID
    place_info_fields = (
        ('addr1', 2, TYPE_STRING),
        ('addr2', 3, TYPE_STRING),
        ('f21', 4, TYPE_BYTES),
        ('f22', 5, TYPE_UINT64),
        ('f23', 6, TYPE_STRING),
    )
    for field_name, field_number, field_type in place_info_fields:
        place_info.field.add(name=field_name, number=field_number, type=field_type, label=REQUIRED)

    extra = new_message(name='xf3')
    # NOTE: looks like it's the same as 'updated' from above??
    extra.field.add(name='f31', number=1, type=TYPE_UINT64, label=OPTIONAL)

    root = new_message(name='PlaceParser')
    root.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=details.name)
    root.field.add(name='f2', number=2, type=TYPE_MESSAGE, label=REQUIRED, type_name=place_info.name)
    root.field.add(name='f3', number=3, type=TYPE_MESSAGE, label=OPTIONAL, type_name=extra.name)
    # NOTE: f4 is the list id
    root.field.add(name='f4', number=4, type=TYPE_STRING, label=OPTIONAL)

    return [root, timestamp, details, place_info, extra]
|
||
|
||
def get_labeled_protos():
    """
    Build the message descriptors used to decode the blob of a 'Labeled' item.

    Returns a list: the top-level 'LabeledParser' message first, followed by
    the 'address' message it refers to.
    """
    address_message = descriptor_pb2.DescriptorProto(name='address')
    # Fields seen inside the address message:
    # 1: address
    # 2: parts of address (multiple)
    # 3: full address
    # Only the full address is declared here.
    address_message.field.add(name='full', number=3, type=TYPE_STRING, label=REQUIRED)

    root = descriptor_pb2.DescriptorProto(name='LabeledParser')
    # field 1 contains item type and item id
    root.field.add(name='title', number=3, type=TYPE_STRING, label=REQUIRED)
    root.field.add(
        name='address',
        number=5,
        type=TYPE_MESSAGE,
        label=OPTIONAL,
        type_name=address_message.name,
    )

    return [root, address_message]
|
||
|
||
def get_list_protos():
    """
    Build the message descriptors used to decode the blob of a list.

    Returns a list: the top-level 'ListParser' message first, followed by the
    'xf1' message that carries the list name.
    """
    header = descriptor_pb2.DescriptorProto(name='xf1')
    header.field.add(name='name', number=5, type=TYPE_STRING, label=REQUIRED)

    root = descriptor_pb2.DescriptorProto(name='ListParser')
    root.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=header.name)
    root.field.add(name='f2', number=2, type=TYPE_STRING, label=REQUIRED)

    return [root, header]
|
||
|
||
def make_parser(main, *extras):
    """
    Create a message class for ``main`` from dynamically built descriptors.

    Args:
        main: DescriptorProto of the top-level message; the returned class
            parses this message.
        *extras: DescriptorProtos of the other messages ``main`` refers to.

    Returns:
        A message class; instantiate it and call ``ParseFromString`` to decode a blob.
    """
    file_descriptor_proto = descriptor_pb2.FileDescriptorProto(name='dynamic.proto', package='dynamic_package')
    for proto in (main, *extras):
        file_descriptor_proto.message_type.add().CopyFrom(proto)

    # a private pool, so repeated calls (which reuse message names like 'xf1') don't clash
    pool = descriptor_pool.DescriptorPool()
    pool.Add(file_descriptor_proto)

    message_descriptor = pool.FindMessageTypeByName(f'{file_descriptor_proto.package}.{main.name}')

    # MessageFactory.GetPrototype is deprecated and absent from newer protobuf releases;
    # prefer the module-level GetMessageClass when this protobuf version provides it.
    get_message_class = getattr(message_factory, 'GetMessageClass', None)
    if get_message_class is not None:
        return get_message_class(message_descriptor)

    # fallback for older protobuf versions
    factory = message_factory.MessageFactory(pool)
    return factory.GetPrototype(message_descriptor)
|
||
|
||
# Message classes are built once, when the module is imported, and reused by the parse_* functions.
place_parser_class = make_parser(*get_place_protos())
labeled_parser_class = make_parser(*get_labeled_protos())
list_parser_class = make_parser(*get_list_protos())
|
||
|
||
def parse_place(blob: bytes):
    """Decode ``blob`` with the place parser class and return the resulting message."""
    parsed = place_parser_class()
    parsed.ParseFromString(blob)
    return parsed
|
||
|
||
def parse_labeled(blob: bytes):
    """Decode ``blob`` with the labeled-item parser class and return the resulting message."""
    parsed = labeled_parser_class()
    parsed.ParseFromString(blob)
    return parsed
|
||
|
||
def parse_list(blob: bytes):
    """Decode ``blob`` with the list parser class and return the resulting message."""
    parsed = list_parser_class()
    parsed.ParseFromString(blob)
    return parsed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
""" | ||
Extracts data from the official Google Maps app for Android (uses gmm_sync.db for now) | ||
""" | ||
from __future__ import annotations | ||
|
||
REQUIRES = [ | ||
"protobuf", # for parsing blobs from the database | ||
] | ||
|
||
from dataclasses import dataclass | ||
from datetime import datetime, timezone | ||
from pathlib import Path | ||
from typing import Any, Iterator, Optional, Sequence | ||
from urllib.parse import quote | ||
|
||
from my.core import datetime_aware, get_files, LazyLogger, Paths, Res | ||
from my.core.common import unique_everseen | ||
from my.core.sqlite import sqlite_connection | ||
|
||
import my.config | ||
|
||
from ._android_protobuf import parse_labeled, parse_list, parse_place | ||
|
||
|
||
logger = LazyLogger(__name__) | ||
|
||
|
||
# Module configuration; the values come from the user's my.config.google.maps.android.
@dataclass
class config(my.config.google.maps.android):
    # path(s)/glob to the exported sqlite databases
    export_path: Paths
|
||
|
||
def inputs() -> Sequence[Path]:
    """Return the database files that ``config.export_path`` resolves to."""
    # TODO not sure if need to use all dbs? possibly the last one contains everything?
    export_path = config.export_path
    return get_files(export_path)
|
||
|
||
# Plain string aliases, used to make the annotations below more readable.
PlaceId = str
ListId = str
ListName = str
|
||
|
||
@dataclass(frozen=True, eq=True)
class Location:
    """An immutable latitude/longitude pair."""

    lat: float
    lon: float

    @property
    def url(self) -> str:
        """Google Maps query link for this pair of coordinates."""
        link = f'https://maps.google.com/?q={self.lat},{self.lon}'
        return link
|
||
|
||
@dataclass(unsafe_hash=True)
class Place:
    """A single saved place, together with the name of the list it was found in."""

    id: PlaceId
    # TODO maybe best to keep list id?
    list_name: ListName
    # TODO double check it's utc?
    created_at: datetime_aware
    # TODO double check it's utc?
    updated_at: datetime_aware
    title: str
    location: Location
    address: Optional[str]
    note: Optional[str]

    @property
    def location_url(self) -> str:
        """Link to the coordinates of the place (delegates to ``location.url``)."""
        return self.location.url

    @property
    def place_url(self) -> str:
        """Link to the place itself, built from the URL-quoted title and the place id."""
        title = quote(self.title, safe='/')
        return f'https://www.google.com/maps/place/{title}/data=!4m2!3m1!1s{self.id}'
|
||
|
||
def _process_one(f: Path):
    """
    Yield the places stored in a single database file.

    Reads the sync_item_data table in three passes:
    - corpus 13: collects list names keyed by server id (nothing is yielded)
    - corpus 11: yields the entries of the 'Labeled' list
    - corpus 14: yields individual places, resolving their list name via the first pass
    """

    def blank_to_none(text):
        # seems that protobuf gives '' for string fields that weren't set?
        return None if text == '' else text

    def coordinates(row) -> Location:
        # the columns hold the coordinates multiplied by 1e6
        return Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)

    def as_datetime(stamp):
        # TODO double check timezone
        whole_seconds = datetime.fromtimestamp(stamp.seconds, tz=timezone.utc)
        return whole_seconds.replace(microsecond=stamp.nanos // 1000)

    with sqlite_connection(f, row_factory='row') as conn:
        # 13 looks like lists (e.g. saved/favorited etc)
        list_names: dict[ListId, ListName] = {}
        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 13'):
            list_server_id = row['server_id']
            if list_server_id is None:
                # this is the case for Travel plans, Followed places, Offers
                # todo alternatively could use string_index column instead maybe?
                continue
            list_names[list_server_id] = parse_list(row['item_proto']).f1.name

        # this looks like 'Labeled' list
        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 11'):
            saved_at = datetime.fromtimestamp(row['timestamp'] / 1000, tz=timezone.utc)

            item_type, item_id = row['server_id'].split(':')
            if item_type != '3':
                # the ones that are not 3 are home/work address?
                continue

            labeled = parse_labeled(row['item_proto'])
            labeled_address = blank_to_none(labeled.address.full)
            labeled_location = coordinates(row)

            yield Place(
                id=item_id,
                list_name='Labeled',
                created_at=saved_at,
                updated_at=saved_at,  # doesn't look like it has 'updated'?
                title=labeled.title,
                location=labeled_location,
                address=labeled_address,
                note=None,  # don't think these allow notes
            )

        # this looks like actual individual places
        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 14'):
            list_id, _, id1, id2 = row['server_id'].split(':')
            item_id = f'{id1}:{id2}'

            list_name = list_names[list_id]

            parsed = parse_place(row['item_proto'])
            details = parsed.f1
            title = details.title
            note = blank_to_none(details.note)

            created = as_datetime(details.created)
            # NOTE: this one seems to be the same as row['timestamp']
            updated = as_datetime(details.updated)

            # NOTE: there is also addr2, but they seem identical :shrug:
            address = blank_to_none(parsed.f2.addr1)

            location = coordinates(row)

            # ugh. in my case created_at <= updated_at is violated by one place by about 1 second??
            # so it isn't asserted here
            yield Place(
                id=item_id,
                list_name=list_name,
                created_at=created,
                updated_at=updated,
                title=title,
                location=location,
                address=address,
                note=note,
            )
|
||
|
||
def saved() -> Iterator[Res[Place]]:
    """Iterate over the places from all input databases, passed through ``unique_everseen``."""

    def it() -> Iterator[Res[Place]]:
        databases = inputs()
        total = len(databases)
        # pad the counters so the log lines align
        width = len(str(total))
        for idx, path in enumerate(databases):
            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
            yield from _process_one(path)

    # NOTE: the function itself (not its result) is handed over, same as the original code did
    deduplicated = unique_everseen(it)
    return deduplicated
|
||
|
||
# Summary of databases on Android (as of 20240101) | ||
# -1_optimized_threads.notifications.db -- empty | ||
# 1_optimized_threads.notifications.db -- empty | ||
# 1_tasks.notifications.db -- empty | ||
# -1_threads.notifications.db -- empty | ||
# 1_threads.notifications.db -- doesn't look like anything interesting, some trip anniversaries etc? | ||
# 1_thread_surveys.notifications.db -- empty | ||
# 2_threads.notifications.db -- empty | ||
# accounts.notifications.db -- just one row with account id | ||
# brella_example_store -- empty | ||
# gmm_myplaces.db -- contains just a few places? I think it's a subset of "Labeled" | ||
# gmm_storage.db -- pretty huge, like 50Mb. I suspect it contains cache for places on maps or something | ||
# gmm_sync.db -- processed above | ||
# gnp_fcm_database -- list of accounts | ||
# google_app_measurement_local.db -- empty | ||
# inbox_notifications.db -- nothing interesting | ||
# <email>_room_notifications.db -- trip anniversaries? | ||
# lighter_messaging_1.db -- empty | ||
# lighter_messaging_2.db -- empty | ||
# lighter_registration.db -- empty | ||
# peopleCache_<email>_com.google_14.db -- contacts cache or something | ||
# portable_geller_<email>.db -- looks like analytics | ||
# primes_example_store -- looks like analytics | ||
# pseudonymous_room_notifications.db -- looks like analytics | ||
# ue3.db -- empty | ||
# ugc_photos_location_data.db -- empty | ||
# ugc-sync.db -- empty | ||
# updates-tab-visit.db -- empty |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters