my.google.maps: initial module for extracting place data from Andro…

…id app
karlicoss · Jan 1, 2024 · 87a8a77 · 87a8a77
1 parent 93e4757
commit 87a8a77
Show file tree

Hide file tree

Showing 4 changed files with 320 additions and 0 deletions.
diff --git a/my/config.py b/my/config.py
@@ -68,6 +68,10 @@ class pinboard:
     export_dir: Paths = ''
 
 class google:
+    class maps:
+        class android:
+            export_path: Paths = ''
+
     takeout_path: Paths = ''
 
 

diff --git a/my/google/maps/_android_protobuf.py b/my/google/maps/_android_protobuf.py
@@ -0,0 +1,113 @@
+from my.core import __NOT_HPI_MODULE__
+
+# NOTE: this tool was quite useful https://github.com/aj3423/aproto
+
+from google.protobuf import descriptor_pool, descriptor_pb2, message_factory
+
+# shorthands for the protobuf field types used by the message definitions in this module
+TYPE_STRING  = descriptor_pb2.FieldDescriptorProto.TYPE_STRING
+TYPE_BYTES   = descriptor_pb2.FieldDescriptorProto.TYPE_BYTES
+TYPE_UINT64  = descriptor_pb2.FieldDescriptorProto.TYPE_UINT64
+TYPE_MESSAGE = descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE
+
+# shorthands for the protobuf field labels
+OPTIONAL = descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL
+REQUIRED = descriptor_pb2.FieldDescriptorProto.LABEL_REQUIRED
+
+
+def get_place_protos():
+    """
+    Describe the protobuf layout of an individual place blob.
+
+    Returns a list of message descriptors: the top-level 'PlaceParser' message first,
+    followed by the messages it refers to.
+    """
+    new_message = descriptor_pb2.DescriptorProto
+
+    timestamp = new_message(name='Timestamp')
+    timestamp.field.add(name='seconds', number=1, type=TYPE_UINT64, label=REQUIRED)
+    timestamp.field.add(name='nanos'  , number=2, type=TYPE_UINT64, label=REQUIRED)
+
+    details = new_message(name='xf1')
+    # TODO 2 -> 5 is address? 2 -> 6 is a pair of coordinates
+    details.field.add(name='title', number=3, type=TYPE_STRING, label=REQUIRED)
+    details.field.add(name='note' , number=4, type=TYPE_STRING, label=OPTIONAL)
+    # TODO what's the difference between required and optional? doesn't impact decoding?
+    for field_name, field_number in (('created', 10), ('updated', 11)):
+        details.field.add(name=field_name, number=field_number, type=TYPE_MESSAGE, label=REQUIRED, type_name=timestamp.name)
+
+    address = new_message(name='xf2')
+    for field_name, field_number, field_type in (
+        ('addr1', 2, TYPE_STRING),
+        ('addr2', 3, TYPE_STRING),
+        ('f21'  , 4, TYPE_BYTES ),
+        ('f22'  , 5, TYPE_UINT64),
+        ('f23'  , 6, TYPE_STRING),
+    ):
+        address.field.add(name=field_name, number=field_number, type=field_type, label=REQUIRED)
+    # NOTE: this also contains place ID
+
+    extra = new_message(name='xf3')
+    # NOTE: looks like it's the same as 'updated' from above??
+    extra.field.add(name='f31', number=1, type=TYPE_UINT64, label=OPTIONAL)
+
+    place = new_message(name='PlaceParser')
+    place.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=details.name)
+    place.field.add(name='f2', number=2, type=TYPE_MESSAGE, label=REQUIRED, type_name=address.name)
+    place.field.add(name='f3', number=3, type=TYPE_MESSAGE, label=OPTIONAL, type_name=extra.name)
+    place.field.add(name='f4', number=4, type=TYPE_STRING , label=OPTIONAL)
+    # NOTE: f4 is the list id
+
+    return [place, timestamp, details, address, extra]
+
+
+def get_labeled_protos():
+    """
+    Describe the protobuf layout of a 'Labeled' item blob.
+
+    Returns the top-level 'LabeledParser' message descriptor first,
+    followed by the 'address' message it refers to.
+    """
+    new_message = descriptor_pb2.DescriptorProto
+
+    top = new_message(name='LabeledParser')
+    nested = new_message(name='address')
+
+    # fields seen in the nested address message:
+    # 1: address
+    # 2: parts of address (multiple)
+    # 3: full address
+    nested.field.add(name='full', number=3, type=TYPE_STRING, label=REQUIRED)
+
+    # field 1 contains item type and item id
+    top.field.add(name='title'  , number=3, type=TYPE_STRING , label=REQUIRED)
+    top.field.add(name='address', number=5, type=TYPE_MESSAGE, label=OPTIONAL, type_name=nested.name)
+
+    return [top, nested]
+
+
+def get_list_protos():
+    """
+    Describe the protobuf layout of a list blob.
+
+    Returns the top-level 'ListParser' message descriptor first,
+    followed by the 'xf1' message that carries the list name.
+    """
+    new_message = descriptor_pb2.DescriptorProto
+
+    top = new_message(name='ListParser')
+    nested = new_message(name='xf1')
+
+    nested.field.add(name='name', number=5, type=TYPE_STRING, label=REQUIRED)
+
+    top.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=nested.name)
+    top.field.add(name='f2', number=2, type=TYPE_STRING , label=REQUIRED)
+
+    return [top, nested]
+
+
+def make_parser(main, *extras):
+    """
+    Build a protobuf message class at runtime from hand-written message descriptors.
+
+    main   : descriptor of the top-level message; the returned class parses this message
+    extras : descriptors of the other messages that main (or each other) refer to
+
+    Returns a message class; instantiate it and call ParseFromString on the instance.
+    """
+    file_descriptor_proto = descriptor_pb2.FileDescriptorProto(name='dynamic.proto', package='dynamic_package')
+    for proto in (main, *extras):
+        file_descriptor_proto.message_type.add().CopyFrom(proto)
+
+    # a fresh pool per call: every parser reuses the same file/package name (and some message names)
+    pool = descriptor_pool.DescriptorPool()
+    pool.Add(file_descriptor_proto)
+
+    message_descriptor = pool.FindMessageTypeByName(f'{file_descriptor_proto.package}.{main.name}')
+
+    # MessageFactory.GetPrototype is deprecated and missing from recent protobuf releases,
+    # so prefer the module-level replacement when the installed version provides it
+    get_message_class = getattr(message_factory, 'GetMessageClass', None)
+    if get_message_class is not None:
+        return get_message_class(message_descriptor)
+
+    # fallback for older protobuf versions
+    factory = message_factory.MessageFactory(pool)
+    return factory.GetPrototype(message_descriptor)
+
+
+# the message classes are built once, when the module is imported, and reused by the parse_* functions
+place_parser_class   = make_parser(*get_place_protos())
+labeled_parser_class = make_parser(*get_labeled_protos())
+list_parser_class    = make_parser(*get_list_protos())
+
+
+def parse_place(blob: bytes):
+    """Decode a serialized place blob into a 'PlaceParser' message."""
+    message = place_parser_class()
+    message.ParseFromString(blob)
+    return message
+
+
+def parse_labeled(blob: bytes):
+    """Decode a serialized 'Labeled' item blob into a 'LabeledParser' message."""
+    message = labeled_parser_class()
+    message.ParseFromString(blob)
+    return message
+
+
+def parse_list(blob: bytes):
+    """Decode a serialized list blob into a 'ListParser' message."""
+    message = list_parser_class()
+    message.ParseFromString(blob)
+    return message
diff --git a/my/google/maps/android.py b/my/google/maps/android.py
@@ -0,0 +1,202 @@
+"""
+Extracts data from the official Google Maps app for Android (uses gmm_sync.db for now)
+"""
+from __future__ import annotations
+
+REQUIRES = [
+    "protobuf",  # for parsing blobs from the database
+]
+
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Iterator, Optional, Sequence
+from urllib.parse import quote
+
+from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
+from my.core.common import unique_everseen
+from my.core.sqlite import sqlite_connection
+
+import my.config
+
+from ._android_protobuf import parse_labeled, parse_list, parse_place
+
+
+logger = LazyLogger(__name__)
+
+
+# module settings; inherits from the user's my.config.google.maps.android section
+@dataclass
+class config(my.config.google.maps.android):
+    # path[s]/glob to the exported sqlite databases
+    export_path: Paths
+
+
+def inputs() -> Sequence[Path]:
+    """Resolve the configured export path(s) into the database files to process."""
+    # TODO not sure if need to use all dbs? possibly the last one contains everything?
+    export_path = config.export_path
+    return get_files(export_path)
+
+
+# plain string aliases, used to make the annotations in this module more descriptive
+PlaceId = str
+ListId = str
+ListName = str
+
+
+@dataclass(eq=True, frozen=True)
+class Location:
+    """Immutable latitude/longitude pair."""
+    lat: float
+    lon: float
+
+    @property
+    def url(self) -> str:
+        """Google Maps link that queries for these coordinates."""
+        coordinates = f'{self.lat},{self.lon}'
+        return 'https://maps.google.com/?q=' + coordinates
+
+
+@dataclass(unsafe_hash=True)
+class Place:
+    """
+    A single place extracted from the database.
+
+    unsafe_hash=True makes instances hashable even though the class is not frozen.
+    """
+    id: PlaceId
+    list_name: ListName  # TODO maybe best to keep list id?
+    created_at: datetime_aware  # TODO double check it's utc?
+    updated_at: datetime_aware  # TODO double check it's utc?
+    title: str
+    location: Location
+    address: Optional[str]
+    note: Optional[str]
+
+    @property
+    def place_url(self) -> str:
+        """Link to the place page, built from the title and the place id."""
+        # safe='' so that a '/' in the title is escaped too, instead of adding an extra path segment
+        title = quote(self.title, safe='')
+        return f'https://www.google.com/maps/place/{title}/data=!4m2!3m1!1s{self.id}'
+
+    @property
+    def location_url(self) -> str:
+        """Link to the coordinates of the place (see Location.url)."""
+        return self.location.url
+
+
+def _timestamp_to_utc(ts: Any) -> datetime:
+    """Convert a parsed Timestamp message (seconds + nanos fields) into a timezone-aware UTC datetime."""
+    # TODO double check timezone
+    whole_seconds = datetime.fromtimestamp(ts.seconds, tz=timezone.utc)
+    # datetime only has microsecond resolution, so the nanoseconds are truncated
+    return whole_seconds.replace(microsecond=ts.nanos // 1000)
+
+
+def _row_location(row: Any) -> Location:
+    """Build a Location from the latitude_e6/longitude_e6 columns of a sync_item_data row."""
+    # the columns are divided by 1e6 -- presumably they hold micro-degrees
+    return Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)
+
+
+def _read_lists(conn: Any) -> dict[ListId, ListName]:
+    """Collect a list id -> list name mapping from the rows with corpus == 13."""
+    lists: dict[ListId, ListName] = {}
+    for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 13'):  # 13 looks like lists (e.g. saved/favorited etc)
+        server_id = row['server_id']
+
+        if server_id is None:
+            # this is the case for Travel plans, Followed places, Offers
+            # todo alternatively could use string_index column instead maybe?
+            continue
+
+        msg: Any = parse_list(row['item_proto'])
+        lists[server_id] = msg.f1.name
+    return lists
+
+
+def _labeled_places(conn: Any) -> Iterator[Place]:
+    """Yield the places from the rows with corpus == 11; they all get the 'Labeled' list name."""
+    for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 11'):  # this looks like 'Labeled' list
+        # the timestamp column is divided by 1000 -- presumably it holds milliseconds
+        created = datetime.fromtimestamp(row['timestamp'] / 1000, tz=timezone.utc)
+
+        [item_type, item_id] = row['server_id'].split(':')
+        if item_type != '3':
+            # the ones that are not 3 are home/work address?
+            continue
+
+        msg: Any = parse_labeled(row['item_proto'])
+        # an unset protobuf string comes back as '', normalise it to None
+        address = msg.address.full or None
+
+        yield Place(
+            id=item_id,
+            list_name='Labeled',
+            created_at=created,
+            updated_at=created,  # doesn't look like it has 'updated'?
+            title=msg.title,
+            location=_row_location(row),
+            address=address,
+            note=None,  # don't think these allow notes
+        )
+
+
+def _list_places(conn: Any, lists: dict[ListId, ListName]) -> Iterator[Place]:
+    """
+    Yield the places from the rows with corpus == 14.
+
+    lists is the list id -> list name mapping; a row referring to an unknown list id raises KeyError.
+    """
+    for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 14'):  # this looks like actual individual places
+        # server_id has four ':'-separated parts: the list id, an ignored part, and the two halves of the place id
+        [list_id, _, id1, id2] = row['server_id'].split(':')
+        item_id = f'{id1}:{id2}'
+
+        list_name = lists[list_id]
+
+        msg: Any = parse_place(row['item_proto'])
+        details = msg.f1
+
+        created = _timestamp_to_utc(details.created)
+        # NOTE: this one seems to be the same as row['timestamp']
+        updated = _timestamp_to_utc(details.updated)
+
+        # an unset protobuf string comes back as '', normalise it to None
+        note = details.note or None
+        # NOTE: there is also addr2, but they seem identical :shrug:
+        address = msg.f2.addr1 or None
+
+        # ugh. in my case created_at <= updated_at is violated by one place by about 1 second??
+        # so there is deliberately no assertion about it here
+        yield Place(
+            id=item_id,
+            list_name=list_name,
+            created_at=created,
+            updated_at=updated,
+            title=details.title,
+            location=_row_location(row),
+            address=address,
+            note=note,
+        )
+
+
+def _process_one(f: Path) -> Iterator[Place]:
+    """
+    Yield all places from a single exported database file.
+
+    The lists are read first (their names are needed for the individual places),
+    then the 'Labeled' places are yielded, then the places that belong to lists.
+    The connection stays open for as long as the generator is being consumed.
+    """
+    with sqlite_connection(f, row_factory='row') as conn:
+        lists = _read_lists(conn)
+        yield from _labeled_places(conn)
+        yield from _list_places(conn, lists)
+
+
+def saved() -> Iterator[Res[Place]]:
+    """Yield the places from all input databases, with duplicates dropped by unique_everseen."""
+    def it() -> Iterator[Res[Place]]:
+        # goes through the databases in the order inputs() returns them, logging the progress
+        paths = inputs()
+        total = len(paths)
+        width = len(str(total))  # pad the counters so the log lines align
+        for idx, path in enumerate(paths):
+            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
+            yield from _process_one(path)
+
+    # NOTE: the function itself is passed, not the iterator it returns
+    deduplicated = unique_everseen(it)
+    return deduplicated
+
+
+# Summary of databases on Android (as of 20240101)
+# -1_optimized_threads.notifications.db -- empty
+# 1_optimized_threads.notifications.db  -- empty
+# 1_tasks.notifications.db              -- empty
+# -1_threads.notifications.db           -- empty
+# 1_threads.notifications.db            -- doesn't look like anything interesting, some trip anniversaries etc?
+# 1_thread_surveys.notifications.db     -- empty
+# 2_threads.notifications.db            -- empty
+# accounts.notifications.db             -- just one row with account id
+# brella_example_store                  -- empty
+# gmm_myplaces.db                       -- contains just a few places? I think it's a subset of "Labeled"
+# gmm_storage.db                        -- pretty huge, like 50Mb. I suspect it contains cache for places on maps or something
+# gmm_sync.db                           -- processed above
+# gnp_fcm_database                      -- list of accounts
+# google_app_measurement_local.db       -- empty
+# inbox_notifications.db                -- nothing interesting
+# <email>_room_notifications.db         -- trip anniversaries?
+# lighter_messaging_1.db                -- empty
+# lighter_messaging_2.db                -- empty
+# lighter_registration.db               -- empty
+# peopleCache_<email>_com.google_14.db  -- contacts cache or something
+# portable_geller_<email>.db            -- looks like analytics
+# primes_example_store                  -- looks like analytics
+# pseudonymous_room_notifications.db    -- looks like analytics
+# ue3.db                                -- empty
+# ugc_photos_location_data.db           -- empty
+# ugc-sync.db                           -- empty
+# updates-tab-visit.db                  -- empty
diff --git a/tox.ini b/tox.ini
@@ -143,6 +143,7 @@ commands =
         my.fbmessenger.export     \
         my.github.ghexport        \
         my.goodreads              \
+        my.google.maps.android    \
         my.google.takeout.parser  \
         my.hackernews.harmonic    \
         my.hypothesis             \