Skip to content


Permalink intitial module for extracting placed data from Andro…
Browse files Browse the repository at this point in the history
…id app
  • Loading branch information
karlicoss committed Jan 1, 2024
1 parent 93e4757 commit 276f64f
Show file tree
Hide file tree
Showing 4 changed files with 320 additions and 0 deletions.
4 changes: 4 additions & 0 deletions my/
Expand Up @@ -68,6 +68,10 @@ class pinboard:
export_dir: Paths = ''

class google:
class maps:
class android:
export_path: Paths = ''

takeout_path: Paths = ''

Expand Down
113 changes: 113 additions & 0 deletions my/google/maps/
@@ -0,0 +1,113 @@
from my.core import __NOT_HPI_MODULE__

# NOTE: this tool was quite useful

from google.protobuf import descriptor_pool, descriptor_pb2, message_factory

# Short aliases for the protobuf field *type* enum values used by the descriptors below
TYPE_STRING = descriptor_pb2.FieldDescriptorProto.TYPE_STRING
TYPE_BYTES = descriptor_pb2.FieldDescriptorProto.TYPE_BYTES
TYPE_UINT64 = descriptor_pb2.FieldDescriptorProto.TYPE_UINT64
TYPE_MESSAGE = descriptor_pb2.FieldDescriptorProto.TYPE_MESSAGE

# Short aliases for the protobuf field *label* enum values used by the descriptors below
OPTIONAL = descriptor_pb2.FieldDescriptorProto.LABEL_OPTIONAL
REQUIRED = descriptor_pb2.FieldDescriptorProto.LABEL_REQUIRED

def get_place_protos():
    """
    Build the message descriptors used to decode an individual saved place blob.

    Returns a list ``[PlaceParser, Timestamp, xf1, xf2, xf3]``: the top-level
    message comes first, followed by the nested message types it refers to.
    This is the argument order ``make_parser`` expects (main, then extras).
    """
    f1 = descriptor_pb2.DescriptorProto(name='xf1')
    # TODO 2 -> 5 is address? 2 -> 6 is a pair of coordinates
    f1.field.add(name='title', number=3, type=TYPE_STRING, label=REQUIRED)
    f1.field.add(name='note' , number=4, type=TYPE_STRING, label=OPTIONAL)
    # TODO what's the difference between required and optional? doesn't impact decoding?

    ts = descriptor_pb2.DescriptorProto(name='Timestamp')
    ts.field.add(name='seconds', number=1, type=TYPE_UINT64, label=REQUIRED)
    ts.field.add(name='nanos'  , number=2, type=TYPE_UINT64, label=REQUIRED)

    # message-typed fields have to name the message they hold via type_name
    f1.field.add(name='created', number=10, type=TYPE_MESSAGE, label=REQUIRED, type_name=ts.name)
    f1.field.add(name='updated', number=11, type=TYPE_MESSAGE, label=REQUIRED, type_name=ts.name)

    f2 = descriptor_pb2.DescriptorProto(name='xf2')
    f2.field.add(name='addr1', number=2, type=TYPE_STRING, label=REQUIRED)
    f2.field.add(name='addr2', number=3, type=TYPE_STRING, label=REQUIRED)
    f2.field.add(name='f21'  , number=4, type=TYPE_BYTES , label=REQUIRED)
    f2.field.add(name='f22'  , number=5, type=TYPE_UINT64, label=REQUIRED)
    f2.field.add(name='f23'  , number=6, type=TYPE_STRING, label=REQUIRED)
    # NOTE: this also contains place ID

    f3 = descriptor_pb2.DescriptorProto(name='xf3')
    # NOTE: looks like it's the same as 'updated' from above??
    f3.field.add(name='f31', number=1, type=TYPE_UINT64, label=OPTIONAL)

    descriptor_proto = descriptor_pb2.DescriptorProto(name='PlaceParser')
    descriptor_proto.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=f1.name)
    descriptor_proto.field.add(name='f2', number=2, type=TYPE_MESSAGE, label=REQUIRED, type_name=f2.name)
    descriptor_proto.field.add(name='f3', number=3, type=TYPE_MESSAGE, label=OPTIONAL, type_name=f3.name)
    descriptor_proto.field.add(name='f4', number=4, type=TYPE_STRING , label=OPTIONAL)
    # NOTE: f4 is the list id

    return [descriptor_proto, ts, f1, f2, f3]

def get_labeled_protos():
    """
    Build the message descriptors used to decode a 'Labeled' item blob.

    Returns ``[LabeledParser, address]``: the top-level message first, then
    the nested message type it refers to.
    """
    address = descriptor_pb2.DescriptorProto(name='address')
    # 1: address
    # 2: parts of address (multiple)
    # 3: full address
    address.field.add(name='full', number=3, type=TYPE_STRING, label=REQUIRED)

    main = descriptor_pb2.DescriptorProto(name='LabeledParser')
    # field 1 contains item type and item id
    main.field.add(name='title'  , number=3, type=TYPE_STRING , label=REQUIRED)
    # message-typed fields have to name the message they hold via type_name
    main.field.add(name='address', number=5, type=TYPE_MESSAGE, label=OPTIONAL, type_name=address.name)

    return [main, address]

def get_list_protos():
    """
    Build the message descriptors used to decode a list blob.

    Returns ``[ListParser, xf1]``: the top-level message first, then the
    nested message type it refers to (which carries the list ``name``).
    """
    f1 = descriptor_pb2.DescriptorProto(name='xf1')
    f1.field.add(name='name', number=5, type=TYPE_STRING, label=REQUIRED)

    main = descriptor_pb2.DescriptorProto(name='ListParser')
    # message-typed fields have to name the message they hold via type_name
    main.field.add(name='f1', number=1, type=TYPE_MESSAGE, label=REQUIRED, type_name=f1.name)
    main.field.add(name='f2', number=2, type=TYPE_STRING , label=REQUIRED)

    return [main, f1]

def make_parser(main, *extras):
    """
    Create a message class for ``main`` at runtime, without a compiled .proto file.

    :param main: descriptor of the top-level message; the returned class decodes this message
    :param extras: descriptors of the other message types that ``main`` (or each other) refer to
    :return: a message class; instantiate it and call ``ParseFromString`` to decode a blob
    """
    file_descriptor_proto = descriptor_pb2.FileDescriptorProto(name='dynamic.proto', package='dynamic_package')
    # every message type must be registered in the file, otherwise type_name references can't be resolved
    for proto in [main, *extras]:
        file_descriptor_proto.message_type.add().CopyFrom(proto)

    # use a private pool so the dynamic types don't clash with anything else
    pool = descriptor_pool.DescriptorPool()
    pool.Add(file_descriptor_proto)

    message_descriptor = pool.FindMessageTypeByName(f'{file_descriptor_proto.package}.{main.name}')

    # newer protobuf releases expose GetMessageClass and deprecate MessageFactory.GetPrototype;
    # prefer the former when present and fall back for older releases
    get_message_class = getattr(message_factory, 'GetMessageClass', None)
    if get_message_class is not None:
        return get_message_class(message_descriptor)

    factory = message_factory.MessageFactory(pool)
    return factory.GetPrototype(message_descriptor)

# Message classes are built once, when the module is imported,
# and reused by the parse_* helpers below
place_parser_class = make_parser(*get_place_protos())
labeled_parser_class = make_parser(*get_labeled_protos())
list_parser_class = make_parser(*get_list_protos())

def parse_place(blob: bytes):
    """Decode ``blob`` as a ``PlaceParser`` message and return the populated message."""
    m = place_parser_class()
    # without this the message would be returned empty and blob would be unused
    m.ParseFromString(blob)
    return m

def parse_labeled(blob: bytes):
    """Decode ``blob`` as a ``LabeledParser`` message and return the populated message."""
    m = labeled_parser_class()
    # without this the message would be returned empty and blob would be unused
    m.ParseFromString(blob)
    return m

def parse_list(blob: bytes):
    """Decode ``blob`` as a ``ListParser`` message and return the populated message."""
    msg = list_parser_class()
    # without this the message would be returned empty and blob would be unused
    msg.ParseFromString(blob)
    return msg
202 changes: 202 additions & 0 deletions my/google/maps/
@@ -0,0 +1,202 @@
'''
Extracts data from the official Google Maps app for Android (uses gmm_sync.db for now)
'''
from __future__ import annotations

# extra packages this module depends on
REQUIRES = [
    "protobuf",  # for parsing blobs from the database
]

from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterator, Optional, Sequence
from urllib.parse import quote

from my.core import datetime_aware, get_files, LazyLogger, Paths, Res
from my.core.common import unique_everseen
from my.core.sqlite import sqlite_connection

import my.config

from ._android_protobuf import parse_labeled, parse_list, parse_place

# module-level logger, named after this module
logger = LazyLogger(__name__)

class config(my.config.google.maps.android):
    """Module configuration, inherited from the user's ``my.config.google.maps.android`` section."""

    # paths[s]/glob to the exported sqlite databases
    export_path: Paths

def inputs() -> Sequence[Path]:
    """Return the exported database files matched by ``config.export_path``."""
    # TODO not sure if need to use all dbs? possibly the last one contains everything?
    export_path = config.export_path
    return get_files(export_path)

# Plain string aliases, used in annotations below to tell the different kinds of ids/names apart
PlaceId = str
ListId = str
ListName = str

@dataclass(eq=True, frozen=True)
class Location:
    """Latitude/longitude pair; immutable and hashable."""

    lat: float
    lon: float

    @property
    def url(self) -> str:
        """Google Maps link pointing at these coordinates."""
        return f'https://maps.google.com/?q={self.lat},{self.lon}'

# needs to be a dataclass since instances are built by keyword;
# unsafe_hash makes instances hashable (they are passed through unique_everseen)
@dataclass(unsafe_hash=True)
class Place:
    """A single saved/labeled place extracted from the database."""

    id: PlaceId
    list_name: ListName  # TODO maybe best to keep list id?
    created_at: datetime_aware  # TODO double check it's utc?
    updated_at: datetime_aware  # TODO double check it's utc?
    title: str
    location: Location
    address: Optional[str]
    note: Optional[str]

    @property
    def place_url(self) -> str:
        """Google Maps link to the place itself (by title and place id)."""
        title = quote(self.title)  # titles may contain spaces etc, so need escaping
        return f'https://www.google.com/maps/place/{title}/data=!4m2!3m1!1s{self.id}'

    @property
    def location_url(self) -> str:
        """Google Maps link to the place's coordinates."""
        return self.location.url

def _process_one(f: Path) -> Iterator[Place]:
    """
    Yield all places found in a single database file.

    Reads the ``sync_item_data`` table in three passes, by ``corpus``:
    13 (lists; used to map list ids to names), 11 ('Labeled' items) and 14 (individual saved places).

    :param f: path to the sqlite database
    """
    with sqlite_connection(f, row_factory='row') as conn:
        msg: Any

        lists: dict[ListId, ListName] = {}
        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 13'):  # 13 looks like lists (e.g. saved/favorited etc)
            server_id = row['server_id']

            if server_id is None:
                # this is the case for Travel plans, Followed places, Offers
                # todo alternatively could use string_index column instead maybe?
                continue

            blob = row['item_proto']
            msg = parse_list(blob)
            name = msg.f1.name
            lists[server_id] = name

        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 11'):  # this looks like 'Labeled' list
            ts = row['timestamp'] / 1000  # stored in milliseconds
            created = datetime.fromtimestamp(ts, tz=timezone.utc)

            server_id = row['server_id']
            [item_type, item_id] = server_id.split(':')
            if item_type != '3':
                # the ones that are not 3 are home/work address?
                continue

            blob = row['item_proto']
            msg = parse_labeled(blob)
            title = msg.title
            address = msg.address.full
            if address == '':
                address = None

            location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)

            yield Place(
                id=item_id,
                list_name='Labeled',
                created_at=created,
                updated_at=created,  # doesn't look like it has 'updated'?
                title=title,
                location=location,
                address=address,
                note=None,  # don't think these allow notes
            )

        for row in conn.execute('SELECT * FROM sync_item_data WHERE corpus == 14'):  # this looks like actual individual places
            server_id = row['server_id']
            [list_id, _, id1, id2] = server_id.split(':')
            item_id = f'{id1}:{id2}'

            list_name = lists[list_id]

            blob = row['item_proto']
            msg = parse_place(blob)
            title = msg.f1.title
            note = msg.f1.note
            if note == '':  # seems that protobuf does that?
                note = None

            # TODO double check timezone
            created = datetime.fromtimestamp(msg.f1.created.seconds, tz=timezone.utc).replace(microsecond=msg.f1.created.nanos // 1000)

            # NOTE: this one seems to be the same as row['timestamp']
            updated = datetime.fromtimestamp(msg.f1.updated.seconds, tz=timezone.utc).replace(microsecond=msg.f1.updated.nanos // 1000)

            address = msg.f2.addr1  # NOTE: there is also addr2, but they seem identical :shrug:
            if address == '':
                address = None

            location = Location(lat=row['latitude_e6'] / 1e6, lon=row['longitude_e6'] / 1e6)

            place = Place(
                id=item_id,
                list_name=list_name,
                created_at=created,
                updated_at=updated,
                title=title,
                location=location,
                address=address,
                note=note,
            )

            # ugh. in my case it's violated by one place by about 1 second??
            # assert place.created_at <= place.updated_at
            yield place

def saved() -> Iterator[Res[Place]]:
    """Return the places from all input databases, with duplicates dropped by ``unique_everseen``."""
    def it() -> Iterator[Res[Place]]:
        paths = inputs()
        total = len(paths)
        width = len(str(total))  # pad the counters so the log lines align
        for idx, path in enumerate(paths):
            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
            yield from _process_one(path)
    # NOTE: the generator function itself is passed, not the result of calling it
    return unique_everseen(it)

# Summary of databases on Android (as of 20240101)
# -1_optimized_threads.notifications.db -- empty
# 1_optimized_threads.notifications.db -- empty
# 1_tasks.notifications.db -- empty
# -1_threads.notifications.db -- empty
# 1_threads.notifications.db -- doesn't look like anything interesting, some trip anniversaries etc?
# 1_thread_surveys.notifications.db -- empty
# 2_threads.notifications.db -- empty
# accounts.notifications.db -- just one row with account id
# brella_example_store -- empty
# gmm_myplaces.db -- contains just a few places? I think it's a subset of "Labeled"
# gmm_storage.db -- pretty huge, like 50Mb. I suspect it contains cache for places on maps or something
# gmm_sync.db -- processed above
# gnp_fcm_database -- list of accounts
# google_app_measurement_local.db -- empty
# inbox_notifications.db -- nothing interesting
# <email>_room_notifications.db -- trip anniversaries?
# lighter_messaging_1.db -- empty
# lighter_messaging_2.db -- empty
# lighter_registration.db -- empty
# peopleCache_<email>_com.google_14.db -- contacts cache or something
# portable_geller_<email>.db -- looks like analytics
# primes_example_store -- looks like analytics
# pseudonymous_room_notifications.db -- looks like analytics
# ue3.db -- empty
# ugc_photos_location_data.db -- empty
# ugc-sync.db -- empty
# updates-tab-visit.db -- empty
1 change: 1 addition & 0 deletions tox.ini
Expand Up @@ -143,6 +143,7 @@ commands =
my.fbmessenger.export \
my.github.ghexport \
my.goodreads \ \ \
my.hackernews.harmonic \
my.hypothesis \
Expand Down

0 comments on commit 276f64f

Please sign in to comment.