In [22]:
import json
import pandas as pd
from matplotlib import pyplot as plt

#### Let's explore tags content for a sample of changesets

In [66]:
# First we run the fetch_and_process_changesets function locally
range_inf,range_sup = 6135330, 6135350
!python manage.py shell -c \
    "from changesets.osm_fetcher import fetch_and_process_changesets; fetch_and_process_changesets({range_inf},{range_sup}, save_locally=True)"

Sequence 6135330 already processed.
Sequence 6135331 already processed.
Sequence 6135332 already processed.
Sequence 6135333 already processed.
Sequence 6135334 already processed.
Sequence 6135335 already processed.
Sequence 6135336 already processed.
Sequence 6135337 already processed.
Sequence 6135338 already processed.
Sequence 6135339 already processed.
Sequence 6135340 already processed.
Sequence 6135341 already processed.
Sequence 6135342 already processed.
Sequence 6135343 already processed.
Sequence 6135344 already processed.
Sequence 6135345 already processed.
Sequence 6135346 already processed.
Sequence 6135347 already processed.
Sequence 6135348 already processed.
Sequence 6135349 already processed.
Sequence 6135350 already processed.


#### We read the local data and save it in "all_data" variable

In [67]:
all_data = []
changesets = range(range_inf, range_sup)
# import .jsonl files from output directory
for i in changesets:
    with open(f'output/{i}.jsonl', 'r') as f:
        data = [json.loads(line) for line in f]
        all_data.extend(data)
    

In [68]:
len(all_data)

1054

### Number of unique tag keys:

In [69]:
def get_unique_tag_keys(data):
    set_ = set()
    for d in data:
        set_ = set_.union(set(d['tags'].keys()))
    return set_

print(len(get_unique_tag_keys(all_data)))
print(get_unique_tag_keys(all_data))

56


### Let's sum up all the unique tags in a DataFrame

In [70]:
all_tags = []
for data in all_data:
    all_tags.append(data['tags'])

df_tags = pd.DataFrame(all_tags)

df_tags.head(3)

Unnamed: 0,comment,created_by,locale,source,StreetComplete:quest_type,changesets_count,host,ideditor:walkthrough_completed,ideditor:walkthrough_progress,ideditor:walkthrough_started,...,warnings:crossing_ways:building-building,resolved:crossing_ways:building-highway,resolved:crossing_ways:highway-railway,created_by:library,website,answer,warnings:close_nodes:vertices,resolved:crossing_ways:highway-highway,warnings:outdated_tags:noncanonical_brand,resolved:crossing_ways:building-building
0,Specify parking types,StreetComplete 58.2,de-DE,survey,AddParkingType,,,,,,...,,,,,,,,,,
1,Specify whether public transport stops have bins,StreetComplete 58.2,de-DE,survey,AddBinStatusOnBusStop,,,,,,...,,,,,,,,,,
2,Specify whether pedestrian crossings have islands,StreetComplete 58.2,de-DE,survey,AddCrossingIsland,,,,,,...,,,,,,,,,,


### Number of unique values per tag ?
#### (less unique values makes the tag a good candidate for becoming a Changeset model attribute)

In [71]:
df_uniques = pd.DataFrame(df_tags.nunique()).reset_index().rename(columns={'index': 'tag', 0: 'unique_values'})
df_uniques

Unnamed: 0,tag,unique_values
0,comment,543
1,created_by,73
2,locale,47
3,source,99
4,StreetComplete:quest_type,111
5,changesets_count,322
6,host,13
7,ideditor:walkthrough_completed,1
8,ideditor:walkthrough_progress,8
9,ideditor:walkthrough_started,1


In [89]:
df_tags['hashtags'].unique()

array([nan, '#hotosm-project-16817;#UNMappers;#UNMOGIP;#Jammu;#Kashmir',
       '#geohealthresearch-project-26;#PivotscienceActivity', '#ola;#rdv',
       '#irsju', '#hotosm-project-16875;#GeOSM;#PUC', '#adt',
       '#hotosm-project-17137;#bloomberg;#emea',
       '#hotosm-project-15229;#OSMIndia;#OSMNorthEastIndia;#GREd;#GREdFoundation;#AssamMapping23;#OMGuru;#APHub;#CCC2024;#tt_event',
       '#BANO;#Pifometre', '#amap', '#tomtom;#tt_mapfeedback',
       '#hotosm-project-14389;#MangochiDistrict;#msf;#missingmaps;#malawicholeraoutbreak',
       '#maproulette', '#OpenCampingMap', '#lgmaps', '#swiggy',
       '#PUC;#EST;#tt_event',
       '#hotosm-project-15188;#Ndjamena;#msf;#missingmaps;#Ndjamena2023;#Msft;#24VR',
       '#localknowledge;#grabosm',
       '#hotosm-project-16259;#osmTaiwan;#NLSCopendata;#rivertracing',
       '#MSFTOpenMaps',
       '#hotosm-project-17266;#UPMAPPERS;#WAFARI;#YouthMappers;#SouthAfrica;#ESADisasterResponse',
       '#maproulette;#tomtom', '#mapbox_linte

### For the "unique" unique values (=1), what are they for each key ? 

In [72]:
# list of tags with only 1 unique value
l_1 = list(df_uniques['tag'][df_uniques['unique_values'] == 1])
l_1[-5:] # last five

['answer',
 'resolved:crossing_ways:highway-highway',
 'resolved:crossing_ways:building-building']

In [73]:
df_tags['create'][~df_tags['create'].isna()]

352    1
838    1
Name: create, dtype: object

In [83]:
# show df for lines 352 and 838 : two changesets concerned by the same tag (NaN for all other changesets)
df_tags.iloc[[352,838]].T

Unnamed: 0,352,838
comment,Adding data with #MapComplete for theme #ghost...,Adding data with #MapComplete for theme #hotels
created_by,MapComplete 0.44.13,MapComplete 0.44.13
locale,de-DE,en
source,,
StreetComplete:quest_type,,
changesets_count,,
host,https://mapcomplete.org/ghostbikes.html,https://mapcomplete.org/hotels.html
ideditor:walkthrough_completed,,
ideditor:walkthrough_progress,,
ideditor:walkthrough_started,,


In [75]:
df_tags['description'].value_counts().rename_axis('unique_values').reset_index(name='count')

Unnamed: 0,unique_values,count
0,https://wiki.openstreetmap.org/wiki/Automated_...,2


### DataFrame that sums up the occurrence of each unique uniques (unique values == 1)

In [84]:
df_tags[l_1].apply(pd.Series.value_counts).rename_axis('unique unique_values').T

unique unique_values,1,10,2,3820706,4,6,OSM Request 1.2.9,alias8,app.organicmaps,https://community.openstreetmap.org/t/mass-remove-gnis-created-and-similar-tags/107018,https://github.com/matkoniecz/osm_bot_abstraction_layer,https://matkoniecz.github.io/OSM-wikipedia-tag-validator-reports/,https://pic4review.pavie.info/#/mission/2732/review,https://wiki.openstreetmap.org/wiki/Automated_Edits/b-jazz,https://wiki.openstreetmap.org/wiki/Mechanical_Edits/Mateusz_Konieczny_-_bot_account/remove_not_needed_GNIS_tags,yes
ideditor:walkthrough_completed,,,,,,,,,,,,,,,,3.0
ideditor:walkthrough_started,,,,,,,,,,,,,,,,24.0
resolved:almost_junction:highway-highway,1.0,,,,,,,,,,,,,,,
resolved:help_request:fixme_tag,3.0,,,,,,,,,,,,,,,
resolved:mismatched_geometry:point_as_vertex,2.0,,,,,,,,,,,,,,,
resolved:close_nodes:vertices,,,1.0,,,,,,,,,,,,,
warnings:missing_tag:descriptive,1.0,,,,,,,,,,,,,,,
bundle_id,,,,,,,,,7.0,,,,,,,
warnings:crossing_ways:building-highway,1.0,,,,,,,,,,,,,,,
closed:note,,,,1.0,,,,,,,,,,,,
