## Wikidata linked data

Collect wikidata ids from lists in folder `inferences/wikidata-referencees`

In [1]:
import glob
import pandas as pd

# get list of all csv files in directory `../inferences/wikidata-references/`
csv_files = glob.glob("../inferences/wikidata-references/*.csv")
wikidata_entities = set()
for file in csv_files:
    print(file)
    df = pd.read_csv(file)
    # Assuming the wikidata id column is named 'wikidata_id'
    if 'wikidata_id' in df.columns:
        wikidata_entities.update(df['wikidata_id'].dropna().astype(str))
    print(len(df))
wikidata_entities = list(set(wikidata_entities))
print(len(wikidata_entities))


../inferences/wikidata-references/residences-1701.csv
144
../inferences/wikidata-references/residences-1644-1701.csv
359
../inferences/wikidata-references/locations_names_wikidata.csv
923
../inferences/wikidata-references/residences-1644.csv
215
1076


## Fetch wikidata data

In [2]:
%pip install pywikibot

.bash_profile RUN!
Note: you may need to restart the kernel to use updated packages.


Get all the wikidata ids in the database


In [3]:
from dehergne_util import locations_wikidata_info_file

# preload dataframe entities_df from xls file "../inferences/locations_wikidata_info.xlsx"
entities_cache = pd.read_excel(locations_wikidata_info_file, index_col=0)


In [4]:
entities_cache.loc['Q1011103',]


chinese_label                                                      富阳区
english_label                                          Fuyang District
portuguese_label                                                Fuyang
english_description               district of Zhejiang Province, China
chinese_description                                       中国浙江省杭州市的市辖区
portuguese_description                                             NaN
coordinates                                      (30.04998, 119.93697)
latitude                                                      30.04998
longitude                                                    119.93697
administrative_entity_id                                         Q4970
administrative_entity_label_en                                Hangzhou
administrative_entity_label_zh                                     杭州市
country_id                                                        Q148
country_label                               People's Republic of China
label 

In [5]:
entities_cache.sample(10)

Unnamed: 0_level_0,chinese_label,english_label,portuguese_label,english_description,chinese_description,portuguese_description,coordinates,latitude,longitude,administrative_entity_id,administrative_entity_label_en,administrative_entity_label_zh,country_id,country_label,label
wikidata_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Q117676,維塞烏,Viseu,Viseu,municipality and city in Portugal,葡萄牙维塞乌区市镇,município e cidade de Portugal,"(40.6575, -7.913888888888889)",40.6575,-7.913889,Q273525,Viseu,維塞烏區,Q45,Portugal,Viseu / 維塞烏
Q215755,湘潭市,Xiangtan,Xiangtan,"prefecture-level city in Hunan, China",中國湖南省的地級市,,"(27.84306, 112.92283)",27.84306,112.92283,Q45761,Hunan,湖南省,Q148,People's Republic of China,Xiangtan / 湘潭市
Q38234,长崎市,Nagasaki,Nagasaki,"core city in Kyushu, Japan",位於日本九州西部的都市，為長崎縣縣治,cidade japonesa,"(32.74952777777778, 129.87963888888888)",32.749528,129.879639,Q169376,Nagasaki Prefecture,長崎縣,Q17,Japan,Nagasaki / 长崎市
Q2833,弗赖堡,Freiburg im Breisgau,Freiburg im Breisgau,"large city in Baden-Württemberg, Germany",德国巴登-符腾堡州城市,"cidade de Baden-Württemberg, na Alemanha","(47.995, 7.85)",47.995,7.85,Q8167,Freiburg Government Region,弗赖堡行政区,Q183,Germany,Freiburg im Breisgau / 弗赖堡
Q171943,南昌市,Nanchang,Nanchang,"capital of Jiangxi province, China",中國江西省省會，地級市,,"(28.684167, 115.887222)",28.684167,115.887222,Q57052,Jiangxi,江西省,Q148,People's Republic of China,Nanchang / 南昌市
Q15905472,平阳府,Pingyang Fu,,historical administrative division of China,中國古代行政區劃,,,,,Q9574531,Shansi,山西省,Q814959,Beiyang Government,Pingyang Fu / 平阳府
Q1365421,天主教兰斯总教区,Roman Catholic Archdiocese of Reims,Arquidiocese de Reims,Catholic archdiocese in France,维基媒体列表条目,,,,,,,,Q142,France,Roman Catholic Archdiocese of Reims / 天主教兰斯总教区
Q216077,九江市,Jiujiang,Jiujiang,"prefecture-level city in Jiangxi, China",中国江西省西北部的地级市,,"(29.70475, 116.00206)",29.70475,116.00206,Q57052,Jiangxi,江西省,Q148,People's Republic of China,Jiujiang / 九江市
Q419,秘鲁,Peru,Peru,sovereign state in South America,南美洲國家,país na América do Sul,"(-9.4, -76)",-9.4,-76.0,,,,Q419,Peru,Peru / 秘鲁
Q6482357,,Lampacau,Lampacau,,,,"(22.077, 113.44)",22.077,113.44,,,,Q148,People's Republic of China,Lampacau /


In [6]:
import time
import pywikibot
import pandas as pd
from dehergne_util import locations_wikidata_info_file

site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()

entities_data = []
entities_from_cache = []
problems = []  # to store any problems encountered during the fetching process
for wikidata_id in wikidata_entities:
    if wikidata_id == 'Q11146687':
        pass
    # check if the wikidata_id is already in the cache
    if wikidata_id in entities_cache.index:
        entities_from_cache.append(wikidata_id)
        # print(f"{wikidata_id} already in cache, skipping...")
        continue
    entity_data = {}
    # Fetch the item from Wikidata
    entity_data['wikidata_id'] = wikidata_id
    print(f"Fetching {wikidata_id} from Wikidata...")
    # Use the repository to get the item
    try:
        item = pywikibot.ItemPage(repo, wikidata_id)
    except Exception as e:
        problems.append({'wikidata_id': wikidata_id, 'exception': str(e)})
        continue
    item_dict = item.get()
    entity_data['chinese_label'] = item_dict['labels'].get('zh', None)
    entity_data['english_label'] = item_dict['labels'].get('en', None)
    entity_data['portuguese_label'] = item_dict['labels'].get('pt', None)
    entity_data['english_description'] = item_dict['descriptions'].get('en', None)
    entity_data['chinese_description'] = item_dict['descriptions'].get('zh', None)
    entity_data['portuguese_description'] = item_dict['descriptions'].get('pt', None)
    # get the coordinates if they exist
    coord = None
    if 'P625' in item_dict['claims']:
        coord_claim = item_dict['claims']['P625'][0].getTarget()
        coord = (coord_claim.lat, coord_claim.lon)
    entity_data['coordinates'] = coord
    # store lat and lon in separate keys
    entity_data['latitude'] = coord[0] if coord else None
    entity_data['longitude'] = coord[1] if coord else None
    # get the administrative territorial entity if it exists (P131)
    administrative_entity = None
    administrative_entity_label_en = None
    administrative_entity_label_zh = None
    administrative_entity_id = None
    administrative_entity_claims = item_dict['claims'].get('P131', [])
    if administrative_entity_claims:
        administrative_entity = administrative_entity_claims[0].getTarget()
        administrative_entity_id = administrative_entity.id
        administrative_entity_label_en = administrative_entity.get()['labels'].get('en', None)
        administrative_entity_label_zh = administrative_entity.get()['labels'].get('zh', None)
    # store administrative information in entity_data
    entity_data['administrative_entity_id'] = administrative_entity_id
    entity_data['administrative_entity_label_en'] = administrative_entity_label_en
    entity_data['administrative_entity_label_zh'] = administrative_entity_label_zh
    # get the country
    country = None
    country_label = None
    country_id = None
    country_claims = item_dict['claims'].get('P17', [])
    if country_claims:
        country = country_claims[0].getTarget()
        country_id = country.id
        country_label = country.get()['labels'].get('en', None)
    entity_data['country_id'] = country_id
    entity_data['country_label'] = country_label
    # append the entity data to the list
    entities_data.append(entity_data)

    print(f"Fetched {wikidata_id} with labels: {entity_data['english_label']}, {entity_data['portuguese_label']}, {entity_data['chinese_label']}, {entity_data['administrative_entity_label_en']}, {entity_data['administrative_entity_label_zh']}, {entity_data['country_label']}")
    time.sleep(0.5)  # to avoid hitting the API too hard

# Convert the list of dictionaries to a DataFrame
entities_df = pd.DataFrame(entities_data)
if len(entities_df) == 0:
    print("No new entities found.")
    entities_df = entities_cache  # use the cached data if no new entities found
else:
    print(f"Fetched {len(entities_df)} new entities from Wikidata.")
    entities_df.set_index('wikidata_id', inplace=True)

# get the rows entities_cache in entities_from_cache list of ids
# this removes from cache entities not in the current run
entities_cache = entities_cache[entities_cache.index.isin(entities_from_cache)]
# append the new data to the existing cache
entities_df = pd.concat([entities_cache, entities_df], axis=0)



Fetching No wikidata from Wikidata...
Fetching Q950944 from Wikidata...
Fetched Q950944 with labels: Yongji, None, 永济市, Yuncheng, 运城市, People's Republic of China
Fetched 1 new entities from Wikidata.


In [7]:
# print the problems encountered one per line
for problem in problems:
    print(f"Problem with {problem['wikidata_id']}: {problem['exception']}")

Problem with No wikidata: 'No wikidata' is not a valid item page title


In [8]:
# show duplicates in the entities_df use index
duplicates = entities_df.index[entities_df.index.duplicated()].unique()
duplicates

Index([], dtype='object', name='wikidata_id')

In [9]:
# remove duplicates from entities_df
print(f"Removing {len(duplicates)} duplicates from entities_df, before removal: {len(entities_df)} rows.")
entities_df = entities_df[~entities_df.index.duplicated(keep='first')]
print(f"After removal: {len(entities_df)} rows.")

Removing 0 duplicates from entities_df, before removal: 1075 rows.
After removal: 1075 rows.


In [10]:
entities_df.info()
entities_df.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 1075 entries, Q90 to Q950944
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   chinese_label                   1016 non-null   object 
 1   english_label                   1070 non-null   object 
 2   portuguese_label                889 non-null    object 
 3   english_description             1057 non-null   object 
 4   chinese_description             770 non-null    object 
 5   portuguese_description          619 non-null    object 
 6   coordinates                     1036 non-null   object 
 7   latitude                        1036 non-null   float64
 8   longitude                       1036 non-null   float64
 9   administrative_entity_id        971 non-null    object 
 10  administrative_entity_label_en  967 non-null    object 
 11  administrative_entity_label_zh  856 non-null    object 
 12  country_id                      10

Unnamed: 0_level_0,chinese_label,english_label,portuguese_label,english_description,chinese_description,portuguese_description,coordinates,latitude,longitude,administrative_entity_id,administrative_entity_label_en,administrative_entity_label_zh,country_id,country_label,label
wikidata_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Q90,巴黎,Paris,Paris,capital city and largest city of France,法國首都及最大都市,capital e maior cidade da França,"(48.85666666666667, 2.352222222222222)",48.856667,2.352222,Q16665915,Grand Paris,大巴黎都会区,Q142,France,Paris / 巴黎
Q41185,卡昂,Caen,Caen,"commune in Calvados, France",法国卡尔瓦多斯省市镇与省会,Comuna francesa,"(49.18138888888889, -0.3636111111111111)",49.181389,-0.363611,Q700794,arrondissement of Caen,卡昂区,Q142,France,Caen / 卡昂
Q869,泰國,Thailand,Tailândia,country in Southeast Asia,東南亞國家,país da Ásia,"(14, 101)",14.0,101.0,,,,Q869,Thailand,Thailand / 泰國
Q12193,布雷斯特,Brest,Brest,"port city in the Finistère department, Brittan...",法国菲尼斯泰尔省市镇与副省会,comuna francesa,"(48.39, -4.4869444444444)",48.39,-4.486944,Q700737,arrondissement of Brest,布雷斯特区,Q142,France,Brest / 布雷斯特
Q639421,本地治里市,Pondicherry,,"city in the Union Territory of Puducherry, India",,,"(11.93, 79.83)",11.93,79.83,Q66743,Puducherry,本地治里,Q668,India,Pondicherry / 本地治里市
Q1352,金奈,Chennai,Chennai,"city and state capital of Tamil Nadu, India","印度城市,泰米爾納德邦首府","cidade e capital do estado de Tamil Nadu, Índia","(13.0825, 80.275)",13.0825,80.275,Q15116,Chennai district,金奈縣,Q668,India,Chennai / 金奈
Q622778,東京,Tonkin,Tonkin,"northern part of Vietnam, to the west of the G...",印度支那的历史地名，位于今日越南北部,,"(21, 106)",21.0,106.0,,,,Q881,Vietnam,Tonkin / 東京
Q16572,广州市,Guangzhou,Cantão,"capital city of Guangdong Province, China",广东省的省会,capital da província chinesa de Cantão,"(23.13, 113.26)",23.13,113.26,Q15175,Guangdong,广东省,Q148,People's Republic of China,Guangzhou / 广州市
Q15175,广东省,Guangdong,Cantão,province of China,中华人民共和国省份,província da China,"(23.4, 113.5)",23.4,113.5,Q148,People's Republic of China,中华人民共和国,Q148,People's Republic of China,Guangdong / 广东省
Q155,巴西,Brazil,Brasil,country in South America,南美洲國家,país na América do Sul,"(-14, -53)",-14.0,-53.0,,,,Q200464,Portuguese Empire,Brazil / 巴西


In [11]:
entities_df.loc['Q1197421',]

chinese_label                                                     武进区
english_label                                          Wujin District
portuguese_label                                                  NaN
english_description               district of Jiangsu Province, China
chinese_description                                        江蘇省常州市的市辖区
portuguese_description                                            NaN
coordinates                                      (31.70204, 119.9377)
latitude                                                     31.70204
longitude                                                    119.9377
administrative_entity_id                                       Q57970
administrative_entity_label_en                              Changzhou
administrative_entity_label_zh                                    常州市
country_id                                                       Q148
country_label                              People's Republic of China
label               

In [12]:
from dehergne_util import locations_wikidata_info_file
entities_df.to_excel(locations_wikidata_info_file, index=True)
print(f"Saved {len(entities_df)} entities to {locations_wikidata_info_file}")

Saved 1075 entities to ../inferences/wikidata-references/locations_wikidata_info.xlsx


Entries with no coordinates

In [13]:
%pip install plotly nbformat

.bash_profile RUN!
Note: you may need to restart the kernel to use updated packages.


In [14]:
import plotly.express as px
import plotly.io as pio

# Create a 'label' column by concatenating 'english_label' and 'chinese_label'
# Handle potential None values by replacing them with empty strings
entities_df['label'] = entities_df['english_label'].fillna('') + ' / ' + entities_df['chinese_label'].fillna('')

# Drop rows where latitude or longitude is NaN before plotting
entities_df_map = entities_df.dropna(subset=['latitude', 'longitude'])

fig = px.scatter_map(
    entities_df_map,
    lat="latitude",
    lon="longitude",
    hover_name="label",  # Use the new 'label' column for hover text
    text="label",        # Show the label on the map
    zoom=3,

)

fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(autosize=True)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
#fig.update_traces(textfont=dict(size=6))
fig.show(config={"responsive": True})

# Export the figure to an HTML file
pio.write_html(fig,
                config={"responsive": True},
                include_plotlyjs=True,
                file='../inferences/map_all_locations.html',
                auto_open=True)