In [1]:
import wikipedia
import pandas as pd
import geopandas as gpd
from shapely.wkt import loads
import requests
import json
import numpy as np
from tqdm import tqdm_notebook
from qwikidata.sparql import (get_subclasses_of_item,
                              return_sparql_query_results)
from datetime import date

In [2]:
def get_coordinates(x):
    """Extract the first latitude/longitude pair of a MediaWiki page record.

    Parameters
    ----------
    x : mapping (dict or pandas Series)
        Page record. Its ``'coordinates'`` entry is expected to be a list
        whose first element is a dict holding ``'lat'`` and ``'lon'``.

    Returns
    -------
    tuple
        ``(lat, lon)`` from the first coordinate entry, or ``(0, 0)`` when
        the record carries no usable coordinates.
    """
    try:
        first = x['coordinates'][0]
        return first['lat'], first['lon']
    except (KeyError, IndexError, TypeError):
        # KeyError: 'coordinates' / 'lat' / 'lon' is absent.
        # IndexError: the coordinates list is empty.
        # TypeError: the entry is NaN/None rather than a list.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return 0, 0
    
def pageviews_sum(x):
    """Total the daily view counts of a pageviews mapping.

    Parameters
    ----------
    x : dict or any other value
        Mapping of day -> view count (counts may be ``None``), or a
        placeholder value such as ``0``.

    Returns
    -------
    The sum of all non-``None`` counts when ``x`` is a dict (or a dict
    subclass); otherwise ``x`` itself, unchanged.
    """
    if isinstance(x, dict):
        return sum(views for views in x.values() if views is not None)
    return x

In [3]:
# Load the grid cells and expose each cell's bounding box as plain columns so
# the API loops below can read the box edges directly.
grid = gpd.read_file('data/test500.geojson')
cell_bounds = grid['geometry'].bounds
grid[['left', 'bottom', 'right', 'top']] = cell_bounds
grid.head(6)

Unnamed: 0,id,geometry,left,bottom,right,top
0,5979,"POLYGON ((37.55814 55.80167, 37.56263 55.80167...",37.558143,55.799146,37.562635,55.801671
1,5980,"POLYGON ((37.55814 55.79915, 37.56263 55.79915...",37.558143,55.796622,37.562635,55.799146
2,5981,"POLYGON ((37.55814 55.79662, 37.56263 55.79662...",37.558143,55.794097,37.562635,55.796622
3,5982,"POLYGON ((37.55814 55.79410, 37.56263 55.79410...",37.558143,55.791572,37.562635,55.794097
4,5983,"POLYGON ((37.55814 55.79157, 37.56263 55.79157...",37.558143,55.789046,37.562635,55.791572
5,5984,"POLYGON ((37.55814 55.78905, 37.56263 55.78905...",37.558143,55.786521,37.562635,55.789046


__API: generator = geosearch__

In [None]:
# Query the ru.wikipedia "generator=geosearch" API once per grid cell and
# collect one row per page (coordinates, pageviews, summed pageviews, cell id)
# into `allData2`.
#
# NOTE(review): the API may answer in several batches (a 'continue' key in the
# response); continuation is not followed here, which may be why some pages
# come back without pageviews (they are counted as 0 below) - confirm.
allData2 = pd.DataFrame()
API_URL = 'https://ru.wikipedia.org/w/api.php?action=query&generator=geosearch&ggslimit=500&'

# Per-cell frames are gathered in a list and concatenated once at the end:
# DataFrame.append is removed in pandas 2.0 and re-copies the whole table on
# every iteration.
cell_frames = []

for i in tqdm_notebook(grid.index):
    # Read the id before entering the try block so that the failure message
    # below always names the cell that is actually being processed.
    cell_id = grid.loc[i, 'id']
    try:
        maxlon = grid.loc[i, 'right']
        maxlat = grid.loc[i, 'top']
        minlon = grid.loc[i, 'left']
        minlat = grid.loc[i, 'bottom']

        # The box is passed as top|left|bottom|right.
        params = {'ggsbbox': '{}|{}|{}|{}'.format(maxlat, minlon, minlat, maxlon),
                  'redirects': 1,
                  'prop': 'coordinates|pageviews|langlinkscount|info',
                  'format': 'json'}

        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(API_URL, params=params, timeout=30)
        response.raise_for_status()
        response_data = response.json()

        # A response without 'query' raises KeyError here; the cell is then
        # reported and skipped by the handler below.
        df_pages = pd.DataFrame.from_dict(response_data['query']['pages'], orient="index")

        df_pages[['lat', 'lon']] = df_pages.apply(get_coordinates, axis=1, result_type="expand")

        df_pages = df_pages.drop(labels='index', axis=1)
        # Pages without pageviews get 0 so that the sum below is always defined.
        df_pages['pageviews'] = np.where(df_pages['pageviews'].isnull(), 0, df_pages['pageviews'])
        df_pages['count_pageviews'] = df_pages['pageviews'].apply(pageviews_sum)

        df_pages['cell_id'] = cell_id
        cell_frames.append(df_pages)
    except (requests.RequestException, KeyError, ValueError) as err:
        # Best effort: report the failing cell together with the reason and
        # carry on with the remaining cells.
        print(cell_id, repr(err))

if cell_frames:
    allData2 = pd.concat(cell_frames)

In [7]:
# Number of page rows collected by the generator=geosearch loop.
allData2.shape[0]

3966

In [8]:
# Save the generator=geosearch table to a CSV in the working directory,
# without the DataFrame index.
allData2.to_csv('bbox_wiki_500_feb_test.csv', index = False)

__API: list=geosearch__

In [9]:
# Query the ru.wikipedia "list=geosearch" API once per grid cell and collect
# the returned entries, tagged with their cell id, into `allData`.
API_URL = 'https://ru.wikipedia.org/w/api.php?action=query'

allData = pd.DataFrame()

# Per-cell frames are gathered in a list and concatenated once at the end:
# DataFrame.append is removed in pandas 2.0 and re-copies the whole table on
# every iteration.
geosearch_frames = []

for i in tqdm_notebook(grid.index):
    try:
        maxlon = grid.loc[i, 'right']
        maxlat = grid.loc[i, 'top']
        minlon = grid.loc[i, 'left']
        minlat = grid.loc[i, 'bottom']
        cell_id = grid.loc[i, 'id']

        # The box is passed as top|left|bottom|right.
        # NOTE(review): the recorded output of this table holds only the
        # geosearch fields, so 'prop' and 'redirects' appear to have no effect
        # with list=geosearch - confirm before removing them.
        params = {'list': 'geosearch',
                  'gsbbox': '{}|{}|{}|{}'.format(maxlat, minlon, minlat, maxlon),
                  'gslimit': 500,
                  'redirects': 1,
                  'prop': 'coordinates|categories|pageviews|langlinkscount',
                  'format': 'json'}

        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(API_URL, params=params, timeout=30)
        response.raise_for_status()
        response_data = response.json()

        geodata = pd.DataFrame(response_data['query']['geosearch'])
        geodata['cell_id'] = cell_id
        geosearch_frames.append(geodata)
    except (requests.RequestException, KeyError, ValueError) as err:
        # Best effort: report the failing grid row together with the reason
        # and carry on with the remaining cells.
        print(i, repr(err))

if geosearch_frames:
    allData = pd.concat(geosearch_frames)

HBox(children=(IntProgress(value=0, max=924), HTML(value='')))

553



In [10]:
# Number of entries collected by the list=geosearch loop.
allData.shape[0]

3959

In [13]:
# Preview the first five list=geosearch rows.
allData.head(n=5)

Unnamed: 0,pageid,ns,title,lat,lon,dist,primary,cell_id
0,2947224.0,0.0,Улица Юннатов (Москва),55.800111,37.561086,54.7,,5979
1,2916119.0,0.0,Старый Петровско-Разумовский проезд,55.8,37.55972,61.8,,5979
2,2949219.0,0.0,Мишин проезд,55.79944,37.56056,108.3,,5979
0,2943281.0,0.0,Верхняя Масловка,55.794689,37.562283,139.9,,5981
0,4705216.0,0.0,Информзащита,55.7932,37.5595,68.9,,5982


In [12]:
# Save the list=geosearch table to a CSV in the working directory,
# without the DataFrame index.
allData.to_csv('wiki_list_geosearch_500_test.csv', index = False)

__Load data using SPARQL__

In [16]:
# Overall bounding box of the grid as (minx, miny, maxx, maxy).
# total_bounds reads the extent directly instead of first building the
# geometric union of every cell (unary_union), which is slow and deprecated
# in recent geopandas releases.
lonmin, latmin, lonmax, latmax = grid.total_bounds

In [31]:
# WKT literals for the two corners of the search box: cornerWest receives the
# (lonmin, latmin) corner and cornerEast the (lonmax, latmax) corner.
south_west = '"Point({} {})"^^geo:wktLiteral'.format(str(lonmin), str(latmin))
north_east = '"Point({} {})"^^geo:wktLiteral'.format(str(lonmax), str(latmax))

# Select every item with a coordinate (P625) inside the box, together with its
# Russian label and its sitelink count (?fame).
sparql_query = """
SELECT ?place ?placeLabel ?location ?fame WHERE {
  SERVICE wikibase:box {
    ?place wdt:P625 ?location .
    bd:serviceParam wikibase:cornerWest """ + south_west + """ .
    bd:serviceParam wikibase:cornerEast """ + north_east + """ .
  }
  ?place wikibase:sitelinks ?fame
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "ru" . 
  }
}
"""
res = return_sparql_query_results(sparql_query)

In [32]:
# One row per SPARQL binding; every cell starts out as a dict from which only
# the 'value' entry is kept.
res_df = pd.DataFrame.from_dict(res['results']['bindings'])

for binding_column in ('place', 'location', 'fame', 'placeLabel'):
    res_df[binding_column] = res_df[binding_column].apply(lambda binding: binding['value'])

res_df.head()

Unnamed: 0,place,location,fame,placeLabel
0,http://www.wikidata.org/entity/Q613676,Point(37.577222222 55.761944444),30,Московский зоопарк
1,http://www.wikidata.org/entity/Q645097,Point(37.577244 55.760205),22,Краснопресненская
2,http://www.wikidata.org/entity/Q657306,Point(37.571636 55.767161),21,Собор Непорочного Зачатия Пресвятой Девы Марии
3,http://www.wikidata.org/entity/Q844124,Point(37.573146 55.754935),30,Дом правительства РФ
4,http://www.wikidata.org/entity/Q932748,Point(37.5775 55.7675),6,Посольство Польши в Москве


In [33]:
# Number of entities returned by the SPARQL query.
res_df.shape[0]

3696

In [34]:
# Save the SPARQL result table to a CSV in the working directory,
# without the DataFrame index.
res_df.to_csv('wiki_sparql_test.csv', index = False)

__Get pageviews for wikidata entities__

In [110]:
# For every Wikidata entity in `res_df`: resolve its ru.wikipedia page title
# through wbgetentities, then fetch that page's id and pageviews from the
# ru.wikipedia API. Entities that cannot be resolved are reported and keep NaN
# in the three result columns.
page_url = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=sitelinks'
title_url = 'https://ru.wikipedia.org/w/api.php?action=query&'

# Create the result columns up front. 'pageviews' must be an object column so
# that a whole dict can be stored in a single cell.
res_df['pageid'] = np.nan
res_df['pageviews'] = pd.Series(np.nan, index=res_df.index, dtype=object)
res_df['count_pageviews'] = np.nan

# One session reuses the HTTP connections across the two requests per entity.
with requests.Session() as session:
    for i in tqdm_notebook(res_df.index):
        try:
            # The entity id is the last path segment of the entity URL.
            wiki_id = res_df.loc[i, 'place'].split('/')[-1]
            params = {'ids': wiki_id,
                      'sitefilter': 'ruwiki'}
            response = session.get(page_url, params=params, timeout=30)
            response.raise_for_status()
            response_data = response.json()

            # KeyError here means the entity has no ruwiki sitelink.
            page_title = response_data['entities'][wiki_id]['sitelinks']['ruwiki']['title']

            params = {'titles': page_title,
                      'format': 'json',
                      'prop': 'categories|pageviews|langlinkscount'}
            response = session.get(title_url, params=params, timeout=30)
            response.raise_for_status()
            response_data = response.json()

            # A single title was requested, so only the first page is read.
            page = next(iter(response_data['query']['pages'].values()))
            page_id = int(page['pageid'])
            pageviews = page.get('pageviews')
            if pageviews is None:
                # Same convention as the geosearch cell: no pageviews -> 0.
                pageviews = 0
            view_total = int(pageviews_sum(pageviews))

            # Write the row only once every value is known, so that a failure
            # never leaves a half-filled row behind.
            res_df.at[i, 'pageid'] = page_id
            res_df.at[i, 'pageviews'] = pageviews
            res_df.at[i, 'count_pageviews'] = view_total
        except (requests.RequestException, KeyError, ValueError, StopIteration) as err:
            # Best effort: report the failing row together with the reason and
            # carry on with the remaining entities.
            print(i, repr(err))

HBox(children=(IntProgress(value=0, max=3696), HTML(value='')))

83
86
87
89
90
92
93
95
152
176
177
187
190
191
192
193
194
195
196
197
198
199
272
275
285
286
289
291
295
310
369
376
381
382
383
384
385
396
433
453
480
482
485
487
488
489
490
491
492
493
494
495
496
497
498
499
575
576
577
578
579
581
583
595
669
683
686
687
694
695
696
697
698
699
725
764
775
776
777
779
782
783
787
790
796
797
798
799
873
892
893
896
897
898
899
961
962
974
975
976
977
993
994
995
996
997
998
999
1090
1189
1190
1191
1192
1193
1194
1195
1196
1197
1279
1280
1281
1286
1287
1293
1295
1386
1387
1390
1481
1482
1483
1485
1486
1491
1492
1493
1494
1495
1496
1497
1498
1499
1565
1573
1575
1577
1586
1593
1594
1595
1596
1597
1598
1660
1683
1689
1690
1691
1692
1693
1694
1695
1698
1699
1775
1786
1788
1789
1798
1799
1827
1857
1888
1891
1892
1893
1894
1895
1896
1897
1898
1899
1912
1975
1976
1981
1987
1988
1989
1990
1996
1997
1999
2059
2060
2061
2071
2072
2073
2074
2075
2076
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2175
2177
2178
2189
2

__Combine all wiki sources__

In [46]:
# Cast pageid to int so that it can serve as a join key in the merge below.
allData = allData.astype({'pageid': int})

In [105]:
# Left-join the generator=geosearch pages with the list=geosearch entries.
# After the merge the `_x` columns come from allData2 and the `_y` columns
# from allData.
combine_geosearch = pd.merge(allData2, allData, on = ['pageid', 'title'], how = 'left')

# Per coordinate: take the allData value only where the allData2 value is 0
# and the allData value is not 0; otherwise keep the allData2 value.
for coord in ('lon', 'lat'):
    primary = combine_geosearch[coord + '_x']
    fallback = combine_geosearch[coord + '_y']
    use_fallback = (primary == 0) & (fallback != 0)
    combine_geosearch[coord] = np.where(use_fallback, fallback, primary)

cols = ['pageid', 'title', 'lon', 'lat', 'pageviews', 'count_pageviews']
combine_geosearch = combine_geosearch[cols]

combine_geosearch.head()

Unnamed: 0,pageid,title,lon,lat,pageviews,count_pageviews
0,2916119,Старый Петровско-Разумовский проезд,37.55972,55.8,"{'2020-12-23': 5, '2020-12-24': 1, '2020-12-25...",91
1,2947224,Улица Юннатов (Москва),37.561086,55.800111,"{'2020-12-23': 1, '2020-12-24': 4, '2020-12-25...",130
2,2949219,Мишин проезд,37.56056,55.79944,"{'2020-12-23': 0, '2020-12-24': 0, '2020-12-25...",29
3,2943281,Верхняя Масловка,37.562283,55.794689,"{'2020-12-23': 6, '2020-12-24': 7, '2020-12-25...",409
4,4705216,Информзащита,37.5595,55.7932,"{'2020-12-23': 12, '2020-12-24': 11, '2020-12-...",413


In [117]:
# Bring the SPARQL table to the same column layout as `combine_geosearch`.
# The steps are chained so that every one returns a fresh frame; assigning a
# column on the direct result of dropna() can raise SettingWithCopyWarning.
res_df = (
    res_df
    .dropna(subset=['pageid', 'placeLabel'])   # keep rows with a resolved page
    .astype({'pageid': int})
    .rename(columns={'placeLabel': 'title'})
)

# Parse each WKT point once and read both coordinates from the parsed object.
points = res_df['location'].apply(loads)
res_df['lon'] = points.apply(lambda point: point.x)
res_df['lat'] = points.apply(lambda point: point.y)

res_df = res_df[cols]

In [138]:
# Stack the geosearch pages and the SPARQL pages into a single table.
# pd.concat replaces DataFrame.append, which is removed in pandas 2.0.
allWiki = pd.concat([combine_geosearch, res_df])

# NOTE(review): titles in `res_df` come from the SPARQL label (placeLabel)
# while geosearch titles come from the page itself, so one pageid may survive
# this de-duplication under two different titles - confirm whether `pageid`
# alone should be the key.
allWiki = allWiki.drop_duplicates(subset = ['pageid', 'title'])

len(allWiki)

In [145]:
# Write the combined table to a CSV stamped with today's date (YYYYMMDD).
run_stamp = date.today().strftime("%Y%m%d")
allWiki.to_csv('data/wikipedia_geodata_{}_test.csv'.format(run_stamp), index=False)