In [1]:
import wikipedia
import pandas as pd
import geopandas as gpd
from shapely.wkt import loads
import requests
import json
import numpy as np
from tqdm import tqdm_notebook
from qwikidata.sparql import (get_subclasses_of_item,
                              return_sparql_query_results)
from datetime import date

In [2]:
def get_coordinates(x):
    """Return the ``(lat, lon)`` of the first coordinate entry of a page record.

    Parameters
    ----------
    x : mapping-like
        Record that supports ``x['coordinates']``. The value is expected to be
        a sequence whose first element has ``'lat'`` and ``'lon'`` keys.

    Returns
    -------
    tuple
        ``(lat, lon)`` taken from the first coordinate entry, or ``(0, 0)``
        when the record carries no usable coordinates.
    """
    try:
        first = x['coordinates'][0]
        return first['lat'], first['lon']
    except (KeyError, IndexError, TypeError):
        # Missing key, empty list, or a non-subscriptable placeholder such as NaN.
        # Everything else (KeyboardInterrupt included) is left to propagate so a
        # long-running loop that calls this helper can still be interrupted.
        return 0, 0
    
def pageviews_sum(x):
    """Sum the non-``None`` values of a pageviews mapping.

    Parameters
    ----------
    x : dict or any
        Mapping whose values are view counts (``None`` entries are skipped),
        or any other value.

    Returns
    -------
    The sum of the non-``None`` values when ``x`` is a dict (including dict
    subclasses); otherwise ``x`` itself, unchanged.
    """
    if isinstance(x, dict):
        return sum(views for views in x.values() if views is not None)
    return x

__Load the city grid used to query the data__

In [3]:
# Read the grid and store each cell's bounding box in explicit columns.
# `.bounds` yields (minx, miny, maxx, maxy) per geometry, which is assigned to
# left / bottom / right / top in that order.
grid = gpd.read_file('data/grid_500.geojson')
bbox_columns = ['left', 'bottom', 'right', 'top']
cell_bounds = grid['geometry'].bounds
grid[bbox_columns] = cell_bounds
grid.head(n=6)

Unnamed: 0,id,left,top,right,bottom,geometry
0,1,56.180876,58.035073,56.189444,58.030529,"POLYGON ((56.18088 58.03502, 56.18934 58.03507..."
1,2,56.180979,58.030583,56.189546,58.026038,"POLYGON ((56.18098 58.03053, 56.18944 58.03058..."
2,3,56.181081,58.026092,56.189647,58.021548,"POLYGON ((56.18108 58.02604, 56.18955 58.02609..."
3,4,56.181184,58.021602,56.189749,58.017057,"POLYGON ((56.18118 58.02155, 56.18965 58.02160..."
4,5,56.181287,58.017111,56.18985,58.012567,"POLYGON ((56.18129 58.01706, 56.18975 58.01711..."
5,6,56.181389,58.012621,56.189952,58.008076,"POLYGON ((56.18139 58.01257, 56.18985 58.01262..."


In [5]:
# Keep only the rows whose index label lies in 200..800 (`.loc` slicing is
# label-based and includes both endpoints).
# NOTE(review): this cell carries execution count 5, the same as the geosearch
# cell further down, and the recorded outputs (210 cells, cell ids starting at
# 1) look like they were produced without this filter. Confirm it is still
# wanted before a Restart & Run All.
grid = grid.loc[200:800]

In [4]:
# Number of grid cells that will be queried.
len(grid.index)

210

__API: generator = geosearch__

In [5]:
# Query the MediaWiki `generator=geosearch` endpoint once per grid cell and
# collect page metadata (coordinates, pageviews, language-link count, info).
API_URL = 'https://ru.wikipedia.org/w/api.php?action=query&generator=geosearch&ggslimit=500&'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the loop forever

# One DataFrame per successfully processed cell. They are concatenated once at
# the end instead of growing a frame inside the loop (quadratic, and
# `DataFrame.append` no longer exists in pandas >= 2.0).
cell_frames = []

for i in tqdm_notebook(grid.index):
    cell_id = grid.loc[i, 'id']
    maxlon = grid.loc[i, 'right']
    maxlat = grid.loc[i, 'top']
    minlon = grid.loc[i, 'left']
    minlat = grid.loc[i, 'bottom']

    # ggsbbox order: top | left | bottom | right
    params = {'ggsbbox': '{}|{}|{}|{}'.format(maxlat, minlon, minlat, maxlon),
              'redirects': 1,
              'prop': 'coordinates|pageviews|langlinkscount|info',
              'format': 'json'}

    try:
        response = requests.get(API_URL, params=params, timeout=REQUEST_TIMEOUT)
        response_data = response.json()

        # A response without 'query' / 'pages' (presumably a cell with no
        # geotagged articles) raises KeyError and is reported below.
        df_pages = pd.DataFrame.from_dict(response_data['query']['pages'], orient="index")

        df_pages[['lat', 'lon']] = df_pages.apply(get_coordinates, axis=1, result_type="expand")

        df_pages = df_pages.drop(labels='index', axis=1)
        df_pages['pageviews'] = np.where(df_pages['pageviews'].isnull(), 0, df_pages['pageviews'])
        df_pages['count_pageviews'] = df_pages['pageviews'].apply(pageviews_sum)

        df_pages['cell_id'] = cell_id
        cell_frames.append(df_pages)
    except (requests.RequestException, ValueError, KeyError):
        # Network failure, undecodable body, or a missing key/column.
        # Print the id of the skipped cell; KeyboardInterrupt is no longer
        # swallowed, so the loop can be stopped from the notebook.
        print(cell_id)

allData2 = pd.concat(cell_frames) if cell_frames else pd.DataFrame()

HBox(children=(IntProgress(value=0, max=210), HTML(value='')))

1
2
3
4
5
8
10
11
14
15
16
17
18
23
24
25
26
28
29
30
31
32
33
36
37
39
41
43
44
45
46
48
49
50
51
52
53
54
56
58
59
61
64
66
67
68
69
70
71
72
73
74
79
80
81
82
83
84
85
86
87
88
93
94
95
96
97
98
99
100
101
107
109
110
111
112
113
114
115
121
123
127
128
130
133
135
136
137
138
139
140
141
142
143
147
148
149
150
151
152
153
154
155
156
157
159
161
162
163
164
165
166
167
169
170
172
176
177
178
179
180
181
183
184
186
187
188
190
191
192
193
194
195
196
198
199
200
201
202
204
205
207
208
209
210



In [6]:
# Number of pages returned by the generator=geosearch pass.
len(allData2.index)

197

In [7]:
# Save the data obtained from the `generator=geosearch` API.
allData2.to_csv('bbox_wiki_perm.csv', index=False)

__API: list=geosearch__

In [8]:
# Query the MediaWiki `list=geosearch` endpoint once per grid cell.
API_URL = 'https://ru.wikipedia.org/w/api.php?action=query'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the loop forever

# One DataFrame per cell, concatenated once after the loop instead of calling
# `DataFrame.append` per iteration (quadratic, and removed in pandas >= 2.0).
# Empty per-cell frames are kept so the resulting column order matches the
# incremental append.
geosearch_frames = []

for i in tqdm_notebook(grid.index):
    maxlon = grid.loc[i, 'right']
    maxlat = grid.loc[i, 'top']
    minlon = grid.loc[i, 'left']
    minlat = grid.loc[i, 'bottom']
    cell_id = grid.loc[i, 'id']

    # gsbbox order: top | left | bottom | right
    params = {'list': 'geosearch',
              'gsbbox': '{}|{}|{}|{}'.format(maxlat, minlon, minlat, maxlon),
              'gslimit': 500,
              'redirects': 1,
              'prop': 'coordinates|categories|pageviews|langlinkscount',
              'format': 'json'}

    try:
        response = requests.get(API_URL, params=params, timeout=REQUEST_TIMEOUT)
        response_data = response.json()
        geodata = pd.DataFrame(response_data['query']['geosearch'])
        geodata['cell_id'] = cell_id
        geosearch_frames.append(geodata)
    except (requests.RequestException, ValueError, KeyError):
        # Network failure, undecodable body, or an unexpected response shape.
        # Print the grid index of the skipped cell; KeyboardInterrupt is no
        # longer swallowed, so the loop can be stopped from the notebook.
        print(i)

allData = pd.concat(geosearch_frames) if geosearch_frames else pd.DataFrame()

HBox(children=(IntProgress(value=0, max=210), HTML(value='')))




In [9]:
# Number of pages returned by the list=geosearch pass.
len(allData.index)

197

In [10]:
# Preview the first five rows of the list=geosearch result.
allData.head(5)

Unnamed: 0,cell_id,pageid,ns,title,lat,lon,dist,primary
0,6,2145318.0,0.0,Улица Дзержинского (Пермь),58.00889,56.18333,212.8,
0,7,293990.0,0.0,Пермский государственный университет,58.0075,56.18667,190.1,
1,7,8615761.0,0.0,Здание бывшего ночлежного дома имени Е. И. Меш...,58.007,56.1886,209.5,
2,7,5457807.0,0.0,Естественнонаучный институт Пермского государс...,58.008,56.1888,297.6,
0,9,294140.0,0.0,Школа № 146 (Пермь),57.9975,56.1838,145.8,


In [11]:
# Save the `list=geosearch` result without the (non-unique) row index.
allData.to_csv('wiki_list_geosearch_500_perm.csv', index=False)

__Load data using SPARQL__

In [12]:
# Bounding box of the whole grid: (min lon, min lat, max lon, max lat).
# `total_bounds` gives the same numbers as `unary_union.bounds` without first
# merging every cell into a single geometry, and `unary_union` is deprecated in
# recent GeoPandas releases. The values are cast to built-in floats so that
# `str()` formatting in the SPARQL query stays the same.
lonmin, latmin, lonmax, latmax = (float(value) for value in grid.total_bounds)

In [38]:
# Wikidata box search: every item with a coordinate (P625) inside the grid's
# bounding box, together with its sitelink count (?fame) and its Russian label.
# The four %s slots are the west corner (lonmin latmin) and east corner
# (lonmax latmax) as WKT points.
sparql_template = """
SELECT ?place ?placeLabel ?location ?fame WHERE {
  SERVICE wikibase:box {
    ?place wdt:P625 ?location .
    bd:serviceParam wikibase:cornerWest "Point(%s %s)"^^geo:wktLiteral .
    bd:serviceParam wikibase:cornerEast "Point(%s %s)"^^geo:wktLiteral .
  }
  ?place wikibase:sitelinks ?fame
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "ru" . 
  }
}
"""
sparql_query = sparql_template % (lonmin, latmin, lonmax, latmax)
res = return_sparql_query_results(sparql_query)

In [39]:
# Extract the needed fields from the SPARQL response: every binding cell is a
# dict, and only its 'value' entry is kept.
res_df = pd.DataFrame.from_dict(res['results']['bindings'])

for field in ('place', 'location', 'fame', 'placeLabel'):
    res_df[field] = res_df[field].apply(lambda binding: binding['value'])

res_df.head()

Unnamed: 0,place,location,fame,placeLabel
0,http://www.wikidata.org/entity/Q915,Point(56.248888888 58.013888888),125,Пермь
1,http://www.wikidata.org/entity/Q115035,Point(56.246052 58.015975),10,Пермский академический театр оперы и балета им...
2,http://www.wikidata.org/entity/Q846866,Point(56.248888888 58.013888888),23,Пермская область
3,http://www.wikidata.org/entity/Q1076316,Point(56.2694 58.0168),5,Стикс
4,http://www.wikidata.org/entity/Q1124098,Point(56.248888888 58.013888888),26,Пермская губерния


In [40]:
# Number of Wikidata entities found inside the bounding box.
len(res_df.index)

184

In [41]:
# Save the flattened SPARQL result without the row index.
res_df.to_csv('wiki_sparql_test_perm.csv', index=False)

__Get pageviews for wikidata entities__

In [42]:
# For every Wikidata entity: resolve its Russian Wikipedia article through the
# entity's sitelinks, then fetch that article's page id and pageviews.
page_url = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=sitelinks'
title_url = 'https://ru.wikipedia.org/w/api.php?action=query&'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the loop forever

# Results are gathered per row label and written to `res_df` in one go after
# the loop, so a row is either filled completely or left as NaN (no half-written
# rows when a later lookup fails).
page_ids, page_views, view_counts = {}, {}, {}

for i in tqdm_notebook(res_df.index):
    try:
        wiki_id = res_df.loc[i, 'place'].split('/')[-1]
        params = {'ids': wiki_id,
                  'sitefilter': 'ruwiki'}
        response = requests.get(page_url, params=params, timeout=REQUEST_TIMEOUT)
        response_data = response.json()

        page_title = response_data['entities'][wiki_id]['sitelinks']['ruwiki']['title']

        params = {'titles': page_title,
                  'format': 'json',
                  'prop': 'categories|pageviews|langlinkscount'}
        response = requests.get(title_url, params=params, timeout=REQUEST_TIMEOUT)
        response_data = response.json()

        # Exactly one page record is expected for a single title; the tuple
        # unpacking raises ValueError otherwise.
        (page,) = response_data['query']['pages'].values()
        pageid = int(page['pageid'])
        pageviews = page['pageviews']
        count_pageviews = int(pageviews_sum(pageviews))
    except (requests.RequestException, KeyError, TypeError, ValueError):
        # Network failure, undecodable body, entity without a ruwiki sitelink
        # (missing key or a non-dict container), missing page or pageviews.
        # Print the row label and move on; KeyboardInterrupt is not swallowed.
        print(i)
        continue

    page_ids[i] = pageid
    page_views[i] = pageviews
    view_counts[i] = count_pageviews

res_df['pageid'] = [page_ids.get(i, np.nan) for i in res_df.index]
res_df['pageviews'] = [page_views.get(i, np.nan) for i in res_df.index]
res_df['count_pageviews'] = [view_counts.get(i, np.nan) for i in res_df.index]

HBox(children=(IntProgress(value=0, max=184), HTML(value='')))

54
69
70
71
73
75
76
77
78
79
163
164
165
166
168
169
170
171
172
173
174
175
176
177
178
179
180
183



__Combine all wiki sources__

In [43]:
# Cast pageid to int so it can be matched against allData2's pageid in the merge.
allData = allData.astype({'pageid': int})

In [44]:
# Columns available from the generator=geosearch export (left side of the merge below).
allData2.columns

Index(['pageid', 'ns', 'title', 'coordinates', 'pageviews', 'contentmodel',
       'pagelanguage', 'pagelanguagehtmlcode', 'pagelanguagedir', 'touched',
       'lastrevid', 'length', 'lat', 'lon', 'count_pageviews', 'cell_id',
       'langlinkscount'],
      dtype='object')

In [45]:
# Columns available from the list=geosearch export (right side of the merge below).
allData.columns

Index(['cell_id', 'pageid', 'ns', 'title', 'lat', 'lon', 'dist', 'primary'], dtype='object')

In [46]:
# Merge the first two exports: generator=geosearch (left) and list=geosearch (right).
combine_geosearch = allData2.merge(allData, on=['pageid', 'title'], how='left')

# Keep a single coordinate pair in `lon` / `lat`: take the list=geosearch value
# (`*_y`) when the generator value (`*_x`) is 0 and the list value is not,
# otherwise keep the generator value.
for axis in ('lon', 'lat'):
    primary = combine_geosearch[axis + '_x']
    fallback = combine_geosearch[axis + '_y']
    use_fallback = (primary == 0) & (fallback != 0)
    combine_geosearch[axis] = np.where(use_fallback, fallback, primary)

cols = ['pageid', 'title', 'lon', 'lat', 'pageviews', 'count_pageviews']
combine_geosearch = combine_geosearch[cols]

combine_geosearch.head()

Unnamed: 0,pageid,title,lon,lat,pageviews,count_pageviews
0,2145318,Улица Дзержинского (Пермь),56.18333,58.00889,"{'2021-02-20': 2, '2021-02-21': 2, '2021-02-22...",129
1,293990,Пермский государственный университет,56.18667,58.0075,"{'2021-02-20': 63, '2021-02-21': 64, '2021-02-...",4450
2,5457807,Естественнонаучный институт Пермского государс...,56.1888,58.008,"{'2021-02-20': 1, '2021-02-21': 1, '2021-02-22...",115
3,8615761,Здание бывшего ночлежного дома имени Е. И. Меш...,56.1886,58.007,"{'2021-02-20': 4, '2021-02-21': 1, '2021-02-22...",74
4,294140,Школа № 146 (Пермь),56.1838,57.9975,"{'2021-02-20': 14, '2021-02-21': 6, '2021-02-2...",706


In [47]:
# Prepare the SPARQL frame for stacking with combine_geosearch:
#  - drop rows without a pageid or a label,
#  - cast pageid to int,
#  - rename placeLabel to title, because the geosearch frame stores the place
#    name in `title`.
res_df = (
    res_df
    .dropna(subset=['pageid', 'placeLabel'])
    .astype({'pageid': int})
    .rename(columns={'placeLabel': 'title'})
)

# Parse the WKT point of every row and split it into lon (x) and lat (y).
points = res_df['location'].apply(loads)
res_df = res_df.assign(
    lon=points.apply(lambda point: point.x),
    lat=points.apply(lambda point: point.y),
)

res_df = res_df[cols]

In [62]:
# Stack the geosearch and SPARQL exports into one frame with a fresh 0..n-1 index.
# `pd.concat` is used instead of `DataFrame.append`, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
allWiki = pd.concat([combine_geosearch, res_df], ignore_index=True)

# Remove the duplicates that appear per pageid: for every page keep the row
# with the largest count_pageviews.
allWiki = allWiki.loc[allWiki.groupby(['pageid']).count_pageviews.idxmax()]

len(allWiki)

201

In [63]:
# Preview the first five rows of the combined, de-duplicated result.
allWiki.head(5)

Unnamed: 0,pageid,title,lon,lat,pageviews,count_pageviews
203,1105,Пермь,56.248889,58.013889,"{'2021-02-20': 1331, '2021-02-21': 1297, '2021...",90242.0
205,13794,Пермская область,56.248889,58.013889,"{'2021-02-20': 72, '2021-02-21': 65, '2021-02-...",5030.0
207,67181,Пермская губерния,56.248889,58.013889,"{'2021-02-20': 65, '2021-02-21': 68, '2021-02-...",5022.0
229,165250,Пермский педагогический университет,56.255,58.009,"{'2021-02-20': 4, '2021-02-21': 3, '2021-02-22...",594.0
199,165559,Мотовилихинские заводы,56.3,58.035,"{'2021-02-20': 54, '2021-02-21': 54, '2021-02-...",3989.0


In [64]:
# Write the combined data set to a file stamped with today's date (YYYYMMDD).
export_stamp = date.today().strftime("%Y%m%d")
allWiki.to_csv('data/wikipedia_geodata_{}_test.csv'.format(export_stamp), index=False)