#### URLs with Moscow neighbourhoods data

In [None]:
# Source pages used by the rest of the notebook:
#   - mosopen.ru: index of neighbourhoods (names and per-neighbourhood page links)
#   - Russian Wikipedia: table with area, population and density
mosopen_url = "http://mosopen.ru/regions"
wikipedia_url = "https://ru.wikipedia.org/wiki/Районы_и_поселения_Москвы"

In [2]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page
import pandas as pd

#### Scraping neighbourhoods locations

In [3]:
# Download the neighbourhood index page. Fail loudly on HTTP errors instead of
# silently parsing an error page.
response = requests.get(mosopen_url, timeout=30)  # seconds; avoids hanging forever
response.raise_for_status()
mosopen_data = response.text

soup = BeautifulSoup(mosopen_data, 'html5lib')
table = soup.find('table', {'class': 'regions_list'})
if table is None:
    raise ValueError("No <table class='regions_list'> found at {}".format(mosopen_url))

columns = ['Name', 'Latitude', 'Longitude', 'Area', 'Population', 'Density', 'URL', 'Borders_URL']

# One record per link in the table: link text is the neighbourhood name, href is
# its page. Records are collected first and the frame is built once, because
# DataFrame.append was removed in pandas 2.0 and copied the frame on every row.
records = [{'Name': a.get_text(), 'URL': a.get('href')} for a in table.find_all('a')]

# dtype=object keeps the not-yet-filled columns generic, so later cells can
# store strings and numbers in them without dtype conflicts.
df = pd.DataFrame(records, columns=columns, dtype=object)

df

Unnamed: 0,Name,Latitude,Longitude,Area,Population,Density,URL,Borders_URL
0,Академический,,,,,,http://mosopen.ru/region/akademicheskij,
1,Алексеевский,,,,,,http://mosopen.ru/region/alekseevskij,
2,Алтуфьевский,,,,,,http://mosopen.ru/region/altufevskij,
3,Арбат,,,,,,http://mosopen.ru/region/arbat,
4,Аэропорт,,,,,,http://mosopen.ru/region/aeroport,
...,...,...,...,...,...,...,...,...
120,Щукино,,,,,,http://mosopen.ru/region/shchukino,
121,Южнопортовый,,,,,,http://mosopen.ru/region/yuzhnoportovyj,
122,Якиманка,,,,,,http://mosopen.ru/region/yakimanka,
123,Ярославский,,,,,,http://mosopen.ru/region/yaroslavskij,


#### For each location go to URL and scrape neighbourhood coordinates

In [4]:
import re

In [5]:

# Both values are embedded in inline JavaScript on each neighbourhood page.
# The patterns are loop-invariant, so they are compiled once.
center_pattern = re.compile(r'var center = new YMaps.GeoPoint\((.*),(.*)\);')
borders_pattern = re.compile(r'var ml = new YMaps.YMapsML\("(.*)"\);')


def _search_scripts(soup, pattern, url):
    """Return the regex match of `pattern` in the first <script> whose text matches it.

    Raises ValueError (naming `url`) when no <script> matches, instead of the
    AttributeError that dereferencing a missing tag would produce.
    """
    script = soup.find('script', string=pattern)
    if script is None:
        raise ValueError('Pattern {!r} not found in any <script> of {}'.format(pattern.pattern, url))
    return pattern.search(script.text)


# Iterate with the row label so each row is updated by index. Matching on the
# name would overwrite every row that shares the same name.
for idx, name, url in list(df[['Name', 'URL']].itertuples()):
    print('Processing {} neighbourhood ...'.format(name))
    response = requests.get(url, timeout=30)  # seconds; avoids hanging forever
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html5lib')

    # The first captured GeoPoint argument is stored as longitude, the second as latitude.
    center = _search_scripts(soup, center_pattern, url)
    df.at[idx, 'Longitude'] = center.group(1)
    df.at[idx, 'Latitude'] = center.group(2)

    # URL of the YMapsML document that the next section downloads for the borders.
    df.at[idx, 'Borders_URL'] = _search_scripts(soup, borders_pattern, url).group(1)

df

Processing Академический neighbourhood ...
Processing Алексеевский neighbourhood ...
Processing Алтуфьевский neighbourhood ...
Processing Арбат neighbourhood ...
Processing Аэропорт neighbourhood ...
Processing Бабушкинский neighbourhood ...
Processing Басманный neighbourhood ...
Processing Беговой neighbourhood ...
Processing Бескудниковский neighbourhood ...
Processing Бибирево neighbourhood ...
Processing Бирюлёво Восточное neighbourhood ...
Processing Бирюлёво Западное neighbourhood ...
Processing Богородское neighbourhood ...
Processing Братеево neighbourhood ...
Processing Бутово Северное neighbourhood ...
Processing Бутово Южное neighbourhood ...
Processing Бутырский neighbourhood ...
Processing Вешняки neighbourhood ...
Processing Внуково neighbourhood ...
Processing Войковский neighbourhood ...
Processing Восточный neighbourhood ...
Processing Выхино-Жулебино neighbourhood ...
Processing Гагаринский neighbourhood ...
Processing Головинский neighbourhood ...
Processing Гольянов

Unnamed: 0,Name,Latitude,Longitude,Area,Population,Density,URL,Borders_URL
0,Академический,55.685,37.578,,,,http://mosopen.ru/region/akademicheskij,http://mosopen.ru/public/ymapsml.php?p=region/...
1,Алексеевский,55.811314,37.648773,,,,http://mosopen.ru/region/alekseevskij,http://mosopen.ru/public/ymapsml.php?p=region/...
2,Алтуфьевский,55.8783,37.584,,,,http://mosopen.ru/region/altufevskij,http://mosopen.ru/public/ymapsml.php?p=region/...
3,Арбат,55.75092,37.592189,,,,http://mosopen.ru/region/arbat,http://mosopen.ru/public/ymapsml.php?p=region/...
4,Аэропорт,55.801,37.54,,,,http://mosopen.ru/region/aeroport,http://mosopen.ru/public/ymapsml.php?p=region/...
...,...,...,...,...,...,...,...,...
120,Щукино,55.801,37.471,,,,http://mosopen.ru/region/shchukino,http://mosopen.ru/public/ymapsml.php?p=region/...
121,Южнопортовый,55.715,37.675,,,,http://mosopen.ru/region/yuzhnoportovyj,http://mosopen.ru/public/ymapsml.php?p=region/...
122,Якиманка,55.730832,37.61075,,,,http://mosopen.ru/region/yakimanka,http://mosopen.ru/public/ymapsml.php?p=region/...
123,Ярославский,55.86,37.6925,,,,http://mosopen.ru/region/yaroslavskij,http://mosopen.ru/public/ymapsml.php?p=region/...


#### For each neighbourhood, scrape its borders and save them to a corresponding CSV file

In [7]:
import lxml

In [8]:
# For every neighbourhood, download its borders document and write the border
# points to '<Name> borders.csv' with a 'Latitude,Longitude' header.
for name, url in list(df[['Name', 'Borders_URL']].itertuples(index=False)):
    print('Processing {} neighbourhood borders ...'.format(name))
    response = requests.get(url, timeout=30)  # seconds; avoids hanging forever
    # Without this check an HTTP error page would silently yield a header-only CSV.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Explicit encoding so the output does not depend on the platform default.
    with open(name + ' borders.csv', 'w', encoding='utf-8') as f:
        f.write('Latitude,Longitude\n')

        for row in soup.find_all('gml:pos'):
            # Each position is read as "<longitude> <latitude>". split() tolerates
            # surrounding or repeated whitespace, and [:2] ignores any extra component.
            longitude, latitude = row.text.split()[:2]
            f.write('{},{}\n'.format(latitude, longitude))


Processing Академический neighbourhood borders ...
Processing Алексеевский neighbourhood borders ...
Processing Алтуфьевский neighbourhood borders ...
Processing Арбат neighbourhood borders ...
Processing Аэропорт neighbourhood borders ...
Processing Бабушкинский neighbourhood borders ...
Processing Басманный neighbourhood borders ...
Processing Беговой neighbourhood borders ...
Processing Бескудниковский neighbourhood borders ...
Processing Бибирево neighbourhood borders ...
Processing Бирюлёво Восточное neighbourhood borders ...
Processing Бирюлёво Западное neighbourhood borders ...
Processing Богородское neighbourhood borders ...
Processing Братеево neighbourhood borders ...
Processing Бутово Северное neighbourhood borders ...
Processing Бутово Южное neighbourhood borders ...
Processing Бутырский neighbourhood borders ...
Processing Вешняки neighbourhood borders ...
Processing Внуково neighbourhood borders ...
Processing Войковский neighbourhood borders ...
Processing Восточный neig

In [18]:
# Spelling corrections applied to the 'Name' column: current name -> replacement.
# NOTE(review): presumably these align the names with the spelling used on the
# Wikipedia page that is matched by name in the next section — confirm.
name_fixes = {
    'Хорошёво-Мневники': 'Хорошёво-Мнёвники',
    'Марьина роща': 'Марьина Роща',
    'Нагатинский затон': 'Нагатинский Затон',
    'Соколиная гора': 'Соколиная Гора',
    'Филёвский парк': 'Филёвский Парк',
}

for old_name, new_name in name_fixes.items():
    df.loc[df['Name'] == old_name, 'Name'] = new_name

#### For each neighbourhood, scrape area, population and density from Wikipedia

In [19]:
wikipedia_data = requests.get(wikipedia_url).text
soup = BeautifulSoup(wikipedia_data, 'html5lib')

# Only the first <table> on the page is read.
table = soup.find('table')

# Leading words that are moved behind the second word of the name
# (e.g. 'Восточное Дегунино' -> 'Дегунино Восточное').
direction_words = ('Северное', 'Южное', 'Восточное', 'Западное')

for row in table.find_all('tr'):
    cells = row.find_all('td')
    if not cells:
        # Rows without any <td> cell carry no data to read.
        continue

    name = cells[3].getText().strip()
    words = name.split(' ')
    if len(words) > 1 and words[0] in direction_words:
        name = words[1] + ' ' + words[0]

    # Area: spaces removed, decimal comma turned into a decimal point.
    area = float(cells[6].getText().replace(' ', '').replace(',', '.'))

    # Population: spaces and non-breaking spaces removed, then the first
    # character is dropped before the integer conversion.
    # NOTE(review): presumably that character is a trend marker — confirm.
    population_text = cells[7].getText().replace(' ', '').replace(u'\xa0', '')
    population = int(population_text[1:])

    density = float(cells[8].getText().replace(' ', ''))

    print('[{}] [{}] [{}] [{}]'.format(name, area, population, density))

    # Fill the statistics for every row whose name equals the table's name.
    same_name = df['Name'] == name
    df.loc[same_name, 'Area'] = area
    df.loc[same_name, 'Population'] = population
    df.loc[same_name, 'Density'] = density

df

[Академический] [5.83] [110459] [18946.66]
[Алексеевский] [5.29] [80449] [15207.75]
[Алтуфьевский] [3.25] [57601] [17723.38]
[Арбат] [2.11] [36107] [17112.32]
[Аэропорт] [4.58] [79413] [17339.08]
[Бабушкинский] [5.07] [88607] [17476.73]
[Басманный] [8.37] [110329] [13181.48]
[Беговой] [5.56] [42634] [7667.99]
[Бескудниковский] [3.3] [79607] [24123.33]
[Бибирево] [6.45] [160248] [24844.65]
[Бирюлёво Восточное] [14.77] [155196] [10507.52]
[Бирюлёво Западное] [8.51] [88573] [10408.11]
[Богородское] [10.24] [110044] [10746.48]
[Братеево] [7.63] [109732] [14381.65]
[Бутырский] [5.04] [71721] [14230.36]
[Вешняки] [10.72] [121972] [11377.99]
[Внуково] [17.42] [25764] [1478.99]
[Войковский] [6.61] [70623] [10684.27]
[Дегунино Восточное] [3.77] [98923] [26239.52]
[Измайлово Восточное] [3.85] [77931] [20241.82]
[Восточный] [3.14] [13698] [4362.42]
[Выхино-Жулебино] [14.97] [224783] [15015.56]
[Гагаринский] [5.5] [81739] [14861.64]
[Головинский] [8.93] [103508] [11591.04]
[Гольяново] [14.99] [162

Unnamed: 0,Name,Latitude,Longitude,Area,Population,Density,URL,Borders_URL
0,Академический,55.685,37.578,5.83,110459,18946.7,http://mosopen.ru/region/akademicheskij,http://mosopen.ru/public/ymapsml.php?p=region/...
1,Алексеевский,55.811314,37.648773,5.29,80449,15207.8,http://mosopen.ru/region/alekseevskij,http://mosopen.ru/public/ymapsml.php?p=region/...
2,Алтуфьевский,55.8783,37.584,3.25,57601,17723.4,http://mosopen.ru/region/altufevskij,http://mosopen.ru/public/ymapsml.php?p=region/...
3,Арбат,55.75092,37.592189,2.11,36107,17112.3,http://mosopen.ru/region/arbat,http://mosopen.ru/public/ymapsml.php?p=region/...
4,Аэропорт,55.801,37.54,4.58,79413,17339.1,http://mosopen.ru/region/aeroport,http://mosopen.ru/public/ymapsml.php?p=region/...
...,...,...,...,...,...,...,...,...
120,Щукино,55.801,37.471,7.69,111207,14461.2,http://mosopen.ru/region/shchukino,http://mosopen.ru/public/ymapsml.php?p=region/...
121,Южнопортовый,55.715,37.675,4.53,74756,16502.4,http://mosopen.ru/region/yuzhnoportovyj,http://mosopen.ru/public/ymapsml.php?p=region/...
122,Якиманка,55.730832,37.61075,4.8,27754,5782.08,http://mosopen.ru/region/yakimanka,http://mosopen.ru/public/ymapsml.php?p=region/...
123,Ярославский,55.86,37.6925,7.99,98134,12282.1,http://mosopen.ru/region/yaroslavskij,http://mosopen.ru/public/ymapsml.php?p=region/...


#### Save dataframe to csv

In [20]:
# Write the assembled table to disk without the index column.
output_csv = 'moscow_neighbourhoods.csv'
df.to_csv(output_csv, index=False)

Unnamed: 0,Name,Latitude,Longitude,Area,Population,Density,URL,Borders_URL
