In [76]:
import requests
import json
import urllib.parse
from bs4 import BeautifulSoup
from time import sleep
from tqdm import tqdm
import re
import pandas as pd
import sys
!{sys.executable} -m pip install folium
import os 
import folium



In [12]:
def getSoup(link):
    sleep(0.1) #to be kind to the website
    r = requests.get(link)
    r.encoding = 'UTF-8'
    return BeautifulSoup(r.text,'lxml')

soup = getSoup("https://www.skupina.coop/cooperative/list/")

In [15]:
def getAllLinks(link):
    soup = getSoup(link)
    h2s = soup.findAll('h2', {'class':'cooperative'})
    return ['https://www.skupina.coop' + h2.find('a')['href'] for h2 in h2s]

links = getAllLinks('https://www.skupina.coop/cooperative/list/')
print(links)

['https://www.skupina.coop/jednota-benesov-druzstvo', 'https://www.skupina.coop/coop-horovice-druzstvo', 'https://www.skupina.coop/druzstvo-jednota-kladno', 'https://www.skupina.coop/coop-mnichovo-hradiste-druzstvo', 'https://www.skupina.coop/jednota-spotrebni-druzstvo-v-nymburce', 'https://www.skupina.coop/coop-praha-zapad', 'https://www.skupina.coop/coop-pribram-druzstvo', 'https://www.skupina.coop/jednota-spotrebni-druzstvo-v-rakovniku', 'https://www.skupina.coop/coop-druzstvo-plasy', 'https://www.skupina.coop/zapadoceske-konzumni-druzstvo-plzen', 'https://www.skupina.coop/jednota-konzumni-druzstvo-rokycany', 'https://www.skupina.coop/zapadoceske-konzumni-druzstvo-susice', 'https://www.skupina.coop/jednota-spotrebni-druzstvo-v-tachove', 'https://www.skupina.coop/coop-jednota-touzim-spotrebni-druzstvo', 'https://www.skupina.coop/spotrebni-druzstvo-jednota-kadan', 'https://www.skupina.coop/druzstvo-jednota-most', 'https://www.skupina.coop/jednota-spotrebni-druzstvo-v-podboranech', 'ht

In [16]:
def getAllSublinks(link):
    list = []
    links = getAllLinks(link)
    for link in links:
        sleep(0.1)
        soup = getSoup(link)
        aas = soup.findAll('a', text = re.compile("detail"))      
        list.extend(['https://www.skupina.coop' + a['href'] for a in aas])
    return list

sublinks = getAllSublinks('https://www.skupina.coop/cooperative/list/')
len(sublinks)

2502

In [17]:
def parse_name(soup):
    return "COOP " + soup.find('h1').find("span").text

parse_name(getSoup("https://www.skupina.coop/coop-vlasimska-7-lounovice-pod-blanikem-254"))

'COOP Vlašimská 7, Louňovice pod Blaníkem'

In [18]:
def parse_address(soup):
    address = soup.find('h2', text = re.compile("ADRESA")).next_sibling.next_sibling.get_text(separator = ", ").replace("\n","").replace("\t","")
    return address[:(len(address)-2)]
    
parse_address(getSoup("https://www.skupina.coop/coop-vlasimska-7-lounovice-pod-blanikem-254"))

'Vlašimská 7, 257 06 Louňovice pod Blaníkem, Benešov'

In [19]:
def parse_druzstvo(soup):
    return soup.find('h2', text = re.compile("Družstvo")).next_sibling.next_sibling.text.strip()
    
parse_druzstvo(getSoup("https://www.skupina.coop/coop-vlasimska-7-lounovice-pod-blanikem-254"))

'JEDNOTA Benešov, družstvo'

In [20]:
def parse_druzstvo_link(soup):
    return 'https://www.skupina.coop' + soup.find('h2', text = re.compile("Družstvo")).next_sibling.next_sibling.find("a")["href"]
    
parse_druzstvo_link(getSoup("https://www.skupina.coop/coop-vlasimska-7-lounovice-pod-blanikem-254"))

'https://www.skupina.coop/jednota-benesov-druzstvo'

In [21]:
def parse_phone(soup):
    try:
        return soup.find('div',{'class':'kontakty'}).find('li',{'class':'telefon'}).text.strip()
    except:
        return "-"

parse_phone(getSoup("https://www.skupina.coop/coop-vlasimska-7-lounovice-pod-blanikem-254"))

'317 852 650'

In [22]:
def parse_link(soup):
    try:
        return soup.find('div',{'class':'kontakty'}).find('li',{'class':'link'}).text.strip()
    except:
        return "-"


#parse_link(getSoup("https://www.skupina.coop/coop-vlasimska-7-lounovice-pod-blanikem-254"))
parse_link(getSoup("https://www.skupina.coop/coop-sobechleby-129-sobechleby-617"))

'www.jednotaostroh.cz'

In [23]:
def parse_mail(soup):
    try:
        return soup.find('div',{'class':'kontakty'}).find('li',{'class':'mail'}).text.strip()
    except:
        return "-"

parse_mail(getSoup(parse_druzstvo_link(getSoup("https://www.skupina.coop/coop-vlasimska-7-lounovice-pod-blanikem-254"))))

'-'

In [25]:
def parse_coop_shops(link,pause=.5)

    sleep(pause)

    soup = getSoup(link)
    soup_dr = getSoup(parse_druzstvo_link(getSoup(link)))
    gps = get_GPS(soup)

    return pd.Series({
        'name':parse_name(soup),
        'address':parse_address(soup),
        'phone':parse_phone(soup),
        'mail':parse_mail(soup),
        'webpage':parse_link(soup),
        'druzstvo':parse_druzstvo(soup),
       # 'address_dr':parse_address(soup_dr),
        'phone_dr':parse_phone(soup_dr),
        'mail_dr':parse_mail(soup_dr),
        'webpage_dr':parse_link(soup_dr),
        'lat':gps[0],
        'lng':gps[1]
    })

#parse_coop_shops("https://www.skupina.coop/coop-vlasimska-7-lounovice-pod-blanikem-254")

In [26]:
def get_all_shops(link):
    '''
    Because of API requests limitation, do not use for all sublinks.
    '''
    links = getAllSublinks(link)[:2]

    return pd.DataFrame([parse_coop_shops(link) for link in links])

shops = get_all_shops('https://www.skupina.coop/cooperative/list/')
print(shops)

                                       name  \
0  COOP Vlašimská 7, Louňovice pod Blaníkem   
1                   COOP Husova 471, Votice   

                                             address        phone mail  \
0  Vlašimská 7, 257 06 Louňovice pod Blaníkem, Be...  317 852 650    -   
1                 Husova 471, 259 01 Votice, Benešov  317 812 530    -   

  webpage                   druzstvo     phone_dr mail_dr webpage_dr  \
0       -  JEDNOTA Benešov, družstvo  317 719 111       -          -   
1       -  JEDNOTA Benešov, družstvo  317 719 111       -          -   

         lat        lng  
0  49.638473  14.847524  
1  49.638246  14.636691  


In [24]:
def get_GPS(soup):
        '''
    Only 10 000 requests are free with this API code. Use wisely.
    '''
    key = "U4Bdq1mMc6cN6d6RAGSJpVA0GdKGVIHk"
    adr = urllib.parse.quote_plus(parse_address(soup))
    r = requests.get("https://www.mapquestapi.com/geocoding/v1/address?key={}&inFormat=kvp&outFormat=json&location={}&thumbMaps=false".format(key,adr))
    d = r.json()
    return [d["results"][0]["locations"][0]["latLng"]["lat"],d["results"][0]["locations"][0]["latLng"]["lng"]]

In [141]:
def plotMap (df):
    Mlat = (df["lat"].max()-df["lat"].min())/2+df["lat"].min()
    Mlng = (df["lng"].max()-df["lng"].min())/2+df["lng"].min()
    m = folium.Map(location = [Mlat,Mlng])
    for k in df.index:
        html = '''<meta charset="UTF-8">
                <b>{}</b><br>
                Phone: {}<br>
                Mail: {}'''.format(df["name"][k],df["phone"][k],df["mail"][k])
        iframe = folium.IFrame(html,width=250,height=88)
        folium.Marker(
        [df["lat"][k], df["lng"][k]],
        popup = folium.Popup(iframe,max_width=250),
        tooltip = "Click!",
        ).add_to(m)   
    return m

In [142]:
plotMap(shops)

In [37]:
(shops["lng"].max()-shops["lng"].min()/2)+shops["lng"].min()


22.1658695

In [61]:
shops["lat"].max()

49.638473

In [79]:
print(sys.stdin.encoding)

cp1250
