### requests & bs4 101

__Ducomentations__:
 - [requests](https://requests.readthedocs.io/en/master/)
 - [bs4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
 
__Installation__:
> pip install requests

> pip install bs4

or combined
> pip install requests, bs4
 
__Quick start__:
```python
import requests
from bs4 import BeautifulSoup as bs

resp = requests.get('https://www.imdb.com/chart/top/')
soup = bs(resp.text)

films_list = soup.find('tbody', {'class': 'lister-list'})
films = films_list.find_all('tr')
```

__BeautifulSoup object main methods__:
```python
soup.find(tag, {attr: value})
soup.find_all(tag, {attr: value})
```
basically that is it

for more BeautifulSoup object methods see
```python
dir(soup)
```

### Task 1

In [38]:
import requests
from bs4 import BeautifulSoup as bs

resp = requests.get('https://www.imdb.com/chart/top/')
soup = bs(resp.text)

films_set = soup.find('tbody', {'class': 'lister-list'}).find_all('tr')
films = [{
    'rank': int(
        film.find_all('td')[1].text.split('\n')[1].strip('.').strip()
    ),
    'name': film.find_all('td')[1].text.split('\n')[2].strip(),
    'year': int(
        film.find_all('td')[1].text.split('\n')[3].strip('()')
    ),
    'rating': float(
        film.find_all('td')[2].text.strip()
    )
 } for film in films_set]

films[0]

{'rank': 1, 'name': 'Побег из Шоушенка', 'year': 1994, 'rating': 9.2}

### Task 2

In [39]:
# oldest film
sorted(films, key=lambda x: x['year'])[0]

{'rank': 101, 'name': 'Малыш', 'year': 1921, 'rating': 8.2}

In [40]:
# newest film
sorted(films, key=lambda x: x['year'])[1]

{'rank': 198, 'name': 'Шерлок младший', 'year': 1924, 'rating': 8.1}

In [41]:
# average age of films by bins of 50
film_groups = [films[i:i+50] for i in range(len(films)//50)]
average_by_group = []
for group in film_groups:
    years = [film['year'] for film in group]
    average_by_group.append(
        sum(years)/len(years))
    
average_by_group

[1988.14, 1988.02, 1987.66, 1987.76, 1987.18]

In [47]:
for i, film in enumerate(films_set):
    url = 'https://imdb.com' + film.find('td').find('a').attrs['href']
    films[i].update({'url': url})
    
films[0]

{'rank': 1,
 'name': 'Побег из Шоушенка',
 'year': 1994,
 'rating': 9.2,
 'url': 'https://imdb.com/title/tt0111161/'}

### Task 3

In [54]:
# try for one
i = 0
film = films[i]

resp = requests.get(film['url'])
soup = bs(resp.text)

genre = soup.find('div', {'class': 'subtext'}).find('a').text

'Drama'

In [57]:
# extrapolate for all
for i, film in enumerate(films):
    resp = requests.get(film['url'])
    soup = bs(resp.text)

    genre = soup.find('div', {'class': 'subtext'}).find('a').text
    film.update({'genre': genre})
    
    # takes too long, whant to know the progress
    # so break here, tweak it below
    break

In [60]:
# a little tweak 
for i, film in enumerate(films):
    resp = requests.get(film['url'])
    soup = bs(resp.text)

    genre = soup.find('div', {'class': 'subtext'}).find('a').text
    film.update({'genre': genre})
    
    print('Films scraped:', i+1, end='\r')
    
    # takes too long
    # tweak it even more below to find why
    break

Films scraped: 1

In [74]:
import time

time.time()

1604632426.52467

In [76]:
# more tweaks
for i, film in enumerate(films):
    t = time.time()
    resp = requests.get(film['url'])
    print('getting response time:', time.time() - t)
    
    t = time.time()
    soup = bs(resp.text)
    print('creating soup time:', time.time() - t)
    
    t = time.time()    
    genre = soup.find('div', {'class': 'subtext'}).find('a').text
    film.update({'genre': genre})
    print('finding genre time:', time.time() - t)
    
    break

getting response time: 1.8876850605010986
creating soup time: 0.16862916946411133
finding genre time: 0.004058122634887695


In [78]:
# so the worst thing here is getting response from the server
# nothing special to do about it, so leave it as it is

In [60]:
for i, film in enumerate(films):
    resp = requests.get(film['url'])
    soup = bs(resp.text)

    genre = soup.find('div', {'class': 'subtext'}).find('a').text
    film.update({'genre': genre})
    
    print('Films scraped:', i+1, end='\r')

Films scraped: 1

### Task 4

In [88]:
resp = requests.get('https://ru.wikipedia.org/wiki/%D0%93%D0%BE%D0%B3%D0%B8%D1%8F,_%D0%95%D0%BB%D0%B5%D0%BD%D0%B0_%D0%9E%D1%82%D0%B0%D1%80%D0%B8%D0%B5%D0%B2%D0%BD%D0%B0')
soup = bs(resp.text)

a_tags = soup.find_all('a')
links = []
for tag in a_tags:
    if 'href' in tag.attrs:
        link = tag.attrs['href']

        links.append(tag.attrs['href'])

In [91]:
def find_wiki_links(url):
    resp = requests.get(url)
    if not resp.status_code == 200:
        return 'error accessing website'
    soup = bs(resp.text)
    
    a_tags = soup.find_all('a')
    links = []
    for tag in a_tags:
        if 'href' in tag.attrs:
            link = tag.attrs['href']
            if link.startswith('/wiki/'):
                links.append(tag.attrs['href'])
    
    return links

### Bonus
__working with urls__

We will need a library called `urllib`

It should be preinstalled

In [2]:
from urllib.parse import urlparse, parse_qs, urlunparse, urlencode

In [8]:
url = 'https://cacs.econ.msu.ru/index.php?mnu=75?&selst=20717'
parse = urlparse(url)

parse

ParseResult(scheme='https', netloc='cacs.econ.msu.ru', path='/index.php', params='', query='mnu=75?&selst=20717', fragment='')

In [9]:
# parse query of url into python dictionary
parse_qs(parse.query)

{'mnu': ['75?'], 'selst': ['20717']}

In [10]:
# unparse a dictionary into query
my_query = {'mnu': '75', 'selst': '20716'}
urlencode(my_query)

'mnu=75&selst=20716'

In [14]:
# build custom url based on params
my_id = '20717'

urlunparse([
    'https',
    'cacs.econ.msu.ru',
    '/index.php',
    '',
    urlencode({
        'mnu': '75',
        'selst': my_id
    }),
    ''
])

'https://cacs.econ.msu.ru/index.php?mnu=75&selst=20717'

In [15]:
# now we can make a handy function for that
def build_url(student_id):
    return urlunparse([
        'https',
        'cacs.econ.msu.ru',
        '/index.php',
        '',
        urlencode({
            'mnu': '75',
            'selst': student_id
        }),
        ''
    ])

In [17]:
id_list = [20717, 20718, 20719]

for student_id in id_list:
    url = build_url(student_id)
    print(url)

https://cacs.econ.msu.ru/index.php?mnu=75&selst=20717
https://cacs.econ.msu.ru/index.php?mnu=75&selst=20718
https://cacs.econ.msu.ru/index.php?mnu=75&selst=20719
