# Collecting data from the internet
---

Web scraping consists of:

- obtaining raw data from the source (`requests`)
- filtering useful data out of the surrounding markup (`beautifulsoup4`)
- saving it in an appropriate format (CSV/JSON)

`scrapy` is a scraping framework that combines all of the above steps (fetching, parsing and exporting) in a single tool

In [1]:
# requests is an HTTP library for Python
# it makes HTTP requests easier than the built-in urllib
import requests

In [2]:
url = 'http://www.mfd.gov.np'

In [3]:
# Always pass a timeout: without one, requests waits indefinitely if the
# server stops responding.
req = requests.get(url, timeout=10)

In [4]:
req

<Response [200]>

In [5]:
dir(req)

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [6]:
req.headers

{'Connection': 'close', 'Date': 'Wed, 22 Jun 2016 01:50:03 GMT', 'Set-Cookie': 'PHPSESSID=o1pnh3ebttfbb8d889rldcoe01; path=/', 'Cache-Control': 'no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'X-Powered-By': 'PHP/5.3.3', 'Pragma': 'no-cache', 'Content-Type': 'text/html; charset=utf-8', 'Expires': 'Thu, 19 Nov 1981 08:52:00 GMT', 'Transfer-Encoding': 'chunked', 'Server': 'Apache/2.2.15 (CentOS)'}

In [7]:
req.status_code

200

In [8]:
req.text



In [9]:
from bs4 import BeautifulSoup

In [10]:
soup = BeautifulSoup(req.text, "html.parser")

*you can use `lxml` instead of `html.parser`, which is much faster for large HTML content (it must be installed separately)*

In [11]:
soup

<!DOCTYPE html>

<html>
<head>
<title>Meteorological Forecasting Division</title>
<meta content="width=device-width, initial-scale=1.0, user-scalable=no" name="viewport">
<meta content="text/html;charset=utf-8" http-equiv="Content-Type"/>
<link href="/mfd.gov.np/_bootstrap/css/bootstrap.css" rel="stylesheet">
<link href="/mfd.gov.np/_stylesheets/mfd.css" rel="stylesheet">
<link href="/mfd.gov.np/_req/favicon.png" rel="icon" type="image/png">
<!-- HTML5 Shim and Respond.js IE8 support of HTML5 elements and media queries -->
<!--[if lt IE 9]>
	<script src="/mfd.gov.np/_js/html5shiv.js"></script>
	<script src="/mfd.gov.np/_js/respond.min.js"></script>


	<![endif]-->
</link></link></link></meta></head>
<body>
<div class="site-header">
<div class="container">
<img src="/mfd.gov.np/_req/logo.png" width="64px"/>
<h2 style="margin-bottom: 5px">Government of Nepal</h2>
<h2>Department of Hydrology and Meteorology</h2>
<h1>Meteorological Forecasting Division</h1>
</div>
<div class="container">
<

In [12]:
type(soup)

bs4.BeautifulSoup

In [13]:
dir(soup)

['ASCII_SPACES',
 'DEFAULT_BUILDER_FEATURES',
 'HTML_FORMATTERS',
 'ROOT_TAG_NAME',
 'XML_FORMATTERS',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_attr_value_as_string',
 '_attribute_checker',
 '_feed',
 '_find_all',
 '_find_one',
 '_formatter_for_name',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_most_recent_element',
 '_popToTag',
 '_select_debug',
 '_selector_combinators',
 '_should_pretty_print',
 '_tag_name_matches_and',
 'append',
 'attribselect_re',
 'attrs',
 'builder',

In [14]:
tables = soup.find_all('table')

In [15]:
tables

[<table class="table" style="margin-bottom: 0px"><tr><th>Station</th><th class="center">Maximum Temp.<br/>(°C)</th><th class="center">Minimum Temp.<br/>(°C)</th><th class="center">24 hrs Rainfall<br/>(mm)</th></tr><tr><td>Dadeldhura</td><td class="center">19.4</td><td class="center">18.1</td><td class="center">13.0</td></tr><tr><td>Dipayal</td><td class="center">28.0</td><td class="center">25.2</td><td class="center">9.6</td></tr><tr><td>Dhangadi</td><td class="center">28.7</td><td class="center">24.0</td><td class="center">88.3</td></tr><tr><td>Birendranagar</td><td class="center">27.2</td><td class="center">23.4</td><td class="center">29.3</td></tr><tr><td>Nepalgunj</td><td class="center">29.7</td><td class="center">25.5</td><td class="center">21.4</td></tr><tr><td>Jumla</td><td class="center">23.7</td><td class="center">16.5</td><td class="center">6.2</td></tr><tr><td>Dang</td><td class="center">29.2</td><td class="center">24.5</td><td class="center">5.4</td></tr><tr><td>Pokhara</td

In [16]:
len(tables)

1

In [17]:
tables[0]

<table class="table" style="margin-bottom: 0px"><tr><th>Station</th><th class="center">Maximum Temp.<br/>(°C)</th><th class="center">Minimum Temp.<br/>(°C)</th><th class="center">24 hrs Rainfall<br/>(mm)</th></tr><tr><td>Dadeldhura</td><td class="center">19.4</td><td class="center">18.1</td><td class="center">13.0</td></tr><tr><td>Dipayal</td><td class="center">28.0</td><td class="center">25.2</td><td class="center">9.6</td></tr><tr><td>Dhangadi</td><td class="center">28.7</td><td class="center">24.0</td><td class="center">88.3</td></tr><tr><td>Birendranagar</td><td class="center">27.2</td><td class="center">23.4</td><td class="center">29.3</td></tr><tr><td>Nepalgunj</td><td class="center">29.7</td><td class="center">25.5</td><td class="center">21.4</td></tr><tr><td>Jumla</td><td class="center">23.7</td><td class="center">16.5</td><td class="center">6.2</td></tr><tr><td>Dang</td><td class="center">29.2</td><td class="center">24.5</td><td class="center">5.4</td></tr><tr><td>Pokhara</td>

In [18]:
tables[0].find_all('tr')

[<tr><th>Station</th><th class="center">Maximum Temp.<br/>(°C)</th><th class="center">Minimum Temp.<br/>(°C)</th><th class="center">24 hrs Rainfall<br/>(mm)</th></tr>,
 <tr><td>Dadeldhura</td><td class="center">19.4</td><td class="center">18.1</td><td class="center">13.0</td></tr>,
 <tr><td>Dipayal</td><td class="center">28.0</td><td class="center">25.2</td><td class="center">9.6</td></tr>,
 <tr><td>Dhangadi</td><td class="center">28.7</td><td class="center">24.0</td><td class="center">88.3</td></tr>,
 <tr><td>Birendranagar</td><td class="center">27.2</td><td class="center">23.4</td><td class="center">29.3</td></tr>,
 <tr><td>Nepalgunj</td><td class="center">29.7</td><td class="center">25.5</td><td class="center">21.4</td></tr>,
 <tr><td>Jumla</td><td class="center">23.7</td><td class="center">16.5</td><td class="center">6.2</td></tr>,
 <tr><td>Dang</td><td class="center">29.2</td><td class="center">24.5</td><td class="center">5.4</td></tr>,
 <tr><td>Pokhara</td><td class="center">25.7

In [34]:
# Convert the first table into a list of dicts, one per data row, keyed by
# the column titles taken from the header row.
cities = []
headers = []
for row in tables[0].find_all('tr'):
    ths = row.find_all('th')
    if ths:
        # Header row: remember the column titles for the data rows that follow.
        headers = [th.text.strip() for th in ths]
        continue
    tds = row.find_all('td')
    # Skip rows that are not complete 4-column records, so that no empty
    # dict ends up in the result.
    if len(tds) < 4:
        continue
    # Only the first four columns are used.
    cities.append({headers[i]: tds[i].text.strip() for i in range(4)})
print(cities)

[{'Minimum Temp.(°C)': '18.1', '24 hrs Rainfall(mm)': '13.0', 'Maximum Temp.(°C)': '19.4', 'Station': 'Dadeldhura'}, {'Minimum Temp.(°C)': '25.2', '24 hrs Rainfall(mm)': '9.6', 'Maximum Temp.(°C)': '28.0', 'Station': 'Dipayal'}, {'Minimum Temp.(°C)': '24.0', '24 hrs Rainfall(mm)': '88.3', 'Maximum Temp.(°C)': '28.7', 'Station': 'Dhangadi'}, {'Minimum Temp.(°C)': '23.4', '24 hrs Rainfall(mm)': '29.3', 'Maximum Temp.(°C)': '27.2', 'Station': 'Birendranagar'}, {'Minimum Temp.(°C)': '25.5', '24 hrs Rainfall(mm)': '21.4', 'Maximum Temp.(°C)': '29.7', 'Station': 'Nepalgunj'}, {'Minimum Temp.(°C)': '16.5', '24 hrs Rainfall(mm)': '6.2', 'Maximum Temp.(°C)': '23.7', 'Station': 'Jumla'}, {'Minimum Temp.(°C)': '24.5', '24 hrs Rainfall(mm)': '5.4', 'Maximum Temp.(°C)': '29.2', 'Station': 'Dang'}, {'Minimum Temp.(°C)': '22.4', '24 hrs Rainfall(mm)': '24.5', 'Maximum Temp.(°C)': '25.7', 'Station': 'Pokhara'}, {'Minimum Temp.(°C)': '27.5', '24 hrs Rainfall(mm)': '73.5', 'Maximum Temp.(°C)': '27.6', '

**Alternative Method**

*in case of multiple tables within a webpage, we can first select the containing element by its CSS class and then search inside it*

In [35]:
div = soup.find('div', attrs={'class': 'weather-data-table'})

In [36]:
div

<div class="highlight-box weather-data-table"><h3>Observations <em>2016-06-21 17:45 NPT</em></h3><table class="table" style="margin-bottom: 0px"><tr><th>Station</th><th class="center">Maximum Temp.<br/>(°C)</th><th class="center">Minimum Temp.<br/>(°C)</th><th class="center">24 hrs Rainfall<br/>(mm)</th></tr><tr><td>Dadeldhura</td><td class="center">19.4</td><td class="center">18.1</td><td class="center">13.0</td></tr><tr><td>Dipayal</td><td class="center">28.0</td><td class="center">25.2</td><td class="center">9.6</td></tr><tr><td>Dhangadi</td><td class="center">28.7</td><td class="center">24.0</td><td class="center">88.3</td></tr><tr><td>Birendranagar</td><td class="center">27.2</td><td class="center">23.4</td><td class="center">29.3</td></tr><tr><td>Nepalgunj</td><td class="center">29.7</td><td class="center">25.5</td><td class="center">21.4</td></tr><tr><td>Jumla</td><td class="center">23.7</td><td class="center">16.5</td><td class="center">6.2</td></tr><tr><td>Dang</td><td class="

In [37]:
table = div.find('table')

In [38]:
# first_table = tables[0]

In [39]:
table.find_all('th', attrs={'class': 'center'})

[<th class="center">Maximum Temp.<br/>(°C)</th>,
 <th class="center">Minimum Temp.<br/>(°C)</th>,
 <th class="center">24 hrs Rainfall<br/>(mm)</th>]

In [40]:
# Build one record per data row of the selected table, using fixed key names.
data_set = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    # Ignore rows without at least four <td> cells (e.g. the header row,
    # which only contains <th> cells).
    if len(tds) <= 3:
        continue
    _data = {
        'Station': tds[0].string,
        'Maximum': tds[1].string,
        'Minimum': tds[2].string,
        'Rainfall': tds[3].string,
    }
    data_set.append(_data)
print(data_set)

[{'Rainfall': '13.0', 'Maximum': '19.4', 'Station': 'Dadeldhura', 'Minimum': '18.1'}, {'Rainfall': '9.6', 'Maximum': '28.0', 'Station': 'Dipayal', 'Minimum': '25.2'}, {'Rainfall': '88.3', 'Maximum': '28.7', 'Station': 'Dhangadi', 'Minimum': '24.0'}, {'Rainfall': '29.3', 'Maximum': '27.2', 'Station': 'Birendranagar', 'Minimum': '23.4'}, {'Rainfall': '21.4', 'Maximum': '29.7', 'Station': 'Nepalgunj', 'Minimum': '25.5'}, {'Rainfall': '6.2', 'Maximum': '23.7', 'Station': 'Jumla', 'Minimum': '16.5'}, {'Rainfall': '5.4', 'Maximum': '29.2', 'Station': 'Dang', 'Minimum': '24.5'}, {'Rainfall': '24.5', 'Maximum': '25.7', 'Station': 'Pokhara', 'Minimum': '22.4'}, {'Rainfall': '73.5', 'Maximum': '27.6', 'Station': 'Bhairahawa', 'Minimum': '27.5'}, {'Rainfall': '29.2', 'Maximum': '31.8', 'Station': 'Simara', 'Minimum': '27.0'}, {'Rainfall': '39.7', 'Maximum': '26.5', 'Station': 'Kathmandu', 'Minimum': '20.2'}, {'Rainfall': '21.4', 'Maximum': '24.5', 'Station': 'Okhaldhunga', 'Minimum': '18.0'}, {'R

In [42]:
data_set[0].keys()

dict_keys(['Rainfall', 'Maximum', 'Station', 'Minimum'])

*writing to csv file*

In [41]:
import csv

In [43]:
# newline='' lets the csv module control line endings itself (otherwise
# blank rows appear between records on Windows); the explicit encoding keeps
# the output identical across platforms.
with open('dataset.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # Column order follows the key order of the first record.
    csvdoc = csv.DictWriter(csvfile,
                            fieldnames=list(data_set[0]))
    csvdoc.writeheader()
    csvdoc.writerows(data_set)

In [44]:
data_set[0]

{'Maximum': '19.4',
 'Minimum': '18.1',
 'Rainfall': '13.0',
 'Station': 'Dadeldhura'}

In [45]:
data_set[0].keys()

dict_keys(['Rainfall', 'Maximum', 'Station', 'Minimum'])

*json output*

In [46]:
import json

In [49]:
# Use a context manager so the file is flushed and closed as soon as the
# dump finishes, instead of leaking the handle.
with open('dataset.json', 'w') as jsonfile:
    json.dump(data_set, jsonfile)

In [50]:
json.dumps(data_set)

'[{"Rainfall": "13.0", "Maximum": "19.4", "Station": "Dadeldhura", "Minimum": "18.1"}, {"Rainfall": "9.6", "Maximum": "28.0", "Station": "Dipayal", "Minimum": "25.2"}, {"Rainfall": "88.3", "Maximum": "28.7", "Station": "Dhangadi", "Minimum": "24.0"}, {"Rainfall": "29.3", "Maximum": "27.2", "Station": "Birendranagar", "Minimum": "23.4"}, {"Rainfall": "21.4", "Maximum": "29.7", "Station": "Nepalgunj", "Minimum": "25.5"}, {"Rainfall": "6.2", "Maximum": "23.7", "Station": "Jumla", "Minimum": "16.5"}, {"Rainfall": "5.4", "Maximum": "29.2", "Station": "Dang", "Minimum": "24.5"}, {"Rainfall": "24.5", "Maximum": "25.7", "Station": "Pokhara", "Minimum": "22.4"}, {"Rainfall": "73.5", "Maximum": "27.6", "Station": "Bhairahawa", "Minimum": "27.5"}, {"Rainfall": "29.2", "Maximum": "31.8", "Station": "Simara", "Minimum": "27.0"}, {"Rainfall": "39.7", "Maximum": "26.5", "Station": "Kathmandu", "Minimum": "20.2"}, {"Rainfall": "21.4", "Maximum": "24.5", "Station": "Okhaldhunga", "Minimum": "18.0"}, {"

**Practice:** *Obtain some data from any available website*