# Web Scraping

# JSON format

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import json

In [3]:
obj = """
{
    "name": "Kim",
    "places_lived": ["Seoul", "Korea"],
    "pet": null, 
    "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"}]
}
"""

In [4]:
type(obj)

str

In [5]:
r = json.loads(obj)   # decoding (json --> dict)
type(r)

dict

In [6]:
json.dumps(r)    # encoding (dict --> json)

'{"name": "Kim", "places_lived": ["Seoul", "Korea"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}]}'

In [7]:
# Exercise 2 (from https://rfriend.tistory.com/474)
py_data = {

    "1.FirstName": "Gildong",
    "2.LastName": "Hong",
    "3.Age": 20,
    "4.University": "Yonsei University",
    "5.Courses": [
        {
            "Classes": [
                "Probability",
                "Generalized Linear Model",
                "Categorical Data Analysis"
            ],
            "Major": "Statistics"
        },
        {
            "Classes": [
                "Data Structure",
                "Programming",
                "Algorithms"
            ],
            "Minor": "ComputerScience"
        }
    ]
}

In [8]:
type(py_data)

dict

In [9]:
import json
json_str = json.dumps(py_data)
print(type(json_str))
json_str

<class 'str'>


'{"1.FirstName": "Gildong", "2.LastName": "Hong", "3.Age": 20, "4.University": "Yonsei University", "5.Courses": [{"Classes": ["Probability", "Generalized Linear Model", "Categorical Data Analysis"], "Major": "Statistics"}, {"Classes": ["Data Structure", "Programming", "Algorithms"], "Minor": "ComputerScience"}]}'

In [10]:
pd.json_normalize(py_data)

Unnamed: 0,1.FirstName,2.LastName,3.Age,4.University,5.Courses
0,Gildong,Hong,20,Yonsei University,"[{'Classes': ['Probability', 'Generalized Line..."


In [11]:
pd.json_normalize(py_data, "5.Courses")

Unnamed: 0,Classes,Major,Minor
0,"[Probability, Generalized Linear Model, Catego...",Statistics,
1,"[Data Structure, Programming, Algorithms]",,ComputerScience


In [12]:
# JSON exercise3
# from https://pandas.pydata.org/pandas-docs/stable/reference/api/\
#              pandas.io.json.json_normalize.html
data = [{'state': 'Florida', 
         'shortname': 'FL', 
         'info': {'governor': 'Rick Scott'},
         'counties': [{'name': 'Dade', 'population': 12345},
                      {'name': 'Broward', 'population': 40000},
                      {'name': 'Palm Beach', 'population': 60000}]},
        {'state': 'Ohio',
         'shortname': 'OH',
         'info': {'governor': 'John Kasich'},
         'counties': [{'name': 'Summit', 'population': 1234},
                      {'name': 'Cuyahoga', 'population': 1337}]}]

In [13]:
type(data), len(data)

(list, 2)

In [14]:
pd.json_normalize(data)

Unnamed: 0,state,shortname,counties,info.governor
0,Florida,FL,"[{'name': 'Dade', 'population': 12345}, {'name...",Rick Scott
1,Ohio,OH,"[{'name': 'Summit', 'population': 1234}, {'nam...",John Kasich


In [15]:
pd.json_normalize(data, 'counties')

Unnamed: 0,name,population
0,Dade,12345
1,Broward,40000
2,Palm Beach,60000
3,Summit,1234
4,Cuyahoga,1337


In [16]:
pd.json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])

Unnamed: 0,name,population,state,shortname,info.governor
0,Dade,12345,Florida,FL,Rick Scott
1,Broward,40000,Florida,FL,Rick Scott
2,Palm Beach,60000,Florida,FL,Rick Scott
3,Summit,1234,Ohio,OH,John Kasich
4,Cuyahoga,1337,Ohio,OH,John Kasich


# HTML Parsing
- before you do this example, try to see and run some example HTML files which are in this directory

In [17]:
# !pip install bs4

In [18]:
from bs4 import BeautifulSoup

In [19]:
html_text = """
<html>
<body>
  <h1> reading web page with python </h1>
     <p> page analysis </p>
     <p> page alignment </p>
     <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>
"""

In [20]:
soup = BeautifulSoup(html_text, 'html.parser')
soup


<html>
<body>
<h1> reading web page with python </h1>
<p> page analysis </p>
<p> page alignment </p>
<td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>

In [21]:
soup.h1.text.strip()

'reading web page with python'

In [22]:
soup.p

<p> page analysis </p>

In [23]:
soup.p.next_sibling.next_sibling

<p> page alignment </p>

In [24]:
soup.td.next_sibling.next_sibling

<td><p>more text</p></td>

In [25]:
print(soup.td.next_sibling, soup.td.next_sibling.string)

<td></td> None


In [26]:
html_text2 = """
<html>
<body>
  <h1 id="title"> reading web page with python </h1>
     <p id="body"> page analysis </p>
     <p> page alignment </p>
     <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
     <ul>
         <li><a href = "http://www.naver.com"> naver</a></li>
         <li><a href = "http://www.daum.net"> daum</a></li>
     </ul>
  <div id="xxx">
    <h1> Wiki-books store </h1>
    <ul class="item">
      <li> introduction to game design </li>
      <li> introduction to python </li>
      <li> introduction to web design </li>
    </ul>
  </div>
</body>
</html>
"""

In [27]:
soup = BeautifulSoup(html_text2, 'html.parser')

### access by tags

In [28]:
soup.find(id='title')

<h1 id="title"> reading web page with python </h1>

In [29]:
soup.find(id='body').string

' page analysis '

In [30]:
soup.find_all('p')

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [31]:
soup.find_all('li')[0]

<li><a href="http://www.naver.com"> naver</a></li>

In [32]:
soup.find_all('li')[0].string, soup.find_all('li')[0].attrs

(' naver', {})

In [33]:
soup.find_all('a')[0]

<a href="http://www.naver.com"> naver</a>

In [34]:
soup.find_all('a')[0].string, soup.find_all('a')[0].attrs

(' naver', {'href': 'http://www.naver.com'})

In [35]:
for aa in soup.find_all('a'):
    href = aa.attrs['href']
    text = aa.string
    print (text, "-->", href)

 naver --> http://www.naver.com
 daum --> http://www.daum.net


### access by regular expression

In [36]:
import re
soup.find_all(re.compile("^p"))   # tags starting with a character 'p'

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [37]:
soup.find_all(re.compile("div" ))

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [38]:
soup.find_all(href=re.compile("^http://"))

[<a href="http://www.naver.com"> naver</a>,
 <a href="http://www.daum.net"> daum</a>]

### access by css (Cascading Style Sheets) selector

In [39]:
soup.select('h1')    # by tags

[<h1 id="title"> reading web page with python </h1>,
 <h1> Wiki-books store </h1>]

In [40]:
soup.select('#xxx')  # by id

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [41]:
soup.select('.item') # by class name

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [42]:
soup.select('div .item')  # multi-components(tag=div, class=item)

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [43]:
soup.select_one("#xxx > ul > li")  # hierarchy (child)

<li> introduction to game design </li>

In [44]:
soup.select("div li")   # hierarchy (div tag >>> ul tag) (descendants)

[<li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [45]:
text = '<p class="body strikeout"></p>'

css_soup = BeautifulSoup(text, 'html.parser')
css_soup.find_all("p", class_="strikeout")  # can have multiple values for a class

[<p class="body strikeout"></p>]

In [46]:
css_soup.find_all("p", class_="body")

[<p class="body strikeout"></p>]

In [47]:
# If you want to search for tags that match two or more CSS classes, 
# you should use a CSS selector:
css_soup.select("p.body.strikeout") 

[<p class="body strikeout"></p>]

## Example from Indeed
- newly added in 2021.7.15
- kr.indeed.com (search for 'data science' in '서울특별시')

In [48]:
import requests
from bs4 import BeautifulSoup

# Create a soup object
url = 'https://kr.indeed.com/jobs?q=data+science&l=%EC%84%9C%EC%9A%B8%ED%8A%B9%EB%B3%84%EC%8B%9C'
link = requests.get(url)
soup = BeautifulSoup(link.text, 'html.parser')

In [49]:
job_elems = soup.select('.resultContent') # class
len(job_elems)

15

In [50]:
print(job_elems[0])

<td class="resultContent"><div class="heading4 color-text-primary singleLineTitle tapItem-gutter"><h2 class="jobTitle jobTitle-color-purple jobTitle-newJob"><div class="new topLeft holisticNewBlue desktop"><span class="label">new</span></div><span title="[Janssen] Sr. Medical Science Liaison - PAH">[Janssen] Sr. Medical Science Liaison - PAH</span></h2></div><div class="heading6 company_location tapItem-gutter"><pre><span class="companyName"><a class="turnstileLink companyOverviewLink" data-tn-element="companyName" href="/cmp/Johnson-&amp;-Johnson" rel="noopener" target="_blank">Johnson &amp; Johnson Family of Companies</a></span><span class="ratingsDisplay withRatingLink"><a class="ratingLink" data-tn-variant="cmplinktst2" href="/cmp/Johnson-&amp;-Johnson/reviews" rel="noopener" target="_blank" title="Johnson &amp; Johnson Family of Companies reviews"><span aria-label="4.2 of stars rating" class="ratingNumber" role="img"><span aria-hidden="true">4.2</span><svg aria-hidden="true" class

In [51]:
job_elems[0].find('h2', class_='jobTitle').text.strip()

'new[Janssen] Sr. Medical Science Liaison - PAH'

In [52]:

job_elems[0].find('h2').text.strip()

'new[Janssen] Sr. Medical Science Liaison - PAH'

In [53]:
job_elems[0].find('span', class_='companyName').text.strip()

'Johnson & Johnson Family of Companies'

In [54]:
job_elems[0].find('div', class_='companyLocation').text.strip()

'서울'

In [55]:
for i in job_elems:
    # title = i.find('h2', class_='jobTitle')
    title = i.find('h2')
    company = i.find('span', class_='companyName')
    location = i.find('div', class_='companyLocation')
    
    if None in (title, company, location):
        continue
        
    print(title.text.strip())
    print(company.text.strip())
    print(location.text.strip())
    print()

new[Janssen] Sr. Medical Science Liaison - PAH
Johnson & Johnson Family of Companies
서울

2022년 5급 신입행원(석·박사)
KDB산업은행
서울

new2021년 9월 경력사원 (DX)
LG에너지솔루션
서울 영등포구

Data Scientist
Boston Consulting Group
서울

AI/빅데이터/클라우드 분야 (인재 Pool)
LG유플러스
서울

Data Analyst
IQVIA
서울

newCamera HW Architecture
Qualcomm Korea YH
서울

데이터사이언티스트 경력사원
LG전자
서울 영등포구

Data Scientist
IBM
서울

Developer / Designer recruit
마이뮤직테이스트
서울 논현동

Admin Assistant
IQVIA
서울

Specialist _ Data & AI
Microsoft
서울

Data Team Lead
IQVIA
서울

Data Analyst
IBM
서울

Application Scientist
Oxford Instruments Plc
서울



In [56]:
# put altogether
url = 'https://kr.indeed.com/jobs?q=data+science&l=%EC%84%9C%EC%9A%B8%ED%8A%B9%EB%B3%84%EC%8B%9C'
link = requests.get(url)
soup = BeautifulSoup(link.text, 'html.parser')

job_elems = soup.select('.resultContent') # class

for i in job_elems:
    title = i.find('h2')
    company = i.find('span', class_='companyName')
    location = i.find('div', class_='companyLocation')
    
    if None in (title, company, location):
        continue
        
    print(title.text.strip())
    print(company.text.strip())
    print(location.text.strip())

new[Janssen] Sr. Medical Science Liaison - PAH
Johnson & Johnson Family of Companies
서울
2022년 5급 신입행원(석·박사)
KDB산업은행
서울
new2021년 9월 경력사원 (DX)
LG에너지솔루션
서울 영등포구
Data Scientist
Boston Consulting Group
서울
AI/빅데이터/클라우드 분야 (인재 Pool)
LG유플러스
서울
Data Analyst
IQVIA
서울
newCamera HW Architecture
Qualcomm Korea YH
서울
데이터사이언티스트 경력사원
LG전자
서울 영등포구
Data Scientist
IBM
서울
Developer / Designer recruit
마이뮤직테이스트
서울 논현동
Admin Assistant
IQVIA
서울
Specialist _ Data & AI
Microsoft
서울
Data Team Lead
IQVIA
서울
Data Analyst
IBM
서울
Application Scientist
Oxford Instruments Plc
서울
