# Web Scraping with Beautiful Soup

Crawling job listings on indeed.com

In [None]:
from bs4 import BeautifulSoup
import urllib
import re
import pandas as pd

### 1. Find Job Listing Link

The mobile version of indeed.com has simpler HTML, so I will crawl it specifically: https://www.indeed.com/m/

In [None]:
from urllib.parse import urljoin
from urllib.request import urlopen

# Search-results page of the mobile site for "data scientist" in Los Angeles, CA.
url = "https://www.indeed.com/m/jobs?q=data+scientist&l=Los+Angeles%2C+CA"
# Parse inside a `with` block so the HTTP response is closed as soon as the
# document has been read, instead of staying open for the whole session.
with urlopen(url) as page:
    soup = BeautifulSoup(page, 'lxml')

# Every <a rel="nofollow"> anchor on the results page (presumably the links to
# the individual job pages -- confirm against the live markup).
all_matches = soup.find_all('a', attrs={'rel':['nofollow']})
for link in all_matches:
    print (link['href'])
    print (type(link['href']))
    # urljoin resolves relative, root-relative and absolute hrefs correctly,
    # whereas plain concatenation only works for relative ones.
    print (urljoin("https://www.indeed.com/m/", link['href']))

### 2. Scrape job title, company, location and job description for each job


In [None]:
# Quick example: a small hand-written HTML document with a title inside
# <p><b><font>, followed by a company name and a <span class="location">.
example_html = """
<html>
	<body>
		<p>
			<b>
				<font size="+1">Data Scientist - Machine Learning</font>
			</b>
			<br>Google - <span class="location">San Jose, CA</span>
		</p>
	</body>
</html>
"""

In [None]:
# Parse the sample snippet. It is stored in `example_html`; the previous
# name `test_html` was never defined and raised NameError.
bs = BeautifulSoup(example_html,'lxml')
# print Job Title: text of the <font> inside <b> inside the first <p>
print(bs.body.p.b.font.text)

In [None]:
# print Location: text of the first <span> inside the first <p> of the body
location_tag = bs.body.p.span
print(location_tag.text)

In [None]:
# Scraping real jobs from indeed
from urllib.parse import urljoin

# One entry per job page, kept in the same order across the four lists.
title = []
company = []
location = []
jd = []
for each in all_matches:
    # Resolve the listing href against the mobile-site base URL (https, like
    # the search URL); urljoin also copes with root-relative/absolute hrefs.
    jd_url = urljoin('https://www.indeed.com/m/', each['href'])
    # Close each response once parsed so connections are not leaked per job.
    with urlopen(jd_url) as jd_page:
        jd_soup = BeautifulSoup(jd_page, 'lxml')
    jd_desc = jd_soup.find_all('div', attrs={'id':['desc']})  # matches <div id="desc">
    # NOTE(review): the lookups below raise IndexError/AttributeError when a
    # page lacks the expected <div id="desc"> or <p><b><font> structure.
    title.append(jd_soup.body.p.b.font.text)
    company.append(jd_desc[0].span.text)
    location.append(jd_soup.body.p.span.text)
    jd.append(jd_desc[0].text)

In [None]:
# Description: full text of the first element held in jd_desc
description_block = jd_desc[0]
print(description_block.text)

In [None]:
# Title: text of the <font> inside <b> inside the first <p> of the page body
title_tag = jd_soup.body.p.b.font
print(title_tag.text)

In [None]:
# Company, option 1: text of the first <span> inside the description element
desc_block = jd_desc[0]
print(desc_block.span.text)
# Company, option 2: take the node right before the first <span> of the first
# <p>, keep what precedes the first '-', and drop its first character.
text_before_span = jd_soup.body.p.span.previous_sibling
print(text_before_span.split('-')[0][1:])

### 3. Save data to DataFrame

In [None]:
# Pair each column label with its scraped list, then build the table.
column_labels = ('title', 'company', 'location', 'Job Description')
column_values = (title, company, location, jd)
job = dict(zip(column_labels, column_values))
df = pd.DataFrame.from_dict(job)
# Bare expression so the notebook renders the table as the cell output.
df

### 4. Move Through Pages

In [None]:
from urllib.parse import urljoin

# Start from empty lists so results of earlier cells are not mixed in.
title = []
company = []
location = []
jd = []
base_url = 'https://www.indeed.com/m/'
max_pages = 2  # search to page 2
url = "https://www.indeed.com/m/jobs?q=data+scientist&l=Los+Angeles%2C+CA"
for _ in range(max_pages):

    # Close every response once parsed so connections are not leaked.
    with urlopen(url) as page:
        soup = BeautifulSoup(page, 'lxml')
    # Only <a> tags are considered (as in the first search cell), since the
    # loop below needs an href on every match.
    all_matches = soup.find_all('a', attrs={'rel':['nofollow']})
    for each in all_matches:
        # urljoin handles relative, root-relative and absolute hrefs.
        jd_url = urljoin(base_url, each['href'])
        with urlopen(jd_url) as jd_page:
            jd_soup = BeautifulSoup(jd_page, 'lxml')
        jd_desc = jd_soup.find_all(attrs={'id':['desc']})
        # NOTE(review): raises IndexError/AttributeError when a page lacks the
        # expected id="desc" element or <p><b><font> structure.
        title.append(jd_soup.body.p.b.font.text)
        company.append(jd_desc[0].span.text)
        location.append(jd_soup.body.p.span.text)
        jd.append(jd_desc[0].text)

    ## Change the pages to Next Page
    url_all = soup.find_all(attrs={'rel':['next']})
    if not url_all:
        # No rel="next" element: this was the last results page, so stop
        # instead of failing with IndexError on url_all[0].
        break
    url = urljoin(base_url, str(url_all[0]['href']))

In [None]:
# Collect the four result lists under their column labels, in display order.
job = {}
job['title'] = title
job['company'] = company
job['location'] = location
job['Job Description'] = jd
df = pd.DataFrame.from_dict(job)
# Bare expression so the notebook renders the table as the cell output.
df