In [None]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

# Set up the query and URL

In [None]:
def get_url(position, location):
    """Generate the Indeed search URL for a position and location.

    Both values are encoded as query-string components, so spaces become
    ``+`` and reserved characters such as ``&``, ``#``, ``,`` or ``+`` are
    percent-escaped instead of corrupting the query.

    Args:
        position: Job title or keywords to search for,
            e.g. ``'software developer'``.
        location: Location text to search in, e.g. ``'san jose ca'``.

    Returns:
        The search URL as a string.
    """
    # Imported locally so the function does not rely on another cell's imports.
    from urllib.parse import urlencode

    # urlencode applies quote_plus to every value: ' ' -> '+', '+' -> '%2B', ...
    query = urlencode({'q': position, 'l': location})
    return 'https://www.indeed.com/jobs?' + query

In [None]:
url = get_url('software developer', 'san jose ca')
print(url)

https://www.indeed.com/jobs?q=software+developer&l=san+jose+ca


# Extract the HTML data

In [None]:
# Fetch the first results page. Without an explicit timeout, requests waits
# indefinitely on a stalled connection, which would hang the notebook.
response = requests.get(url, timeout=30)

In [None]:
response

<Response [200]>

In [None]:
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
cards = soup.find_all('div', attrs={'class':'job_seen_beacon'})

In [None]:
len(cards)

15

# Prototype the model with a single record

In [None]:
card = cards[1]

In [None]:
print(card)

<div class="job_seen_beacon"><table cellpadding="0" cellspacing="0" class="jobCard_mainContent big6_visualChanges" role="presentation"><tbody><tr><td class="resultContent"><div class="heading4 color-text-primary singleLineTitle tapItem-gutter"><h2 class="jobTitle jobTitle-color-purple"><a aria-label="full details of Visual Basic programmers" class="jcs-JobTitle" data-hide-spinner="true" data-hiring-event="false" data-jk="38e008dde8e71c32" data-mobtk="1g29d81bcs7lg800" href="/rc/clk?jk=38e008dde8e71c32&amp;fccid=b9e65480db9411c1&amp;vjs=3" id="job_38e008dde8e71c32" role="button" target="_blank"><span title="Visual Basic programmers">Visual Basic programmers</span></a></h2></div><div class="heading6 company_location tapItem-gutter companyInfo"><span class="companyName">Tempo-TC</span><div class="companyLocation">Silicon Valley, CA</div></div><div class="heading6 tapItem-gutter metadataContainer noJEMChips salaryOnly"><div class="metadata estimated-salary-container"><span class="estimated

In [None]:
job_title = card.find('h2').find('span').text.strip()

In [None]:
print(job_title)

Visual Basic programmers


In [None]:
try:
  company = card.find('span', 'companyName').text.strip()
except AttributeError:
    company = 'N/A'

In [None]:
print(company)

Tempo-TC


In [None]:
job_location = card.find('div', {'class': 'companyLocation'}).text.strip()

In [None]:
print(job_location)

Silicon Valley, CA


In [None]:
job_summary = card.find('div', 'job-snippet').text.strip()

In [None]:
print(job_summary)

Relocation assistance for you and your family.
Bachelor's or Master's Degree.


In [None]:
post_date = card.find('span', 'date').text.strip()

In [None]:
print(post_date)

Posted30+ days ago


In [None]:
today = datetime.today().strftime('%Y-%m-%d')

In [None]:
print(today)

2022-05-05


In [None]:
try:
  job_salary = card.find('span', 'estimated-salary').text.strip()
except AttributeError:
    job_salary = 'N/A'

In [None]:
record = (job_title, company, job_location, job_salary, job_summary, post_date, today)

In [None]:
print(record)

('Visual Basic programmers', 'Tempo-TC', 'Silicon Valley, CA', 'Estimated $76.1K - $96.3K a year', "Relocation assistance for you and your family.\nBachelor's or Master's Degree.", 'Posted30+ days ago', '2022-05-05')


# Generalize the model with a function

In [None]:
def get_record(card):
    """Extract job data from a single search-result card.

    Args:
        card: One parsed ``job_seen_beacon`` element (a BeautifulSoup tag).

    Returns:
        A tuple ``(job_title, company, job_location, job_salary,
        job_summary, post_date, today)``. ``today`` is the extraction date
        formatted as ``YYYY-MM-DD``. Any field whose element is absent from
        the card is reported as ``'N/A'`` rather than raising, so one
        incomplete card does not abort the whole scrape.
    """

    def text_of(tag):
        """Return the stripped text of ``tag``, or 'N/A' when it was not found."""
        return tag.text.strip() if tag is not None else 'N/A'

    # Prefer the <span> that carries a ``title`` attribute. Taking the first
    # <span> under the heading produced rows titled 'new' in the scraped data
    # (presumably a badge that precedes the real title -- TODO confirm against
    # the live markup). Fall back to the first <span> if none has a title.
    heading = card.find('h2')
    title_tag = None
    if heading is not None:
        title_tag = heading.find('span', title=True)
        if title_tag is None:
            title_tag = heading.find('span')
    job_title = text_of(title_tag)

    job_location = text_of(card.find('div', {'class': 'companyLocation'}))
    post_date = text_of(card.find('span', 'date'))
    today = datetime.today().strftime('%Y-%m-%d')
    job_summary = text_of(card.find('div', 'job-snippet'))

    # These do not exist for all jobs, so they also fall back to 'N/A'.
    job_salary = text_of(card.find('span', 'estimated-salary'))
    company = text_of(card.find('span', 'companyName'))

    record = (job_title, company, job_location, job_salary, job_summary, post_date, today)
    return record

In [None]:
# Accumulator for the extracted record tuples; the pagination cell below
# keeps appending to this same list.
records = []

# Convert every card parsed from the current page into a record.
for card in cards:
    record = get_record(card)
    records.append(record)

# Get the next page

In [None]:
# Follow the "Next" link until a results page no longer offers one.
while True:
    next_link = soup.find('a', {'aria-label': 'Next'})
    # Check explicitly instead of catching AttributeError: a missing link and
    # a link without an href both mean there is no further page, and the
    # latter would otherwise fail with a TypeError on the concatenation.
    next_href = next_link.get('href') if next_link is not None else None
    if not next_href:
        break
    url = 'https://www.indeed.com' + next_href

    # A timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('div', attrs={'class':'job_seen_beacon'})

    for card in cards:
        record = get_record(card)
        records.append(record)

# Putting it all together

In [None]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup


def get_url(position, location):
    """Generate the Indeed search URL for a position and location.

    Both values are encoded as query-string components, so spaces become
    ``+`` and reserved characters such as ``&``, ``#``, ``,`` or ``+`` are
    percent-escaped instead of corrupting the query.

    Args:
        position: Job title or keywords to search for,
            e.g. ``'software developer'``.
        location: Location text to search in, e.g. ``'san jose ca'``.

    Returns:
        The search URL as a string.
    """
    # Imported locally so the function does not rely on another cell's imports.
    from urllib.parse import urlencode

    # urlencode applies quote_plus to every value: ' ' -> '+', '+' -> '%2B', ...
    query = urlencode({'q': position, 'l': location})
    return 'https://www.indeed.com/jobs?' + query


def get_record(card):
    """Extract job data from a single search-result card.

    Args:
        card: One parsed ``job_seen_beacon`` element (a BeautifulSoup tag).

    Returns:
        A tuple ``(job_title, company, job_location, job_salary,
        job_summary, post_date, today)``. ``today`` is the extraction date
        formatted as ``YYYY-MM-DD``. Any field whose element is absent from
        the card is reported as ``'N/A'`` rather than raising, so one
        incomplete card does not abort the whole scrape.
    """

    def text_of(tag):
        """Return the stripped text of ``tag``, or 'N/A' when it was not found."""
        return tag.text.strip() if tag is not None else 'N/A'

    # Prefer the <span> that carries a ``title`` attribute. Taking the first
    # <span> under the heading produced rows titled 'new' in the scraped data
    # (presumably a badge that precedes the real title -- TODO confirm against
    # the live markup). Fall back to the first <span> if none has a title.
    heading = card.find('h2')
    title_tag = None
    if heading is not None:
        title_tag = heading.find('span', title=True)
        if title_tag is None:
            title_tag = heading.find('span')
    job_title = text_of(title_tag)

    job_location = text_of(card.find('div', {'class': 'companyLocation'}))
    post_date = text_of(card.find('span', 'date'))
    today = datetime.today().strftime('%Y-%m-%d')
    job_summary = text_of(card.find('div', 'job-snippet'))

    # These do not exist for all jobs, so they also fall back to 'N/A'.
    job_salary = text_of(card.find('span', 'estimated-salary'))
    company = text_of(card.find('span', 'companyName'))

    record = (job_title, company, job_location, job_salary, job_summary, post_date, today)
    return record


def main(position, location):
    """Scrape every results page for a search and append the rows to results.csv.

    Starting from the URL built by ``get_url``, each page is downloaded, its
    ``job_seen_beacon`` cards are converted with ``get_record``, and the
    ``Next`` link is followed until a page has none. The collected records
    are then appended to ``results.csv`` in the working directory.

    Args:
        position: Job title or keywords to search for.
        location: Location text to search in.

    Returns:
        None. The result is the side effect of appending to ``results.csv``.
    """
    # Imported locally so the function does not rely on another cell's imports.
    import os

    records = []
    url = get_url(position, location)

    # extract the job data
    while True:
        # A timeout keeps a stalled connection from hanging the run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', attrs={'class':'job_seen_beacon'})
        records.extend(get_record(card) for card in cards)

        # Check explicitly instead of catching AttributeError: a missing link
        # and a link without an href both mean there is no further page, and
        # the latter would otherwise fail with a TypeError on concatenation.
        next_link = soup.find('a', {'aria-label': 'Next'})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            break
        url = 'https://www.indeed.com' + next_href

    # save the job data
    output_path = 'results.csv'
    # The file is opened in append mode so several searches can accumulate in
    # it; the header must therefore be written only when the file is new or
    # empty, otherwise every call leaves another header row inside the data.
    needs_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
    with open(output_path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(['JobTitle', 'Company', 'Location', 'Salary', 'Summary', 'PostDate', 'ExtractDate'])
        writer.writerows(records)

In [None]:
# run the main program
main('ui designer', 'san jose, ca')

# Data Analysis

## Import Data

In [None]:
import pandas as pd
import seaborn as sns
import json
from urllib import request

In [None]:
results = pd.read_csv('results.csv')

In [None]:
results.head()

Unnamed: 0,JobTitle,Company,Location,Salary,Summary,PostDate,ExtractDate
0,Quantitative Analytics Professional - Data Sci...,Freddie Mac,"Hybrid remote in McLean, VA 22102",,Qualifying coursework may include—but is not l...,Posted30+ days ago,2022-05-05
1,Data Scientist,Thomson Reuters,"McLean, VA 22102",,"Do you have the skills necessary to manage, un...",Posted30+ days ago,2022-05-05
2,Data Scientist,"Strider, Inc.","Remote in Tysons Corner, VA",,Collaborate with subject matter experts to del...,EmployerActive 17 days ago,2022-05-05
3,new,Navy Federal Credit Union,"Hybrid remote in Vienna, VA 22180",,"Advanced skill data mining, data wrangling, an...",Posted7 days ago,2022-05-05
4,Data Scientist (Banking/Financials),Pitney Bowes,"Remote in Washington, DC",,Contribute to the creation of a new analytics ...,Posted30+ days ago,2022-05-05


In [None]:
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2476 entries, 0 to 2475
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   JobTitle     2476 non-null   object
 1   Company      2476 non-null   object
 2   Location     2476 non-null   object
 3   Salary       572 non-null    object
 4   Summary      2476 non-null   object
 5   PostDate     2476 non-null   object
 6   ExtractDate  2476 non-null   object
dtypes: object(7)
memory usage: 135.5+ KB


## Clean Data

In [None]:
results.nunique(dropna=False)

JobTitle       1003
Company        1084
Location        741
Salary          253
Summary        1736
PostDate         56
ExtractDate       2
dtype: int64

In [None]:
results = results.drop(columns=['Summary', 'ExtractDate'])

In [None]:
results.head()

Unnamed: 0,JobTitle,Company,Location,Salary,PostDate
0,Quantitative Analytics Professional - Data Sci...,Freddie Mac,"Hybrid remote in McLean, VA 22102",,Posted30+ days ago
1,Data Scientist,Thomson Reuters,"McLean, VA 22102",,Posted30+ days ago
2,Data Scientist,"Strider, Inc.","Remote in Tysons Corner, VA",,EmployerActive 17 days ago
3,new,Navy Federal Credit Union,"Hybrid remote in Vienna, VA 22180",,Posted7 days ago
4,Data Scientist (Banking/Financials),Pitney Bowes,"Remote in Washington, DC",,Posted30+ days ago


In [None]:
# Some scraped rows carry the literal string 'new' where the job title should
# be; flag those rows and keep everything else.
is_new_placeholder = results['JobTitle'].eq('new')
results = results.loc[~is_new_placeholder]

In [None]:
results.head()

Unnamed: 0,JobTitle,Company,Location,Salary,PostDate
0,Quantitative Analytics Professional - Data Sci...,Freddie Mac,"Hybrid remote in McLean, VA 22102",,Posted30+ days ago
1,Data Scientist,Thomson Reuters,"McLean, VA 22102",,Posted30+ days ago
2,Data Scientist,"Strider, Inc.","Remote in Tysons Corner, VA",,EmployerActive 17 days ago
4,Data Scientist (Banking/Financials),Pitney Bowes,"Remote in Washington, DC",,Posted30+ days ago
6,Data Scientist I,American Express Global Business Travel,"Washington, DC",,Posted20 days ago


In [None]:
results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1970 entries, 0 to 2475
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   JobTitle  1970 non-null   object
 1   Company   1970 non-null   object
 2   Location  1970 non-null   object
 3   Salary    469 non-null    object
 4   PostDate  1970 non-null   object
dtypes: object(5)
memory usage: 92.3+ KB


In [None]:
results.nunique(dropna=False)

JobTitle    1002
Company      911
Location     616
Salary       220
PostDate      47
dtype: int64

In [None]:
results.isnull().sum()

JobTitle       0
Company        0
Location       0
Salary      1501
PostDate       0
dtype: int64

## Data Analysis

In [None]:
results.describe()

Unnamed: 0,JobTitle,Company,Location,Salary,PostDate
count,1970,1970,1970,469,1970
unique,1002,911,616,219,47
top,Data Scientist,Amazon.com Services LLC,"Austin, TX",Salary,Posted30+ days ago
freq,90,49,155,14,1283


In [None]:
result_group = results.groupby('Company', as_index = False)


In [None]:
result_group.head()

Unnamed: 0,JobTitle,Company,Location,Salary,PostDate
0,Quantitative Analytics Professional - Data Sci...,Freddie Mac,"Hybrid remote in McLean, VA 22102",,Posted30+ days ago
1,Data Scientist,Thomson Reuters,"McLean, VA 22102",,Posted30+ days ago
2,Data Scientist,"Strider, Inc.","Remote in Tysons Corner, VA",,EmployerActive 17 days ago
4,Data Scientist (Banking/Financials),Pitney Bowes,"Remote in Washington, DC",,Posted30+ days ago
6,Data Scientist I,American Express Global Business Travel,"Washington, DC",,Posted20 days ago
...,...,...,...,...,...
2463,Senior Front End Developer,CENTRIC SOFTWARE INC,"Remote in Campbell, CA",Estimated $118K - $150K a year,Posted30+ days ago
2471,Senior Front End Engineer,Ihealth Labs Inc,"San Jose, CA 95131 (North Valley area)",,Posted15 days ago
2472,Full Stack Developer,"Fiserv, Inc.","Sunnyvale, CA 94085 (West Murphy area)",,Posted30+ days ago
2474,Embedded System Engineer,amaze systems Inc,"Mountain View, CA 94043",Estimated $118K - $150K a year,EmployerActive 13 days ago
