<a href="https://colab.research.google.com/github/lifepopkay/Tech-Monies/blob/Scraping/Main_dataScrappingIndeedScript.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**To do:**
1. Give Visual Indication for Users. *add verbose*
 1. Current Page being extracted. ✅
 2. Total Jobs extracted. ✅
 3. Output File location
2. Ensure the code does not break:
 1. When the requested `pages` are not available. *Stop when there are no more pages to process.* ✅
 2. When handling a lot of URLs. *Add delays before requesting more URLs.*
 3. On unexpected behaviours. *Still need to be discovered.*
3. Data Cleaning:
 1. Extract Job, Contract Type & other details from `salaryDesc`.
 2. Extract Info from `JobDescription`.
 3. Attach date of posting using locale date.





In [37]:
# Imports
# Data Handling
import numpy as np
import pandas as pd
import re

# Web Element Manipulation
from bs4 import BeautifulSoup
import requests

# HTTP request headers passed as `headers=` to the requests.get calls below.
# Only a browser-style User-agent string is set.
headers = {"User-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}

# Functions
def find_jobs(what, where, baseUrl, nPage=1, verbose=True):
    """Scrape job postings for a search and return them as a DataFrame.

    Search-result pages are fetched one at a time through ``scrap_jobs``;
    afterwards ``attach_jd`` adds the job description to every posting and
    duplicate rows are dropped.

    Args:
        what: Job title / keywords to search for.
        where: Location to search in.
        baseUrl: Site root, e.g. ``'https://www.indeed.com'`` (no trailing
            slash is added or removed here).
        nPage: Maximum number of result pages to request.
        verbose: When true, print progress messages.

    Returns:
        pandas.DataFrame with one row per unique job posting.
    """
    # Local import so the block stays self-contained.
    from urllib.parse import quote_plus

    # Collapse whitespace, then URL-encode. Plain words still become
    # 'word+word', but characters such as '+', '&' or '#' no longer
    # corrupt the query string.
    jobTitle = quote_plus(' '.join(what.split()))
    location = quote_plus(' '.join(where.split()))
    searchUrl = baseUrl + "/jobs?q=" + jobTitle + "&l=" + location + "&sort=date"

    # Accumulator for all job dictionaries
    jobs = []

    # Stopping criteria: -1 means "total not read yet"; mPage is refined by
    # scrap_jobs once the total number of jobs is known.
    totalJobs = -1
    mPage = nPage

    page = 0

    if verbose:
        print('===========| Start |============')
        print('===| Scrapping Job Postings |===')

    while page < min(nPage, mPage):
        # The first page has no offset; later pages use start = page * 10.
        if page > 0:
            targetUrl = searchUrl + "&start=" + str(page * 10)
        else:
            targetUrl = searchUrl

        if verbose:
            print("\n+++++ Extracting Data from Page", page+1, "++++++")
            print("Extracting data from URL:", targetUrl)
        jobs, totalJobs, mPage = scrap_jobs(jobs, targetUrl, verbose, totalJobs, mPage)
        page += 1

    if verbose:
        print('\n===| Attaching Job Description |===')

    # Adds a 'JobDescription' entry to every job dictionary in place
    attach_jd(jobs, baseUrl)

    # Overlapping result pages can repeat a posting; keep unique rows only
    jobsDF = pd.DataFrame(jobs)
    jobsDF.drop_duplicates(inplace=True)

    if verbose:
        print('\n===| Cleaning Up |===')
        print("Total", jobsDF.shape[0], "unique jobs found.")
        print('\n========| Done |=========')

    return jobsDF


def scrap_jobs(jobs, url, verbose, totalJobs, mPage):
    """Fetch one search-results page and append its job cards to ``jobs``.

    Args:
        jobs: List of job dictionaries collected so far; extended in place.
        url: URL of the search-results page to request.
        verbose: When true, print the connection status and running count.
        totalJobs: Total number of jobs reported by the site, or -1 when it
            has not been read yet.
        mPage: Current estimate of the number of available result pages.

    Returns:
        Tuple ``(jobs, totalJobs, mPage)``. ``totalJobs`` and ``mPage`` are
        updated only when ``totalJobs`` was -1 and the job count could be read
        from the page; otherwise they are returned unchanged.
    """
    # Static page: holds everything except the full job description
    response = requests.get(url, headers=headers)
    if verbose:
        if response.ok:
            print("Connected to", url, "Successfully.")
        else:
            print("Connection denied with response code:", response.status_code)

    soup = BeautifulSoup(response.text, 'html.parser')

    # Read the total job count once, to estimate how many pages exist.
    # A blocked or empty page may lack the counter, so every step is guarded
    # instead of raising on None.
    if totalJobs == -1:
        counter = soup.find('div', {'id': 'searchCountPages'})
        found = re.search(r'of (.*) jobs', counter.text) if counter is not None else None
        if found is not None:
            try:
                parsedTotal = int(found[1].replace(",", ""))
            except ValueError:
                parsedTotal = None
            if parsedTotal is not None:
                totalJobs = parsedTotal
                # Page estimate; assumes about 15 cards per page - TODO confirm
                mPage = (totalJobs // 15) + 1

    # Results list; absent when the page carries no job cards
    block = soup.find('ul', attrs={'class': re.compile('jobsearch-ResultsList')})
    jobCards = block.find_all('div', {'class': 'job_seen_beacon'}) if block is not None else []

    for card in jobCards:
        jobs.append(scrap_cards(card))
    if verbose:
        print("Found Total", len(jobs), "jobs so far.")
    return jobs, totalJobs, mPage
    
def scrap_cards(card):
    """Extract the fields of a single job card into a dictionary.

    Args:
        card: Parsed HTML element for one job card; it must offer a
            BeautifulSoup-style ``find`` method.

    Returns:
        dict containing any of the keys ``title``, ``id``, ``company``,
        ``location``, ``link``, ``salaryDesc`` and ``postDate``. A key is
        present only when the matching element (or attribute) exists in the
        card, so a malformed card yields a smaller dictionary instead of
        raising.
    """
    record = {}

    # Job title & ID: both live on the <a> inside the title heading
    heading = card.find('h2', {'class': re.compile('jobTitle')})
    anchor = heading.find('a') if heading is not None else None
    if anchor is not None:
        record['title'] = anchor.text
        jobId = anchor.attrs.get('id')
        if jobId is not None:
            record['id'] = jobId

    # Company name
    company = card.find('span', {'class': 'companyName'})
    if company is not None:
        record['company'] = company.text

    # Location
    location = card.find('div', {'class': 'companyLocation'})
    if location is not None:
        record['location'] = location.text

    # Link: relative href leading to the full job description
    link = card.find('a', {'class': re.compile('jcs-JobTitle')})
    href = link.get('href') if link is not None else None
    if href is not None:
        record['link'] = href

    # Salary & contract type, if available. The raw text is kept as-is;
    # splitting it into separate fields is left to later cleaning.
    salaryCard = card.find('div', {'class': re.compile('metadataContainer')})
    if salaryCard is not None:
        record['salaryDesc'] = salaryCard.text

    # Post date: only the element's own direct text, not nested children
    postDate = card.find('span', attrs={'class': 'date'})
    if postDate is not None:
        record['postDate'] = postDate.find(text=True, recursive=False)

    return record

def attach_jd(jobs, baseUrl):
    """Add a ``'JobDescription'`` entry to every job dictionary in ``jobs``.

    For each job that has a ``'link'``, the page at ``baseUrl + link`` is
    requested and the text of its ``jobsearch-jobDescriptionText`` div is
    stored. When the job has no link, the request fails, or the page lacks
    that div, the description is set to ``'Not Available'``.

    Args:
        jobs: List of job dictionaries; modified in place.
        baseUrl: Site root that the relative job links are appended to.

    Returns:
        The same ``jobs`` list, for convenience.
    """
    for job in jobs:
        description = None
        # Cards without a link cannot be followed; skip the request entirely
        link = job.get('link')
        if link is not None:
            response = requests.get(baseUrl + link, headers=headers)
            if response.ok:
                page = BeautifulSoup(response.text, 'html.parser')
                container = page.find('div', {'class': 'jobsearch-jobDescriptionText'})
                if container is not None:
                    description = container.text
        job['JobDescription'] = 'Not Available' if description is None else description
    return jobs



In [38]:
# Run the scraper. Arguments: job title (what), location/country (where),
# site base URL (baseUrl), maximum number of result pages (nPage), and
# whether to print progress messages (verbose).
data = find_jobs(what='Refuse Collector', where='USA', baseUrl='https://www.indeed.com', nPage=10, verbose=True)

===| Scrapping Job Postings |===

+++++ Extracting Data from Page 1 ++++++
Extracting data from URL: https://www.indeed.com/jobs?q=Refuse+Collector&l=USA&sort=date
Connected to https://www.indeed.com/jobs?q=Refuse+Collector&l=USA&sort=date Successfully.
Found Total 15 jobs so far.

+++++ Extracting Data from Page 2 ++++++
Extracting data from URL: https://www.indeed.com/jobs?q=Refuse+Collector&l=USA&sort=date&start=10
Connected to https://www.indeed.com/jobs?q=Refuse+Collector&l=USA&sort=date&start=10 Successfully.
Found Total 27 jobs so far.

+++++ Extracting Data from Page 3 ++++++
Extracting data from URL: https://www.indeed.com/jobs?q=Refuse+Collector&l=USA&sort=date&start=20
Connected to https://www.indeed.com/jobs?q=Refuse+Collector&l=USA&sort=date&start=20 Successfully.
Found Total 42 jobs so far.

+++++ Extracting Data from Page 4 ++++++
Extracting data from URL: https://www.indeed.com/jobs?q=Refuse+Collector&l=USA&sort=date&start=30
Connected to https://www.indeed.com/jobs?q=R

In [36]:
# Preview the last 20 rows of the scraped jobs DataFrame
data.tail(20)

Unnamed: 0,title,id,company,location,link,salaryDesc,postDate,JobDescription
37,Heavy Equipment Operator II (Refuse Collector),job_39d39e97ad4bf1a4,City of St. Louis,"St. Louis, MO 63103",/rc/clk?jk=39d39e97ad4bf1a4&fccid=39d61f3b81f2...,"$2,836 a month",Posted 30+ days ago,\n\nIncumbents in this position are responsibl...
38,Refuse Collector,job_c6ea52427d3af483,"City of Boca Raton, FL","Boca Raton, FL",/rc/clk?jk=c6ea52427d3af483&fccid=896b6028b67e...,$17.77 - $30.53 an hourFull-time,Posted 30+ days ago,\n\n\nJob Description\n\n\n\n\nDo you want to ...
39,"Refuse Collector I,II, & III",job_46df4f2d7029e66f,City of Norfolk,"Norfolk, VA 23510",/rc/clk?jk=46df4f2d7029e66f&fccid=d732bf1fe691...,Full-time,Posted 30+ days ago,JOB \nThe City of Norfolk's Department of Publ...
40,REFUSE COLLECTOR (SANITATION),job_4abaed1312d10fbe,"City of Roswell, Georgia","Roswell, GA",/rc/clk?jk=4abaed1312d10fbe&fccid=75cf66606588...,$15.00 - $16.56 an hour,Posted 30+ days ago,"\n\n\n\nCity of Roswell, Georgia JOB DESCRIPT..."
41,Refuse Collector,job_2af58d48c0cbfc1d,The City of Thomasville,"Thomasville, GA 31792",/rc/clk?jk=2af58d48c0cbfc1d&fccid=2b01af59c105...,Estimated $29.8K - $37.8K a yearFull-time +1,Posted 30+ days ago,Position reports to Solid Waste Foreman and is...
42,Refuse Collector - Part Time,job_ab8f7bf0c7405ac4,City of Hanford,"Hanford, CA 93230",/rc/clk?jk=ab8f7bf0c7405ac4&fccid=dcae9a1ce511...,Estimated $34.8K - $44K a yearPart-time,Posted 30+ days ago,JOB \nThe City of Hanford is seeking to fill t...
43,Refuse Collector,job_17147b84a27f54c3,PAE,"Williamsburg, VA",/rc/clk?jk=17147b84a27f54c3&fccid=faec53d26b1f...,Estimated $30.4K - $38.4K a yearFull-time,Posted 30+ days ago,\nSupporting the Most Exciting and Meaningful ...
44,Refuse/Recycle Collector,job_83e77b7c42d191c5,City of Olympia,"Olympia, WA 98507 (Downtown area)",/rc/clk?jk=83e77b7c42d191c5&fccid=fe8f70b5af0b...,Estimated $40.9K - $51.7K a yearFull-time,Posted 30+ days ago,JOB \nPlease fill out your application as full...
45,Refuse Collector,job_61b7ef9b7589a8fd,City of Goose Creek,"Goose Creek, SC 29445",/rc/clk?jk=61b7ef9b7589a8fd&fccid=87afe4c5d3ea...,$14.50 an hourFull-time,Posted 30+ days ago,\nJOB SUMMARY\n This position is responsible f...
46,Refuse Collector (Temp. to Perm.) - Dept. of P...,job_d90ce932558358b4,"City of Manchester, NH","Manchester, NH",/rc/clk?jk=d90ce932558358b4&fccid=bbf3ac2aa63c...,$17.82 an hourFull-time +1,Posted 30+ days ago,Job Description \nGrade 12 \nGeneral Statement...


In [17]:
# Check the job-count regex used in scrap_jobs against a sample counter string
print(re.search(r'of (.*) jobs', 'Page 2 of 39,480 jobs')[1])

39,480


In [19]:
# Integer-division check for the page estimate (totalJobs//15) + 1 in scrap_jobs
34//15

2

In [20]:
# Same integer-division check with a total just below two full pages of 15
29//15

1