<a href="https://colab.research.google.com/github/lifepopkay/Tech-Monies/blob/Scraping/Main_dataScrappingIndeedScript.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**To do:**
1. Give users a visual indication of progress. *Add a `verbose` option.*
 1. Current Page being extracted.
 2. Total Jobs extracted.
 3. Output File location
2. Ensure Non-breaking of code:
 1. When the requested `pages` are not available. *Stop when there are no more pages to process.*
 2. When handling a lot of URLs. *Add delays between successive requests.*
 3. Unexpected behaviours. *Need to be discovered.*
3. Data Cleaning:
 1. Extract job, contract type & other details from `salaryDesc`.
 2. Extract info from `JobDescription`.
 3. Attach date of posting using locale date.





In [None]:
# Imports
# Data Handling
import numpy as np
import pandas as pd
import re

# Web Element Manipulation
from bs4 import BeautifulSoup
import requests

# Request headers passed to every requests.get call in this notebook.
# The User-agent value is a desktop Chrome identification string.
headers = {
  "User-agent": (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
  ),
}

# Functions
def find_jobs(what, where, baseUrl, nPage=1, verbose=True):
  """Scrape job search results page by page and return them as a DataFrame.

  Args:
    what: Job title / keywords to search for (free text).
    where: Location to search in (free text).
    baseUrl: Site root without a trailing slash, e.g. 'https://www.indeed.com'.
    nPage: Maximum number of result pages to request.
    verbose: When True, print the page number and URL being processed.

  Returns:
    pandas.DataFrame built from the list of job dictionaries collected by
    scrap_jobs and then passed through attach_jd.
  """
  # Local import so this function does not depend on edits to the import cell.
  from urllib.parse import urlencode

  # Collapse runs of whitespace, then let urlencode percent-encode the values
  # (spaces become '+'), so characters such as '&', '+' or ',' in the search
  # terms cannot corrupt the query string.
  query = {
    'q': ' '.join(what.split()),
    'l': ' '.join(where.split()),
    'sort': 'date',
  }

  # Initialize list for all jobs data
  jobs = []

  for i in range(nPage):
    params = dict(query)
    if i > 0:
      # The result offset advances by 10 for each subsequent page.
      params['start'] = i * 10
    targetUrl = baseUrl + "/jobs?" + urlencode(params)

    if verbose:
      print("\n+++++ Extracting Data from Page", i+1, " ++++++")
      print("Extracting data from URL:", targetUrl)

    collectedBefore = len(jobs)
    scrap_jobs(jobs, targetUrl, verbose)

    # A page that adds nothing means there is nothing more to process,
    # so stop instead of requesting the remaining pages.
    if len(jobs) == collectedBefore:
      if verbose:
        print("No job cards found on page", i+1, "- stopping.")
      break

  # attach Job Description
  attach_jd(jobs, baseUrl)

  return pd.DataFrame(jobs)


def scrap_jobs(jobs, url, verbose):
  """Fetch one search-results page and append its job cards to `jobs`.

  Args:
    jobs: List that is extended in place with one dictionary per job card
      (as produced by scrap_cards).
    url: Full URL of the search-results page to download.
    verbose: When True, print connection status and the running total.

  Returns:
    None. `jobs` is left unchanged when the request fails or the page
    contains no results list.
  """
  # get the static page to scrap everything apart from job description
  response = requests.get(url, headers=headers)
  if not response.ok:
    # Do not try to parse an error page; it will not contain job cards.
    if verbose:
      print("Connection denied with response code:", response.status_code)
    return
  if verbose:
    print("Connected to", url, "Successfully.")

  # Create soup
  soup = BeautifulSoup(response.text, 'html.parser')

  # Search Area
  block = soup.find('ul', attrs={'class': re.compile('jobsearch-ResultsList')})
  if block is None:
    # soup.find returns None when the list element is absent; bail out
    # instead of failing on block.find_all below.
    if verbose:
      print("No results list found at", url)
    return

  # iterate through job cards
  for card in block.find_all('div', {'class': 'job_seen_beacon'}):
    jobs.append(scrap_cards(card))
  if verbose:
    print("Found Total", len(jobs), "so far.")
    
def scrap_cards(card):
  """Extract the fields of one job card into a dictionary.

  Args:
    card: Parsed element for a single job card; only its `find` method is
      used here (a BeautifulSoup Tag when called from scrap_jobs).

  Returns:
    dict containing whichever of these keys could be found on the card:
    'title', 'id', 'company', 'location', 'link', 'salaryDesc', 'postDate'.
    Keys whose element or attribute is missing are omitted.
  """
  tempDict = dict()

  # Job Title & ID: both are read from the anchor inside the <h2> title.
  title = card.find('h2', {'class': re.compile('jobTitle')})
  anchor = title.find('a') if title is not None else None
  if anchor is not None:
    tempDict['title'] = anchor.text
    jobId = anchor.get('id')
    if jobId is not None:
      tempDict['id'] = jobId

  # Company Name:
  company = card.find('span', {'class': 'companyName'})
  if company is not None:
    tempDict['company'] = company.text

  # Location:
  location = card.find('div', {'class': 'companyLocation'})
  if location is not None:
    tempDict['location'] = location.text

  # Links: these Href links will take us to full job description
  link = card.find('a', {'class': re.compile('jcs-JobTitle')})
  if link is not None and link.get('href') is not None:
    tempDict['link'] = link['href']

  # Salary & Contract Type, if available:
  # picking all text, cleaning will be done later
  salaryCard = card.find('div', {'class': re.compile('metadataContainer')})
  if salaryCard is not None:
    tempDict['salaryDesc'] = salaryCard.text

  # Job Post Date: take only the element's own direct text, not the text
  # of nested child elements. `string=` is the current name of the older
  # `text=` argument.
  postDate = card.find('span', attrs={'class': 'date'})
  if postDate is not None:
    tempDict['postDate'] = postDate.find(string=True, recursive=False)

  return tempDict

def attach_jd(jobs, baseUrl):
  """Add a 'JobDescription' entry to every job dictionary in `jobs`.

  For each job that has a 'link', the page at baseUrl + link is downloaded
  and the text of its 'jobsearch-jobDescriptionText' div is stored. When the
  job has no link, the request is not OK, or the div is absent, the entry
  is set to 'Not Available'.

  Args:
    jobs: List of job dictionaries; modified in place.
    baseUrl: Site root that the relative 'link' values are appended to.

  Returns:
    The same `jobs` list that was passed in.
  """
  for job in jobs:
    description = 'Not Available'
    # Cards scraped without a link cannot be followed; skip the request.
    link = job.get('link')
    if link:
      response = requests.get(baseUrl + link, headers=headers)
      if response.ok:
        # Create soup
        soup_ = BeautifulSoup(response.text, 'html.parser')
        descNode = soup_.find('div', {'class': 'jobsearch-jobDescriptionText'})
        # find returns None when the div is missing from the page.
        if descNode is not None:
          description = descNode.text
    job['JobDescription'] = description
  return jobs



In [None]:
# Enter job title, location/country, primary URL, total pages for extraction
# & whether progress should be printed (verbose).
data = find_jobs(
  what='Data Analyst',
  where='USA',
  baseUrl='https://www.indeed.com',
  nPage=5,
  verbose=True,
)


+++++ Extracting Data from Page 1  ++++++
Extracting data from URL: https://www.indeed.com/jobs?q=Data+Analyst&l=USA&sort=date
Connected to https://www.indeed.com/jobs?q=Data+Analyst&l=USA&sort=date Successfully.
Found Total 15 so far.

+++++ Extracting Data from Page 2  ++++++
Extracting data from URL: https://www.indeed.com/jobs?q=Data+Analyst&l=USA&sort=date&start=10
Connected to https://www.indeed.com/jobs?q=Data+Analyst&l=USA&sort=date&start=10 Successfully.
Found Total 30 so far.

+++++ Extracting Data from Page 3  ++++++
Extracting data from URL: https://www.indeed.com/jobs?q=Data+Analyst&l=USA&sort=date&start=20
Connected to https://www.indeed.com/jobs?q=Data+Analyst&l=USA&sort=date&start=20 Successfully.
Found Total 45 so far.

+++++ Extracting Data from Page 4  ++++++
Extracting data from URL: https://www.indeed.com/jobs?q=Data+Analyst&l=USA&sort=date&start=30
Connected to https://www.indeed.com/jobs?q=Data+Analyst&l=USA&sort=date&start=30 Successfully.
Found Total 60 so far

In [None]:
# Preview the last 10 rows of the scraped results.
data.tail(10)

Unnamed: 0,title,id,company,location,link,postDate,JobDescription,salaryDesc
65,Data Report Analyst,job_cc9fb220f9698069,Sixteenth Street Community Health Centers,"Milwaukee, WI",/company/Sixteenth-Street-Community-Health-Cen...,Today,The Data Report Analyst is responsible for opt...,Estimated $58.2K - $73.6K a yearFull-time
66,Web Marketing Data Analyst,job_e0385748c3f495a4,The Loot Company,Remote,/rc/clk?jk=e0385748c3f495a4&fccid=7199ce5cb1bf...,Today,\nDo you love entertainment and pop culture? W...,Estimated $86K - $109K a yearFull-time
67,Principle Data Analyst,job_515d607cce3495b7,Fidelity Investments,"Temporarily Remote in Boston, MA 02210+6 locat...",/rc/clk?jk=515d607cce3495b7&fccid=deb234f9dd3e...,Today,\nJob Description:\n\n The Role\n The Data Ana...,Estimated $92.4K - $117K a yearFull-time
68,Virtual Business Intelligence Analyst,job_27f2e68a6008eff3,Texas Service Center,"Remote in San Antonio, TX 78249",/rc/clk?jk=27f2e68a6008eff3&fccid=03b9512b04ad...,Today,\nDescription SHIFT: Work From Home\n SCHEDUL...,Full-time
69,Remote Software Business Analyst II,job_bd7e02cbf0600296,Input 1,+2 locationsRemote,/rc/clk?jk=bd7e02cbf0600296&fccid=37cf933b495b...,Today,\nCompensation: $70K - $95K (Depending on Expe...,"$70,000 - $95,000 a yearFull-time"
70,Business Analyst,job_63fcca6d208f1198,"Steel Dynamics, Inc.","Fort Wayne, IN 46804",/rc/clk?jk=63fcca6d208f1198&fccid=0d26fb4585b7...,Today,\n Overview: \n \n We are seeking a knowledg...,Estimated $64.6K - $81.8K a yearFull-time
71,Digital Training and Data Analyst (Work from H...,job_a230e94130747d10,Geisinger,"Remote in Danville, PA+1 location",/rc/clk?jk=a230e94130747d10&fccid=6eab821d237c...,Today,\nJob Summary The ensures the quality of porta...,Full-time
72,HIIM Data Integrity Quality Analyst-Auditor,job_2eef915e29dc74c2,Community Health Systems,"Franklin, TN",/rc/clk?jk=2eef915e29dc74c2&fccid=e210693fb04e...,Today,\nSummary: The HIIM Data Integrity Quality Ana...,Estimated $37.1K - $46.9K a yearWeekend availa...
73,Business Analyst,job_2e46d733bc7fd446,OmniSource,"Fort Wayne, IN 46804",/rc/clk?jk=2e46d733bc7fd446&fccid=0fcf74ac79ef...,Today,\n Overview: \n \n We are seeking a knowledg...,Estimated $64.6K - $81.8K a yearFull-time
74,Sr Data Governance Analyst,job_544d895c1a87814f,Citizens,"Westwood, MA 02090+3 locations",/rc/clk?jk=544d895c1a87814f&fccid=e4c6a8c0c5c0...,Today,\nDescription\n The Senior Data Governance Ana...,


In [None]:
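# Notes on CSS classes seen in the job-card markup:
# class="metadata estimated-salary-container"  <-- estimated salary
# class="metadata salary-snippet-container"    <-- presumably a non-estimated salary (TODO confirm)
# class="jcs-JobTitle css-jspxzf eu4oa1w0"     <-- job-title link (matched via 'jcs-JobTitle')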