### This is a simple job aggregator program that uses web scraping. It collects job details from three job portals (shine.com, naukri.com, timesjobs.com). The user only has to enter three details: job profile (mandatory), experience and location (these two are optional). The program aggregates the details from the first page of each portal, sorted by relevance.

In [1]:
# Importing required libraries
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from urllib.parse import urljoin
from requests_html import AsyncHTMLSession
import pyppdf.patch_pyppeteer

In [2]:
# Code for getting profile name, experience and location
# Keep prompting until the answers are usable: the profile is mandatory and the
# experience, when given, must be a non-negative whole number of years.
while True:
    profile = input('Please enter the profile name or keyword for job search: ').strip()
    experience = input('Please enter your experience level in years: ').strip()
    location = input('Please enter a location for job search: ').strip()

    # Profile name can not be empty
    if not profile:
        print('\nProfile name cannot be empty!!!\nTry searching again!\n')
        continue

    # Experience is optional, but when it is supplied it has to be a number
    # because it is later converted with int() and placed in the search URLs.
    if experience:
        try:
            years = int(experience)
        except ValueError:
            print('\nEnter experience in numbers please!!!\nTry searching again!\n')
            continue
        if years < 0:
            print('\nEnter experience in numbers please!!!\nTry searching again!\n')
            continue
        # Store the canonical form (e.g. "+3" or "03" becomes "3").
        experience = str(years)

    # Every check passed, so stop asking.
    break
    

Please enter the profile name or keyword for job search: data scientist
Please enter your experience level in years: 3
Please enter a location for job search: mumbai


In [3]:
# Creating four lists for the purpose of data collection.
# Every scraper appends to all four, so index i across the lists describes one job.
profile_name = []
company_name = []
post_date = []
link = []

# header is sent with every HTTP request made by the scrapers. The key must be
# spelled exactly 'User-Agent'; a misspelled key is transmitted as an unrelated
# header and the HTTP library's default User-Agent is used instead.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'}

In [4]:
# Below is the function for aggregating the data from shine.com job portal.
def shine_data(prof,exp,loc):
    """Scrape the first shine.com result page and append it to the shared lists.

    Parameters
    ----------
    prof : str
        Job profile / keywords; words are joined with '-' and suffixed with 'jobs'.
    exp : str
        Experience in years as a numeric string, or '' for no filter.
    loc : str
        Location to search in, or '' for no location.

    Returns
    -------
    str
        "Shine data NA" when the HTTP status is not 200, otherwise "done".

    Side effects
    ------------
    Appends one entry per job to the module-level lists ``profile_name``,
    ``company_name``, ``post_date`` and ``link``.

    Raises
    ------
    ValueError
        If ``exp`` is non-empty and not an integer string.
    """
    # str.split() with no argument already ignores leading/trailing whitespace.
    prof = '-'.join(prof.split() + ['jobs'])

    exp = exp.strip()
    loc = loc.strip()

    # The value sent as 'minexp' is not the raw number of years: 0 is sent as
    # "2" and any other value as years + 3.
    # NOTE(review): presumably shine.com uses its own experience codes - confirm.
    if exp != '':
        exp = "2" if int(exp) == 0 else str(int(exp) + 3)

    # Creating link with our search keywords.
    # The decision is based on this function's own `loc` argument, not on a
    # notebook-level variable, so the function works for any caller.
    if loc == "":
        shine_url = 'https://www.shine.com/job-search/'+prof+'?loc='+loc+'&minexp='+exp
    else:
        # NOTE(review): the 'i' before '?loc=' looks like a typo but is kept
        # as-is because the expected URL shape cannot be confirmed from here.
        shine_url = 'https://www.shine.com/job-search/'+prof+'-in-'+loc+'i?loc='+loc+'&minexp='+exp

    # Creating requests for scraping
    res = requests.get(shine_url, headers=header)
    if res.status_code != 200:
        return "Shine data NA"

    soup = bs(res.content, 'lxml')
    jobs_shine = soup.find_all('a', {'class': 'cls_searchresult_a searchresult_link'})

    for job in jobs_shine:
        # Extract every field before touching the shared lists: if a card is
        # malformed the exception is raised before anything is appended, so
        # the four lists can never end up with different lengths.
        title = job.find('h3').text.strip().replace(u'\xa0', ' ')
        company = job.find('li', {'class': 'snp_cnm cls_cmpname cls_jobcompany'}).text.strip()
        posted = job.find('li', {'class': 'time share_links jobDate'}).text.strip()
        # hrefs may be relative, so resolve them against the search URL.
        url = urljoin(shine_url, job['href'])

        profile_name.append(title)
        company_name.append(company)
        post_date.append(posted)
        link.append(url)
    return "done"

In [5]:
# Creating function for scraping from naukri.com
# As naukri uses javascript so we used AsyncHTMLsession to extract the data and hence the function had to be
# asynchronous function.
async def naukri_data(prof,exp,loc):
    """Scrape the first naukri.com result page and append it to the shared lists.

    Parameters
    ----------
    prof : str
        Job profile / keywords; words are joined with '-' and suffixed with 'jobs'.
    exp : str
        Experience in years as a string, or '' for no filter.
    loc : str
        Location to search in, or '' for no location.

    Returns
    -------
    str
        "Naukri data NA" when the HTTP status is not 200, otherwise "done".

    Side effects
    ------------
    Appends one entry per job to the module-level lists ``profile_name``,
    ``company_name``, ``post_date`` and ``link``.
    """
    # str.split() with no argument already ignores leading/trailing whitespace.
    prof = '-'.join(prof.split() + ['jobs'])

    exp = exp.strip()
    loc = loc.strip()

    # Creating link with our search keywords.
    # Branch on this function's own `loc` argument rather than a notebook global.
    if loc == "":
        naukri_url = 'https://www.naukri.com/'+prof+'?k='+prof+'&experience='+exp
    else:
        naukri_url = 'https://www.naukri.com/'+prof+'-in-'+loc+'?k='+prof+'&l='+loc+'&experience='+exp

    # Creating html_requests session for scraping
    session = AsyncHTMLSession()
    try:
        res = await session.get(naukri_url, headers=header)

        if res.status_code != 200:
            return "Naukri data NA"

        # The listing is built by javascript, so render the page before querying it.
        await res.html.arender()

        # Run each selector once instead of once per field per iteration.
        job_headers = res.html.find('.jobTupleHeader')
        job_footers = res.html.find('.jobTupleFooter.mt-20')

        for i, job_header in enumerate(job_headers):
            # Extract every field before touching the shared lists so that a
            # malformed entry cannot leave the four lists with unequal lengths.
            href_elements = job_header.find('[href]')
            title = job_header.find('a')[0].text
            company = href_elements[1].attrs['title']
            posted = job_footers[i].find('span.fleft.fw500')[-1].text
            url = href_elements[0].attrs['href']

            profile_name.append(title)
            company_name.append(company)
            post_date.append(posted)
            link.append(url)
        return "done"
    finally:
        # Close the session on every exit path (non-200 response and errors
        # included), not only after a successful scrape.
        await session.close()

In [6]:
# Scraping from timesjobs.com web portal.
def timesjobs_data(prof,exp,loc):
    """Scrape the first timesjobs.com result page and append it to the shared lists.

    Parameters
    ----------
    prof : str
        Job profile / keywords; words are joined with '+' for the query string.
    exp : str
        Experience in years as a string, or '' for no filter.
    loc : str
        Location to search in, or '' for no location.

    Returns
    -------
    str
        "Timesjobs data NA" when the HTTP status is not 200, otherwise "done".

    Side effects
    ------------
    Appends one entry per job to the module-level lists ``profile_name``,
    ``company_name``, ``post_date`` and ``link``.
    """
    # str.split() with no argument already ignores leading/trailing whitespace.
    prof = '+'.join(prof.split())

    exp = exp.strip()
    loc = loc.strip()

    # An empty location simply yields 'txtLocation=' in the query string, so a
    # single expression covers both the "with location" and "without" cases
    # and depends only on this function's own arguments.
    tj_url = ('https://www.timesjobs.com/candidate/job-search.html'
              '?searchType=personalizedSearch&from=submit&txtKeywords='+prof
              +'&txtLocation='+loc+'&cboWorkExp1='+exp)

    res = requests.get(tj_url, headers=header)
    if res.status_code != 200:
        return "Timesjobs data NA"

    soup = bs(res.content, 'lxml')

    # Query the document once; repeating find_all for every field of every
    # job re-scans the whole page each time.
    job_cards = soup.find_all('li', {'class': 'clearfix job-bx wht-shd-bx'})
    posted_spans = soup.find_all('span', {'class': 'sim-posted'})

    for i, card in enumerate(job_cards):
        # Extract every field before touching the shared lists so that a
        # malformed card cannot leave the four lists with unequal lengths.
        first_anchor = card.find_all('a')[0]
        title = first_anchor.text.strip()
        company = card.find_all('h3')[0].text.strip()
        posted = posted_spans[i].find_all('span')[-1].text.strip()
        url = first_anchor['href']

        profile_name.append(title)
        company_name.append(company)
        post_date.append(posted)
        link.append(url)
    return "done"

In [7]:
# Executing functions
# Run the shine.com scraper with the values entered above; it appends its
# results to the shared lists and returns a status string.
shine_data(profile,experience,location)

'done'

In [8]:
# naukri_data is declared with `async def`, so it has to be awaited.
await naukri_data(profile,experience,location)

'done'

In [9]:
# Run the timesjobs.com scraper; results are appended to the same shared lists.
timesjobs_data(profile,experience,location)

'done'

In [10]:
# Display settings: show up to 100 rows and never truncate the text inside a
# cell, so long values such as URLs stay fully visible.
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = None

# Combine the four parallel lists into one table, one row per job.
job_listing = pd.DataFrame(
    {
        'Profile': profile_name,
        'Company': company_name,
        'Recency': post_date,
        'URL': link,
    }
)

In [11]:
# Leaving the DataFrame as the last expression of the cell displays it.
job_listing

Unnamed: 0,Profile,Company,Recency,URL
0,Data Scientist,NEEMTREE INTERNET PVT LTD,"Jun 17, 2020",https://www.shine.com/jobs/data-scientist/neemtree-internet-pvt-ltd/10402056/
1,Data Scientist,Denave India Pvt Ltd.,"Jun 17, 2020",https://www.shine.com/jobs/data-scientist/denave-india-pvt-ltd/10566781/
2,Data Scientist,Bprise,"Jun 13, 2020",https://www.shine.com/jobs/data-scientist/bprise/10794585/
3,Data Scientist,Angel Broking Limited,"Jun 12, 2020",https://www.shine.com/jobs/data-scientist/angel-broking-limited/10058390/
4,Data Scientist,BigTree Entertainment,"Jun 15, 2020",https://www.shine.com/jobs/data-scientist/bigtree-entertainment/9874466/
5,Data Scientist,Analytos,"Jun 11, 2020",https://www.shine.com/jobs/data-scientist/analytos/10572504/
6,AI Developer/ Data Scientist,OPEXAI BUSINESS CONSULTING LLP,"Jun 11, 2020",https://www.shine.com/jobs/ai-developer-data-scientist/opexai-business-consulting-llp/10658144/
7,Data Scientist,BigTree Entertainment,"Jun 13, 2020",https://www.shine.com/jobs/data-scientist/bigtree-entertainment/9874456/
8,Data Scientist,ALLERIN TECHNOLOGIES Pvt Ltd.,"Jun 04, 2020",https://www.shine.com/jobs/data-scientist/allerin-technologies-pvt-ltd/10186210/
9,Data Scientist,Mastek Ltd,"May 18, 2020",https://www.shine.com/jobs/data-scientist/mastek-ltd/10817872/
