In [44]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
import math
import time
from collections import OrderedDict

### Search Page

In [45]:
url = 'https://www.naukri.com/data-analyst-jobs-in-india'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
request = urllib.request.Request(url,headers={'User-Agent': user_agent})
response = urllib.request.urlopen(request)
html = response.read()

In [46]:
soup = BeautifulSoup(html, "lxml")

### Job-Links in the Page

In [61]:
all_link = []
for link in soup.select('a[href]'):
    if 'job-listings' in str(link.get('href')):
        all_link.append(link.get('href'))

#all_link[2]

### Trial with one link

In [62]:
jd_url = all_link[2]

jd_source_mid = urllib.request.Request(jd_url,headers={'User-Agent': user_agent})
jd_source = urllib.request.urlopen(jd_source_mid)
jd_soup = BeautifulSoup(jd_source,"lxml")

In [63]:
#basic details

#location
location = jd_soup.find("div",{"class":"loc"}).getText().strip()
print ('Location: ' + location + '\n\n')

# Job Description
jd_text = jd_soup.find("ul",{"itemprop":"description"}).getText().strip()
print ('Description: '+ jd_text + '\n\n')

# Experience Level
experience = jd_soup.find("span",{"itemprop":"experienceRequirements"}).getText().strip()
print ('Experince: ' + experience)

Location: Pune


Description: Department          : IPO/ Big Data Analyst   Profile     Well conversant with Big Data analysis work.  	Hand on experience of preparing reports ,data interpretation and data cleaning.  	Good Communication Skills. Good Power Point Presentation and Excel Skills  	Basic understanding of Big Data Tools and usage.


Experince: 3 - 4 yrs


In [64]:
#Labels

labels = ['Salary', 'Industry', 'Functional Area', 'Role Category', 'Design Role']
role_info = []
for content in jd_soup.find("div",{"class":"jDisc mt20"}).contents:
    if len(str(content).replace(' ','')) != 0:
        info = content.getText().split(":")[-1].strip()
        role_info.append(info)
        
role_info_dict = {label: role_info for label, role_info in zip(labels, role_info)}
role_info_dict


{'Design Role': 'Business Analyst',
 'Functional Area': 'IT Software - DBA   ,     Datawarehousing',
 'Industry': 'Construction  /    Engineering  /    Cement  /    Metals',
 'Role Category': 'System Design/Implementation/ERP/CRM',
 'Salary': 'Not Disclosed by Recruiter'}

In [65]:
# Role Level Information

labels = ['Salary', 'Industry', 'Functional Area', 'Role Category', 'Design Role']
role_info = [content.getText().split(':')[-1].strip() for content in jd_soup.find("div",{"class":"jDisc mt20"}).contents 
 if len(str(content).replace(' ',''))!=0]

role_info_dict = {label: role_info for label, role_info in zip(labels, role_info)}
print (role_info_dict)

{'Salary': 'Not Disclosed by Recruiter', 'Industry': 'Construction  /    Engineering  /    Cement  /    Metals', 'Functional Area': 'IT Software - DBA   ,     Datawarehousing', 'Role Category': 'System Design/Implementation/ERP/CRM', 'Design Role': 'Business Analyst'}


In [66]:
# Skills required

key_skills = []
for skill in jd_soup.find("div",{"class":"ksTags"}).getText().split('  ')[1:]:
    key_skills.append(skill)
print (key_skills)

['excel', 'data analysis', 'data interpretation', 'report preparation', 'big data', 'power point presentation', 'data cleaning ']


In [67]:
# Education Level
edu_info = [content.getText().split(':') for content in jd_soup.find("div",{"itemprop":"educationRequirements"}).contents 
 if len(str(content).replace(' ',''))!=0]

edu_info_dict = {label.strip(): edu_info.strip() for label, edu_info in edu_info}

# Sometimes the education information for one of the degrees can be missing
edu_labels = ['UG', 'PG', 'Doctorate']
for l in edu_labels:
    if l not in edu_info_dict.keys():
        edu_info_dict[l] = ''
print (edu_info_dict)

{'UG': 'B.Tech/B.E. - Any Specialization, Computers', 'PG': 'Any Postgraduate - Any Specialization', 'Doctorate': 'Doctorate Not Required'}


In [68]:
# Company Info
company_name = jd_soup.find("div",{"itemprop":"hiringOrganization"}).contents[1].p.getText()
print (company_name)

Hyster-Yale Lift Trucks India Pvt Ltd


In [69]:
naukri_df = pd.DataFrame()
column_names = ['Location', 'Link', 'Job Description', 'Experience','Salary', 'Industry', 'Functional Area', 'Role Category', 
                'Design Role', 'Skills','Company Name', 
                'UG','PG','Doctorate']

In [70]:
from collections import OrderedDict
df_dict = OrderedDict({'Location':location, 'Link':all_link[0],'Job Description':jd_text,'Experience':experience,
                       'Skills':key_skills,'Company Name':company_name})
df_dict.update(role_info_dict)
df_dict.update(edu_info_dict)
df_dict

OrderedDict([('Location', 'Pune'),
             ('Link',
              'https://www.naukri.com/job-listings-Business-Analyst-Big-data-Analyst-Mindgensolutions-Bengaluru-6-to-10-years-030718901257?src=jobsearchDesk&sid=15306470299349&xp=1&px=1'),
             ('Job Description',
              'Department          : IPO/ Big Data Analyst   Profile     Well conversant with Big Data analysis work.  \tHand on experience of preparing reports ,data interpretation and data cleaning.  \tGood Communication Skills. Good Power Point Presentation and Excel Skills  \tBasic understanding of Big Data Tools and usage.'),
             ('Experience', '3 - 4 yrs'),
             ('Skills',
              ['excel',
               'data analysis',
               'data interpretation',
               'report preparation',
               'big data',
               'power point presentation',
               'data cleaning ']),
             ('Company Name', 'Hyster-Yale Lift Trucks India Pvt Ltd'),
             (

In [71]:
naukri_df = naukri_df.append(df_dict,ignore_index=True)
naukri_df


Unnamed: 0,Company Name,Design Role,Doctorate,Experience,Functional Area,Industry,Job Description,Link,Location,PG,Role Category,Salary,Skills,UG
0,Hyster-Yale Lift Trucks India Pvt Ltd,Business Analyst,Doctorate Not Required,3 - 4 yrs,"IT Software - DBA , Datawarehousing",Construction / Engineering / Cement /...,Department : IPO/ Big Data Analyst ...,https://www.naukri.com/job-listings-Business-A...,Pune,Any Postgraduate - Any Specialization,System Design/Implementation/ERP/CRM,Not Disclosed by Recruiter,"[excel, data analysis, data interpretation, re...","B.Tech/B.E. - Any Specialization, Computers"


In [72]:
# Reordering the columns to a preferred order as specified

naukri_df = naukri_df.reindex(columns=column_names)
naukri_df

Unnamed: 0,Location,Link,Job Description,Experience,Salary,Industry,Functional Area,Role Category,Design Role,Skills,Company Name,UG,PG,Doctorate
0,Pune,https://www.naukri.com/job-listings-Business-A...,Department : IPO/ Big Data Analyst ...,3 - 4 yrs,Not Disclosed by Recruiter,Construction / Engineering / Cement /...,"IT Software - DBA , Datawarehousing",System Design/Implementation/ERP/CRM,Business Analyst,"[excel, data analysis, data interpretation, re...",Hyster-Yale Lift Trucks India Pvt Ltd,"B.Tech/B.E. - Any Specialization, Computers",Any Postgraduate - Any Specialization,Doctorate Not Required


In [73]:
# Together into one function
labels = ['Salary', 'Industry', 'Functional Area', 'Role Category', 'Design Role']
edu_labels = ['UG', 'PG', 'Doctorate']
naukri_df = pd.DataFrame()
           
for page_url in all_link:
    #page_url = base_url+str(page)
    jd_source = urllib.request.Request(page_url, headers = {'User-Agent' : user_agent})
    jd_html = urllib.request.urlopen(jd_source).read()
    jd_soup = BeautifulSoup(jd_html,"lxml")
    #all_links = [link.get('href') for link in soup.findAll('a') if 'job-listings' in  str(link.get('href'))]
    #for url in all_links:

    try:
        jd_text = jd_soup.find("ul",{"itemprop":"description"}).getText().strip()
        location = jd_soup.find("div",{"class":"loc"}).getText().strip()
        experience = jd_soup.find("span",{"itemprop":"experienceRequirements"}).getText().strip()
            
        role_info = [content.getText().split(':')[-1].strip() for content in jd_soup.find("div",{"class":"jDisc mt20"}).contents if len(str(content).replace(' ',''))!=0]
        role_info_dict = {label: role_info for label, role_info in zip(labels, role_info)}
            
        key_skills = '|'.join(jd_soup.find("div",{"class":"ksTags"}).getText().split('  '))[1:]

        edu_info = [content.getText().split(':') for content in jd_soup.find("div",{"itemprop":"educationRequirements"}).contents if len(str(content).replace(' ',''))!=0]
        edu_info_dict = {label.strip(): edu_info.strip() for label, edu_info in edu_info}
        for l in edu_labels:
            if l not in edu_info_dict.keys():
                edu_info_dict[l] = ''

        company_name = jd_soup.find("div",{"itemprop":"hiringOrganization"}).contents[1].p.getText().strip()
        
    except AttributeError:
        continue
    df_dict = OrderedDict({'Location':location, 'Link':url,'Job Description':jd_text,'Experience':experience,'Skills':key_skills,'Company Name':company_name})
    df_dict.update(role_info_dict)
    df_dict.update(edu_info_dict)
    naukri_df = naukri_df.append(df_dict,ignore_index=True)
    time.sleep(1)

In [74]:
naukri_df

Unnamed: 0,Company Name,Design Role,Doctorate,Experience,Functional Area,Industry,Job Description,Link,Location,PG,Role Category,Salary,Skills,UG
0,Mindgensolutions,Business Analyst,Doctorate Not Required,6 - 10 yrs,Analytics & Business Intelligence,IT-Software / Software Services,Job Description -Business Analyst -Big data...,https://www.naukri.com/data-analyst-jobs-in-india,Bengaluru,Any Postgraduate - Any Specialization,Analytics & BI,Not Disclosed by Recruiter,Business Analysis|Statistical Modeling|Data An...,"Any Graduate - Any Specialization, B.Sc - Stat..."
1,Hyster-Yale Lift Trucks India Pvt Ltd,Business Analyst,Doctorate Not Required,3 - 4 yrs,"IT Software - DBA , Datawarehousing",Construction / Engineering / Cement /...,Department : IPO/ Big Data Analyst ...,https://www.naukri.com/data-analyst-jobs-in-india,Pune,Any Postgraduate - Any Specialization,System Design/Implementation/ERP/CRM,Not Disclosed by Recruiter,excel|data analysis|data interpretation|report...,"B.Tech/B.E. - Any Specialization, Computers"
2,Cactus Communications Pvt. Ltd.,Data Analyst,Doctorate Not Required,2 - 5 yrs,Analytics & Business Intelligence,KPO / Research / Analytics,Want to work with India's no.1 mid-sized compa...,https://www.naukri.com/data-analyst-jobs-in-india,Mumbai(Chakala),Any Postgraduate,Analytics & BI,Not Disclosed by Recruiter,data science|big data|statistical modeling|sta...,Any Graduate
3,The Akshaya Patra Foundation,Quality Assurance/Quality Control Manager,Doctorate Not Required,4 - 6 yrs,"Production , Manufacturing , Maint...",FMCG / Foods / Beverage,Qualification & Experience: Experience in Food...,https://www.naukri.com/data-analyst-jobs-in-india,Bengaluru,"MS/M.Sc(Science) - Any Specialization, Food Te...",Production/Manufacturing/Maintenance,Not Disclosed by Recruiter,food technology|food quality control|customer ...,"B.Sc - Any Specialization, Food Technology"
4,INFOR (INDIA) PRIVATE LIMITED,Data Analyst,,5 - 6 yrs,Analytics & Business Intelligence,IT-Software / Software Services,"Infor Cloud Management is a strategic, global ...",https://www.naukri.com/data-analyst-jobs-in-india,Hyderabad,"CA, MBA/PGDM - Finance",Analytics & BI,Not Disclosed by Recruiter,Business Finance|Financial Management|Report G...,Any Graduate - Any Specialization
5,Skan Consultancy Pvt. Ltd.,Data Analyst,,2 - 5 yrs,Analytics & Business Intelligence,Telecom/ISP,Develop Data Analysis Generate Business Report...,https://www.naukri.com/data-analyst-jobs-in-india,Mumbai,,Analytics & BI,Not Disclosed by Recruiter,data analysis|reporting tools|analysis|data mi...,Any Graduate - Any Specialization
6,2Coms Consulting Pvt Ltd.,Data Analyst,Doctorate Not Required,1 - 3 yrs,Analytics & Business Intelligence,IT-Software / Software Services,"Dear Job Seekers, Greetings from 2 Com'...",https://www.naukri.com/data-analyst-jobs-in-india,Bengaluru(Singasandra),,Analytics & BI,"INR 4,00,000 - 4,50,000 P.A.",Data Analysis|SQL|Advanced Excel|Macros|Query|...,"Any Graduate - Any Specialization, B.Tech/B.E...."
7,Risk Resources,Analytics Manager,Doctorate Not Required,2 - 6 yrs,Analytics & Business Intelligence,KPO / Research / Analytics,Senior Analyst/Manager - Data Analytics - Fina...,https://www.naukri.com/data-analyst-jobs-in-india,Hyderabad,MBA/PGDM - Finance,Analytics & BI,Not Disclosed by Recruiter,financial research|finance|fundamental researc...,Any Graduate - Any Specialization
8,Iflate Softsystems India Private Limited,Fresher,,1 - 5 yrs,"ITES , BPO , KPO , LPO , ...",BPO / Call Centre / ITES,Must have excellent oral & written skills\r\nE...,https://www.naukri.com/data-analyst-jobs-in-india,Hyderabad,,Other,"INR 1,00,000 - 5,00,000 P.A. Salary + Medic...",call center|voice process|international bpo|bp...,Any Graduate - Any Specialization
9,FIL India Business and Research Services Priva...,Data Analyst,"Ph.D - Computers, Economics, Finance, Maths, S...",3 - 6 yrs,Analytics & Business Intelligence,Banking / Financial Services / Broking,Role Description: This role involves working ...,https://www.naukri.com/data-analyst-jobs-in-india,Bengaluru,"M.A - Maths, Statistics, Economics, MS/M.Sc(Sc...",Analytics & BI,Not Disclosed by Recruiter,Text Mining|C|R|Data Mining|Data Science|Machi...,Any Graduate - Any Specialization
