In [1]:
import re
from pathlib import Path
import pandas as pd
import bs4
from bs4 import BeautifulSoup

In [2]:
# Saves all the downloaded html file names to the list.
# The working directory is bound to `base_dir` rather than `dir` so the
# built-in dir() is not shadowed for the rest of the notebook session.
# glob() yields paths in arbitrary, filesystem-dependent order, so the result
# is sorted to make the processing order reproducible between runs.
base_dir = Path.cwd()
file_names = sorted(base_dir.glob("Indeed files/*.htm"))
len(file_names)

3195

In [None]:
# Removes error files from the list of file names.
# The surviving paths are collected in a new list instead of calling
# file_names.remove() inside the loop: removing from a list while iterating
# over it makes the iterator skip the element that follows each removed entry,
# so some files were never inspected.
valid_file_names = []
for each in file_names:
    with open(each, "r", encoding='utf-8') as file:
        content = BeautifulSoup(file.read(), 'html.parser')
    text = content.get_text().strip()
    # re.match anchors at the start of the string and "." does not cross
    # newlines, so only the first line of the stripped page text is tested.
    match1 = re.match(r".*Not found.*", text)
    match2 = re.match(r".*Error.*", text)
    if match1 is not None or match2 is not None:
        print(each)
    else:
        valid_file_names.append(each)
file_names = valid_file_names

In [4]:
# Number of file paths currently held in file_names
len(file_names)

3116

In [5]:
def get_title(content):
    '''Extracts the job title from a parsed posting.

    Three page layouts are tried in order; the first one present wins:
      1. <h3> carrying the jobsearch-JobInfoHeader-title class string
      2. <h1 class="subhead">
      3. the <h1> nested inside <div class="job-header">

    content: parsed document exposing find(name, class_=...) (a BeautifulSoup object).
    Returns the title text, or "Not Found" when no layout matches -- including
    the case where the job-header div exists but contains no <h1>.
    '''
    direct_layouts = (
        ("h3", "icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"),
        ("h1", "subhead"),
    )
    for tag_name, css_class in direct_layouts:
        title = content.find(tag_name, class_=css_class)
        if title is not None:
            return title.text

    header = content.find("div", class_="job-header")
    if header is not None:
        heading = header.find("h1")
        # Guard against a header div without an <h1>; previously this raised
        # AttributeError on None.text and aborted the whole extraction loop.
        if heading is not None:
            return heading.text
    return "Not Found"

In [6]:
def get_job_info(content):
    '''Extracts the company name and location from the html based on 4 different tags.

    The layouts are tried in order and the first one present wins:
      1. the InlineCompanyRating div        -> its text, unchanged
      2. the CompanyInfoWithReview div      -> "company,reviews-location"
      3. <div class="location">             -> " , -location"
      4. the CompanyInfoWithoutHeaderImage div -> "company, -location"

    content: parsed document exposing find(name, class_=...) (a BeautifulSoup object).
    Returns the combined string, or "Not found" when no layout matches. When a
    matched container has fewer descendants than the layout expects, the
    missing pieces become empty strings instead of raising IndexError.
    '''
    def _text_at(nodes, index):
        # Text of nodes[index], or "" when that position does not exist
        try:
            return nodes[index].text
        except IndexError:
            return ""

    loc = content.find("div", class_="jobsearch-InlineCompanyRating icl-u-xs-mt--xs jobsearch-DesktopStickyContainer-companyrating")
    if loc is not None:
        return loc.text

    loc = content.find("div", class_="jobsearch-CompanyInfoWithoutHeaderImage jobsearch-CompanyInfoWithReview")
    if loc is not None:
        children = loc.findChildren()
        company = _text_at(children, 1)
        reviews = _text_at(children, 2)
        # The location is taken from the last descendant of the container
        location = _text_at(children, -1)
        return company + "," + reviews + "-" + location

    loc = content.find("div", class_="location")
    if loc is not None:
        return " , -" + loc.text.strip()

    loc = content.find("div", class_="jobsearch-CompanyInfoWithoutHeaderImage")
    if loc is not None:
        children = loc.findChildren()
        company = _text_at(children, 0)
        location = _text_at(children, 1)
        return company + "," + " -" + location

    return "Not found"

In [7]:
def get_job_desc(content):
    '''Returns the full job description as a single text blob.

    Two <div> layouts are tried in order ("jobsearch-jobDescriptionText", then
    "job-details span9"); the text of the first one present is returned.
    Falls back to "Not found" when neither div exists.
    '''
    for css_class in ("jobsearch-jobDescriptionText", "job-details span9"):
        container = content.find("div", class_=css_class)
        if container is not None:
            return container.text
    return "Not found"

In [8]:
def get_salary(content):
    '''Returns the text of the <p> element that starts with "Salary:".

    Every <p> in the page is scanned; when several match, the last one wins.
    Returns "Not Found" when no paragraph starts with "Salary:".
    '''
    salary = "Not Found"
    for paragraph in content.findAll("p") or ():
        if re.match("Salary:.*", paragraph.text) is not None:
            salary = paragraph.text
    return salary

In [10]:
# Parses every remaining html file once and collects the extracted title,
# company/location string, description and salary in four parallel lists
# (one entry per file, in file_names order)
job_info_title, job_info_loc, job_info_desc, job_info_sal = [], [], [], []
for html_path in file_names:
    with open(html_path, "r", encoding='utf-8') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')
    job_info_title.append(get_title(soup))
    job_info_loc.append(get_job_info(soup))
    job_info_desc.append(get_job_desc(soup))
    job_info_sal.append(get_salary(soup))

In [11]:
# Number of salary entries collected by the extraction loop
len(job_info_sal)

3116

In [12]:
# Extracting information (company name, no of reviews, location) from the location data using delimiters such as - and ,.
# Each entry is split once at its LAST "-": the right part is the location, and the left part is split on ","
# into company name and review count. ex: "BMC,124 reviews-SF,CA 91101" or "Twitter-SF"
# rsplit("-", 1) can return at most two parts, so the former three-part branch could never run and was removed.
# NOTE(review): a location that itself contains "-" is still split at its own hyphen -- confirm none occur in the data.
job_info_company = []
job_info_location = []
job_info_reviews = []
for i in job_info_loc:
    info = i.rsplit("-", 1)
    info2 = info[0].split(",")
    job_info_company.append(info2[0])
    # Missing pieces are stored as empty strings so the three lists stay aligned
    job_info_reviews.append(info2[1] if len(info2) > 1 else "")
    job_info_location.append(info[1] if len(info) > 1 else "")

In [13]:
# Builds one row per posting from the parallel lists and labels the columns
column_names = ["Title", "Company", "Location", "Reviews", "Description", "Salary"]
posting_rows = list(
    zip(
        job_info_title,
        job_info_company,
        job_info_location,
        job_info_reviews,
        job_info_desc,
        job_info_sal,
    )
)
job_details = pd.DataFrame(posting_rows, columns = column_names)

In [14]:
# All substitutions below are regex replacements: a value the pattern does not
# match is left exactly as it was.
location_raw = job_details["Location"]

# Reviews: drop any leading non-digit text so the value starts at the count
job_details["Reviews"] = job_details["Reviews"].replace(
    to_replace = r"[^0-9\n]*([0-9]*.*)", value = r"\1", regex = True
)
# City: the letters/whitespace in front of the first comma of the location
job_details["City"] = location_raw.replace(
    to_replace = r"([a-zA-Z\s]+),.*", value = r"\1", regex = True
)
# State: the two capital letters following "<city>, "
job_details["State"] = location_raw.replace(
    to_replace = r"[a-zA-Z\s]+,\s([A-Z]{2}).*", value = r"\1", regex = True
)
# Pincode: the optional run of digits following the state code
job_details["Pincode"] = location_raw.replace(
    to_replace = r"[a-zA-Z\s]+,\s[A-Z]{2}\s?([0-9]+)?", value = r"\1", regex = True
)
# Company: where the name and "<n> reviews" are fused together, insert a ","
# between them so the two parts can be separated on that delimiter
job_details["Company"] = job_details["Company"].replace(
    to_replace = r"([^0-9]*)([0-9]+\sreviews)", value = r"\1,\2", regex = True
)

In [15]:
# Extract company name and reviews based on comma delimiter.
# Rows are selected with a boolean mask and written through .loc in a single
# step. The previous job_details["Company"][i] = ... form is chained indexing:
# it raises SettingWithCopyWarning and, with pandas Copy-on-Write enabled,
# modifies a temporary object so the frame is never updated.
has_comma = job_details["Company"].str.contains(",", regex=False, na=False)
company_parts = job_details.loc[has_comma, "Company"].str.split(",")
# First piece is the company name, second piece is the review count
job_details.loc[has_comma, "Company"] = company_parts.str[0]
job_details.loc[has_comma, "Reviews"] = company_parts.str[1]

In [16]:
# The raw Location column is no longer needed once City, State and Pincode exist
job_details = job_details.drop(columns = ["Location"])

In [17]:
# Reduce the salary text to the amount: first strip the "Salary: " prefix,
# then cut everything from " annually" onwards
job_details["Salary"] = (
    job_details["Salary"]
    .replace(r"Salary:\s(.*)", r"\1", regex = True)
    .replace(r"(.*)\sannually.*", r"\1", regex = True)
)

In [18]:
# Replace the salary value with "-" where it is not available.
# A boolean mask with .loc updates the frame directly; the previous
# job_details["Salary"][i] = "-" loop was chained indexing, which raises
# SettingWithCopyWarning and does not write back under pandas Copy-on-Write.
job_details.loc[job_details["Salary"] == "Not Found", "Salary"] = "-"

In [19]:
# Set the source value as Indeed: the scalar is assigned to a new "Source"
# column, so every row receives the same value
job_details["Source"] = "Indeed"

In [20]:
# Write the results to a csv file.
# No index argument is passed, so to_csv also writes the row index as an
# extra first column with an empty header.
job_details.to_csv("indeed_data.csv")

In [21]:
# Number of rows in job_details
len(job_details)

3116

In [50]:
# Load the four per-batch csv exports that are merged further down
partial_result_files = (
    "indeed_data_V.csv",
    "indeed_data_K.csv",
    "indeed_data_C.csv",
    "indeed_data_N.csv",
)
job_listing1, job_listing2, job_listing3, job_listing4 = map(pd.read_csv, partial_result_files)

In [51]:
# Row count of the frame read from indeed_data_V.csv
len(job_listing1)

3116

In [52]:
# Row count of the frame read from indeed_data_K.csv
len(job_listing2)

1931

In [53]:
# Row count of the frame read from indeed_data_C.csv
len(job_listing3)

1052

In [54]:
# Row count of the frame read from indeed_data_N.csv
len(job_listing4)

6155

In [55]:
# Stack the four partial frames on top of each other into a single dataframe
partial_frames = [job_listing1, job_listing2, job_listing3, job_listing4]
job_listings = pd.concat(partial_frames)

In [56]:
# Total number of rows in the combined job_listings frame
len(job_listings)

12254

In [57]:
# Discard the 'Unnamed: 0' column, remove redundant (fully identical) rows
# and renumber the index from 0
job_listings = (
    job_listings
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates()
    .reset_index(drop=True)
)
job_listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6011 entries, 0 to 6010
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        6011 non-null   object
 1   Company      6011 non-null   object
 2   Reviews      4560 non-null   object
 3   Description  6011 non-null   object
 4   Salary       6011 non-null   object
 5   City         6004 non-null   object
 6   State        6004 non-null   object
 7   Pincode      3168 non-null   object
 8   Source       6011 non-null   object
dtypes: object(9)
memory usage: 422.8+ KB


In [58]:
# Number of rows currently in job_listings
len(job_listings)

6011

In [45]:
def removeNonAscii(s):
    """Return s with every character whose code point is 128 or higher removed."""
    ascii_chars = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_chars)

In [61]:
# Strip non-ascii characters from the three free-text columns
for text_column in ("Description", "Title", "Company"):
    job_listings[text_column] = job_listings[text_column].apply(removeNonAscii)

In [62]:
# Create a csv file with the de-duplicated results.
# No index argument is passed, so the row index is written as an extra first
# column with an empty header.
job_listings.to_csv("indeed_job_listings.csv")