In [1]:
from pathlib import Path
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
# Collect the paths of the saved Monster listing pages (*.htm) found in the
# "Profiles" folder of the current working directory.
# NOTE: `dir` shadows the built-in of the same name; the name is kept unchanged
# in case other cells refer to it.
dir = Path.cwd()
file_names = [listing_path for listing_path in dir.glob("Profiles/*.htm")]

In [3]:
# Number of saved listing pages matched by the glob above
len(file_names)

8794

In [4]:
# Drops the error files, if any, from file_names.
# A page counts as an error file when its text contains the sentence
# "The job you asked for is not available.".
#
# The filtered list is built fresh instead of calling file_names.remove()
# inside a loop over file_names: removing from a list while iterating over it
# makes the iterator skip the element that follows every removal, so two error
# files in a row would leave the second one in the list.
def _is_error_page(path):
    """Return True if the saved page at *path* is a 'job not available' page.

    Newlines are stripped from the page text before searching, so the sentence
    is still found when the page breaks it across lines.
    """
    with open(path, "r", encoding='utf-8') as file:
        text = BeautifulSoup(file.read(), 'html.parser').get_text()
    return "The job you asked for is not available." in text.replace("\n", "")


file_names = [each for each in file_names if not _is_error_page(each)]

In [None]:
# Parallel lists: index i of each list describes the posting in file_names[i].
job_info_title = []
job_info_company = []
job_info_location = []
job_info_desc = []


def _tag_text(tag, default="Not Found"):
    """Return the text of a BeautifulSoup tag, or *default* when the tag is None."""
    return tag.text if tag is not None else default


# Loops over each file name
for each in file_names:
    with open(each, "r", encoding='utf-8') as file:
        content = BeautifulSoup(file.read(), 'html.parser')
    fields = {
        # Get job title based on h1 tag
        "title": content.find("h1", class_ = "job_title c-primary-dk"),
        # Get company name using the div tag
        "company": content.find("div", class_ = "job_company_name tag-line c-primary"),
        # Get location using div tag
        "location": content.find("div", class_ = "location c-gray-6"),
        # Get job description using div tag
        "description": content.find("div", class_ = "job-description"),
    }
    # Each field is checked on its own. Previously every branch tested `title`,
    # so a page with a title but no company/location/description raised
    # AttributeError, and a page without a title lost its other fields.
    missing = [name for name, tag in fields.items() if tag is None]
    if missing:
        print(each, "missing:", ", ".join(missing))
    # Exactly one entry is appended to every list per file so the lists stay aligned.
    job_info_title.append(_tag_text(fields["title"]))
    job_info_company.append(_tag_text(fields["company"]))
    job_info_location.append(_tag_text(fields["location"]))
    job_info_desc.append(_tag_text(fields["description"]))

In [7]:
# Number of descriptions collected by the parsing loop (one entry is appended per file)
len(job_info_desc)

8641

In [8]:
# Number of files left after the error files were removed; expected to equal
# the number of collected descriptions
len(file_names)

8641

In [9]:
# Build the dataframe: one row per posting, taken position-wise from the four
# parallel lists filled by the parsing loop.
column_names = ["Title", "Company", "Location", "Description"]
posting_rows = list(zip(job_info_title, job_info_company, job_info_location, job_info_desc))
job_details = pd.DataFrame(posting_rows, columns=column_names)

In [10]:
# Preview the first rows of the assembled dataframe
job_details.head()

Unnamed: 0,Title,Company,Location,Description
0,"Data Analyst - Python, SQL",CyberCoders,"Seattle, WA",DescriptionIf you are a Data Analyst with expe...
1,Data Analyst I,Collabera,"Bellevue, WA","DescriptionBellevue, WashingtonSkills : data a..."
2,Senior Software Engineer - Java,Intelliswift Software,"San Francisco, CA 94105",DescriptionTitle: Senior Software EngineerPosi...
3,Data Analyst,Community Transit,"Everett, WA 98203","DescriptionLeads efforts to collect, organize,..."
4,Data Analyst,Rodo Inc.,"New York, NY",DescriptionJob DescriptionRodo is a car leasin...


In [11]:
# Creates City, State and Pincode columns from the Location text using regex.
#
# One anchored pattern is extracted per row instead of three independent
# substitutions. The substitutions only rewrote the part of the string they
# matched, so a location such as "St. Louis, MO 63101" produced State "St.MO",
# and a location that did not match at all was copied verbatim into all three
# columns.
#   City    - everything before the ", ST" part
#   State   - two capital letters following the comma
#   Pincode - optional digits directly after the state
location_parts = job_details["Location"].str.extract(
    r"^\s*(?P<City>.+?)\s*,\s*(?P<State>[A-Z]{2})\s?(?P<Pincode>[0-9]+)?"
)
# When the location does not follow the "City, ST" shape, keep the raw text in
# City so it is not lost once the Location column is dropped; State and Pincode
# are left missing for those rows.
job_details["City"] = location_parts["City"].fillna(job_details["Location"].str.strip())
job_details["State"] = location_parts["State"]
job_details["Pincode"] = location_parts["Pincode"]
# Removes the literal word "Description" from the scraped text (the first
# occurrence on each line, since "." does not match a newline). In the preview
# rows this is the heading fused to the start of the description.
job_details["Description"] = job_details["Description"].replace(r"Description(.*)", r"\1", regex = True)

In [12]:
# The raw Location text is no longer needed once it has been split into
# City / State / Pincode, so the column is dropped.
job_details = job_details.drop(columns="Location")

In [13]:
# Write the dataframe to a csv file.
# The row index is written as an unnamed first column; when the file is read
# back it appears as "Unnamed: 0", which the combining step below drops.
job_details.to_csv("monster_data_V.csv")

In [192]:
# Load the four saved Monster CSV exports, one dataframe per file.
job_details1, job_details2, job_details3, job_details4 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "monster_data_V.csv",
        "monster_data_C.csv",
        "monster_data_K.csv",
        "monster_data_N.csv",
    )
)

In [193]:
# Stack the four dataframes row-wise into a single dataframe.
# Each frame keeps its own row labels here; they are renumbered later.
job_details = pd.concat(
    objs=(job_details1, job_details2, job_details3, job_details4),
    axis=0,
)

In [194]:
# Row count of the combined dataframe, before duplicate rows are removed
len(job_details)

27407

In [195]:
# Clean-up of the combined frame in one chain:
#   1. discard the index column that came back from the CSV files as "Unnamed: 0"
#   2. remove rows that are exact duplicates
#   3. renumber the rows 0..n-1
job_details = (
    job_details
    .drop(columns="Unnamed: 0")
    .drop_duplicates()
    .reset_index(drop=True)
)
job_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22308 entries, 0 to 22307
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        22308 non-null  object
 1   Company      22298 non-null  object
 2   Description  22308 non-null  object
 3   City         22249 non-null  object
 4   State        22249 non-null  object
 5   Pincode      8667 non-null   object
dtypes: object(6)
memory usage: 1.0+ MB


In [196]:
# Row count after duplicate rows were removed
len(job_details)

22308

In [197]:
def removeNonAscii(s):
    """Return *s* with every character whose code point is 128 or above removed.

    The remaining (ASCII) characters keep their original order.
    """
    ascii_chars = [ch for ch in s if ord(ch) <= 127]
    return "".join(ascii_chars)

In [198]:
# Strip non-ASCII characters from every job description
job_details["Description"] = job_details["Description"].map(removeNonAscii)

In [201]:
# Add a Salary column that starts out as a copy of Description; the salary
# extraction step below overwrites it with the extracted value.
job_details = job_details.assign(Salary=job_details["Description"])

In [202]:
# Preview the dataframe; at this point Salary still holds a copy of Description
job_details.head()

Unnamed: 0,Title,Company,Description,City,State,Pincode,Salary
0,"Data Analyst - Python, SQL",CyberCoders,"If you are a Data Analyst with experience, ple...",Seattle,WA,,"If you are a Data Analyst with experience, ple..."
1,Data Analyst I,Collabera,"Bellevue, WashingtonSkills : data analystDescr...",Bellevue,WA,,"Bellevue, WashingtonSkills : data analystDescr..."
2,Senior Software Engineer - Java,Intelliswift Software,Title: Senior Software EngineerPosition Type: ...,San Francisco,CA,94105.0,Title: Senior Software EngineerPosition Type: ...
3,Data Analyst,Community Transit,"Leads efforts to collect, organize, and mainta...",Everett,WA,98203.0,"Leads efforts to collect, organize, and mainta..."
4,Data Analyst,Rodo Inc.,Job DescriptionRodo is a car leasing startup t...,New York,NY,,Job DescriptionRodo is a car leasing startup t...


In [None]:
# Get the salary information from the Description text (copied into the Salary
# column) using regex patterns, and store the result back in the Salary column.
#
# Changes from the earlier row-by-row loop:
#   * the patterns are raw strings, so "\s" and "\/" are no longer invalid
#     string escapes (they trigger warnings on recent Python versions);
#   * the column is addressed by name instead of by position 6;
#   * re.DOTALL lets the leading ".*" cross line breaks, so a salary mention
#     that is not on the first line of the text is still found;
#   * the per-row debug prints were removed.
# Patterns are tried in order; the first one that matches wins.
_SALARY_PATTERNS = (
    # e.g. "Salary - $80,000 to $90,000"
    re.compile(r".*[Ss]alary\s-\s([,+$0-9to\skK]+).*", re.DOTALL),
    # e.g. "Salary: $40 per hour"
    re.compile(r".*[Ss]alary:([-,+$0-9toperhu\s\/Analy.KkHY]+).*", re.DOTALL),
)


def _extract_salary(desc):
    """Return the salary text found in *desc*, or "-" when none is found.

    Because each pattern starts with a greedy ".*", the last salary mention in
    the text is the one captured. Non-string values (e.g. NaN) yield "-".
    """
    if not isinstance(desc, str):
        return "-"
    for pattern in _SALARY_PATTERNS:
        match = pattern.match(desc)
        if match is not None:
            return match[1]
    return "-"


job_details["Salary"] = job_details["Salary"].map(_extract_salary)

In [205]:
# Tag every row with the job board it came from
job_details = job_details.assign(Source="Monster")

In [206]:
# Write the resulting dataframe to a csv file (the row index is written as an
# unnamed first column).
# NOTE(review): the file name says "moster", which looks like a typo for
# "monster" - confirm whether anything reads this exact name before renaming.
job_details.to_csv("moster_data_combined.csv")