In [1]:
import os
from bs4 import BeautifulSoup
import pandas as pd

### I use Apache Spark for this task so that the processing scales well and runs in parallel across all CPU cores of my machine

In [2]:
# findspark locates the local Spark installation and adds pyspark to sys.path,
# so init() has to run before the pyspark imports in the following cells.
import findspark
findspark.init()

In [3]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
# Build the session through the builder so that re-running this cell reuses the
# running context instead of failing on a second SparkContext(...) construction.
# "local[*]" runs Spark locally with one worker thread per available core, rather
# than a core count hard-coded for one particular machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("knotebook")
    .getOrCreate()
)
sc = spark.sparkContext

# Render DataFrames eagerly when they are the last expression of a cell.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
sc

In [4]:
from pyspark.sql.functions import udf
from pyspark.sql.types import * 

### This loads all job postings into a Spark DataFrame

The job postings were downloaded to *data/html_job_postings/html_job_postings*

In [5]:
# Read every file under the postings directory as one record per file.
text_files = sc.wholeTextFiles("data/html_job_postings/html_job_postings/*")

# Spread the records over 10 partitions, turn them into a DataFrame and give the
# two positional columns (_1, _2) descriptive names.
sdf = (
    text_files
    .repartition(10)
    .toDF()
    .withColumnRenamed("_1", "filename")
    .withColumnRenamed("_2", "content")
)

### A user-defined function (UDF) is applied to the DataFrame, using Beautiful Soup to extract information from the HTML content

In [6]:
def extractTitleBodyBullets(html_content):
    """Parse one job posting's HTML and extract its title, body text and bullets.

    Args:
        html_content: Raw HTML of a single posting, as a string.

    Returns:
        A ``(title, body, bullets)`` tuple:

        * ``title`` -- string content of the ``<title>`` tag, or ``None`` when the
          document has no ``<title>`` tag (callers filtering on the title must
          tolerate nulls).
        * ``body`` -- all text inside ``<body>``, or ``""`` when the document has
          no ``<body>`` tag.
        * ``bullets`` -- whitespace-stripped text of every ``<li>`` element; an
          empty list when the document has none.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # soup.title / soup.body are None when the tag is absent; dereferencing them
    # unguarded raises AttributeError for a single malformed posting.
    title_tag = soup.title
    body_tag = soup.body

    # Title may be null; body falls back to "" because the UDF schema declares it
    # non-nullable.
    title = str(title_tag.string) if title_tag is not None else None
    body = str(body_tag.getText()) if body_tag is not None else ""

    bullets = [item.text.strip() for item in soup.select('li')]
    return (title, body, bullets)

# Struct returned by the UDF for each posting: the three extracted fields.
# Only "title" is declared nullable.
_posting_fields = StructType([
    StructField("title", StringType(), True),
    StructField("body", StringType(), False),
    StructField("bullets", ArrayType(StringType()), False),
])

# Spark wrapper around extractTitleBodyBullets with the struct above as return type.
extractTitleBodyBullets_udf = udf(extractTitleBodyBullets, returnType=_posting_fields)

In [7]:
# Run the UDF over the raw HTML and expand the returned struct into top-level
# "title", "body" and "bullets" columns. The struct column gets an explicit alias
# so the expansion does not depend on Spark's auto-generated column name, which is
# derived from the Python function's name.
sdf_new = (
    sdf
    .select(extractTitleBodyBullets_udf(sdf['content']).alias("extracted"))
    .select("extracted.*")
)

### Now the Spark DataFrame is converted to a pandas DataFrame for further processing

In [8]:
# toPandas() collects every row to the driver and returns them as a single
# in-memory pandas DataFrame, so the result has to fit in driver memory.
df = sdf_new.toPandas()

In [9]:
# Number of postings in the pandas frame, before any cleaning.
len(df)

1336

### Remove the duplicates and check the impact

In [10]:
# keep=False discards *every* row whose body text occurs more than once, so no
# copy of a duplicated posting survives.
# NOTE(review): use keep="first" instead if one copy of each duplicated posting
# should be retained -- confirm which behavior is intended.
df = df.drop_duplicates(subset="body", keep=False)

In [11]:
# Number of postings remaining after the duplicate removal.
len(df)

1320

### Filter the jobs to only include data science jobs

In [17]:
# Keep postings whose lower-cased title contains "data scient" (this matches both
# "data scientist" and "data science"). na=False makes a missing title count as
# "no match"; without it the mask would contain NaN and boolean indexing would raise.
is_data_science = df['title'].str.lower().str.contains("data scient", na=False)
df = df[is_data_science]

In [18]:
# Number of postings whose title matched the data-science filter.
len(df)

403

In [19]:
# Display the filtered postings; pandas abbreviates the middle rows of long frames.
df

Unnamed: 0,title,body,bullets
1,"V.I.E. - Data Scientist - Charlotte, NC","V.I.E. - Data Scientist - Charlotte, NC\nAmeri...",[To improve our methods and tools for Machine ...
2,"(Entry-Level) Data Scientist - Chicago, IL","(Entry-Level) Data Scientist - Chicago, IL\nDa...",[\nBe the go-to person for Data ingest and sto...
8,"Data Scientist - Issaquah, WA","Data Scientist - Issaquah, WA\nJob Details\nLe...",[Serve as a subject matter expert in Data Scie...
12,"IT Data Scientist - Contract - Riverton, UT","IT Data Scientist - Contract - Riverton, UT\nP...","[Master’s degree, PhD degree preferred, 12+ ye..."
14,"Data Scientist - Plano, TX","Data Scientist - Plano, TX\nOverview\nPosition...","[Excellent visual, written and verbal communic..."
...,...,...,...
1316,"Insurance Data Scientist - Chicago, IL 60661","Insurance Data Scientist - Chicago, IL 60661\n...",[You come in with 1-2 years of professional st...
1317,"Senior Data Scientist - San Francisco, CA 94110","Senior Data Scientist - San Francisco, CA 9411...","[Build more affordable products, Bring them to..."
1326,"Senior Data Scientist - San Francisco, CA","Senior Data Scientist - San Francisco, CA\nJob...",[Perform hands-on analysis of large volumes of...
1330,"Data Scientist - San Francisco, CA 94103","Data Scientist - San Francisco, CA 94103\nWant...",[\nApply statistics techniques to improve Wish...


### Save the DataFrame to disk so we can load it at a later time for future parts of the project.

In [24]:
# Persist the filtered postings with pandas' pickle serializer; the relative path
# is resolved against the current working directory.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
df.to_pickle("datascience_jobs.pkl") 

It can later be retrieved via `pd.read_pickle("datascience_jobs.pkl")`.