<a href="https://colab.research.google.com/github/jianfeiZhao/Resume-Matching-System/blob/master/resume_match.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark
import nltk
nltk.download('stopwords')

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/27/67/5158f846202d7f012d1c9ca21c3549a58fd3c6707ae8ee823adcaca6473c/pyspark-3.0.2.tar.gz (204.8MB)
[K     |████████████████████████████████| 204.8MB 69kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 21.9MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.2-py2.py3-none-any.whl size=205186687 sha256=3e7a8151bd72c2ef174504108b44393c810e6310054dfdaed2daba632beca1a5
  Stored in directory: /root/.cache/pip/wheels/8b/09/da/c1f2859bcc86375dc972c5b6af4881b3603269bcc4c9be5d16
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.2
[nlt

True

## Preprocessing the jobs data

In [26]:
import pandas as pd
from nltk.corpus import stopwords
import re

# load data
#df = pd.read_csv('./jobs_small.csv', encoding="latin-1")
df = pd.read_csv('/content/sample_data/resume_match/jobs.csv', encoding="utf-8")
#print(df.head())

# text preprocessing
REPLACE_BY_SPACE_RE = re.compile('[#+_/(){}!^?<>"''*\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
match_regex = re.compile('\d+')
STOPWORDS = set(stopwords.words('english'))

# data cleaning
def clean_text(text):
    # change to lower-csae
    text = str(text).lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    # remove BAD_SYMBOLS_RE
    text = BAD_SYMBOLS_RE.sub('', text)
    text = match_regex.sub('', text)
    # drop the stopwords
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    return text

# clean the desc field
df['desc_clean'] = df['description'].apply(clean_text)
df.drop(columns=['description', 'id'], inplace=True)

for i in range(len(df)):
  try:
    if df['desc_clean'][i]=='nan' or df['desc_clean'][i]=='' or len(df['desc_clean'][i]) < 100:
      df.drop(labels=i, inplace=True)
  except:
    continue

df.dropna(axis=0, inplace=True)
df['id'] = [i for i in range(1, len(df)+1)]
#print(df['desc_clean'])
df.to_csv('./jobs_clean.csv')

##load your CV and start matching from here

In [33]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF
#from pyspark.ml.feature import NGram

spark=SparkSession \
        .builder \
        .appName('tfidf_app') \
        .getOrCreate()

# read and clean the resume file
f = open('/content/sample_data/resume_match/CV.txt', 'r')       ############# change resume dir here ####################
text = f.read()
text = clean_text(text)
df = pd.read_csv('/content/sample_data/resume_match/jobs_clean.csv', encoding="utf-8")
df.drop(columns=['Unnamed: 0'], inplace=True)
df.loc[0] = ['resume', 0, 0, 0, text, 0]
df.to_csv('./jobs_clean.csv')

# load data
df0 = spark.read.csv("./jobs_clean.csv", header=True, multiLine=True, inferSchema=True)
df1 = pd.read_csv('./jobs_clean.csv')

#df0.show()
print('Total number of jobs：',df0.count()-1)
print('\nthe number of each distinct job:\n', df1.job.value_counts()[:-1])
print('\nThere are', len(df1.job.unique())-1, 'different kinds of jobs in the table.')

# split the desc field
tokenizer = Tokenizer(inputCol='desc_clean', outputCol='desc_words')
df = tokenizer.transform(df0)
#df.show()
#df.select('desc_words').show(10)

# compute TF-IDF
hashingTF = HashingTF(inputCol='desc_words', outputCol='desc_words_tf')
tf = hashingTF.transform(df).cache()
idf = IDF(inputCol='desc_words_tf', outputCol='desc_words_tfidf').fit(tf)
tfidf = idf.transform(tf).cache()
#print('tfidf for each job:', tfidf.select('desc_words_tfidf').show(10,truncate=False))

# data normalization
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="desc_words_tfidf", outputCol="norm")
tfidf = normalizer.transform(tfidf)
#tfidf.select("id", "norm").show(6)

# compute similarity between jobs and resume
import pyspark.sql.functions as psf 
from pyspark.sql.types import DoubleType
print('\nCompute the similarity between jobs and resume...')
dot_udf = psf.udf(lambda x,y: float(x.dot(y)), DoubleType()) # define dot-product function
tfidf = tfidf.alias("a1").join(tfidf.alias("a2"), psf.col("a1.id") == 0)\
        .select(
            psf.col("a1.job"),
            psf.col("a1.id").alias("id1"), 
            psf.col("a2.id").alias("id2"), 
            dot_udf("a1.norm", "a2.norm").alias("similarity"))
#tfidf.show(10)
print('Done!')

Total number of jobs： 14790

the number of each distinct job:
 DSP engineer                    420
computer vision engineer        409
FPGA Engineer                   392
data-scientist                  387
Ruby developer                  385
Machine Learning Engineer       379
PLC Technician                  379
web developer                   377
PHP developer                   373
python                          372
Software Product Manager        372
IOS Developer                   372
database administrator          370
Performance Test Engineer       367
NLP engineer                    367
Electrical Design Engineer      366
Test Automation Engineer        365
Embedded Systems Engineer       364
statistician                    363
computer support specialist     360
computer systems analyst        356
computer network architect      352
Android Developer               348
Node js developer               346
Python Software Engineer        345
Circuit Design Engineer         341
D

In [30]:
# show Top-20 matched jobs
match = tfidf.where('id1 = 0').sort('similarity', ascending=False).where('id2 > 0')
top_match = match.limit(20)
print('Top 20 matched jobs:')
df0.alias("a1").join(top_match.alias("a2"), psf.col("a1.id") == psf.col("a2.id2"))\
    .select(psf.col("a1.job"), "a1.company", "a1.location", "a2.similarity")\
    .sort('similarity', ascending=False).show()

Top 20 matched jobs:
+--------------------+------------+--------------------+-------------------+
|                 job|     company|            location|         similarity|
+--------------------+------------+--------------------+-------------------+
|        NLP engineer|       Apple|Seattle, Washingt...|0.10311775333742133|
|        NLP engineer|       Apple|Seattle, Washingt...|0.09924933857025435|
|        NLP engineer|       Apple|Seattle, Washingt...|0.09585500958328112|
|        NLP engineer|       Apple|Seattle, Washingt...|0.09318176036963881|
|      data-scientist|    SPECTRUM|    Golden, Colorado|0.09301087086596467|
|      Spark Engineer|    SPECTRUM| Englewood, Colorado|0.09301087086596467|
|       FPGA Engineer|    SPECTRUM|      Pine, Colorado|0.09301087086596467|
|Machine Learning ...|    SPECTRUM|Wheat Ridge, Colo...|0.09301087086596467|
|        NLP engineer|    SPECTRUM| Englewood, Colorado|0.09301087086596467|
|computer vision e...|       Apple|Seattle, Washingt...

In [31]:
match = df0.alias("a1").join(match.alias("a2"), psf.col("a1.id") == psf.col("a2.id2"))\
    .select(psf.col("a1.job"), "a1.company", "a1.location", "a2.similarity")\
    .sort('similarity', ascending=False)

# create SQL table
match.createOrReplaceTempView("match")

In [32]:
# start SQL query

# select jobs in specific location
df = spark.sql("SELECT * FROM match WHERE location like 'New York City%'")
#df = spark.sql("SELECT * FROM match WHERE location like 'San Francisco%'")
df.show()

+--------------------+--------------------+--------------------+--------------------+
|                 job|             company|            location|          similarity|
+--------------------+--------------------+--------------------+--------------------+
|Python Software E...|    Case Interactive|New York City, Ne...|0.044451186280879226|
|JavaScript Developer|    Case Interactive|New York City, Ne...|0.044451186280879226|
|              python|    Case Interactive|New York City, Ne...|0.043726558242579094|
|      Spark Engineer|        Apex Systems|New York City, Ne...| 0.03862278048733808|
|       FPGA Engineer|      Clarapath Inc.|New York City, Ne...|  0.0357684356616758|
|Telecommunication...|      Clarapath Inc.|New York City, Ne...|  0.0357684356616758|
|           Architect|     Beta Search Inc|New York City, Ne...| 0.03463789785614938|
|Python Software E...|     Beta Search Inc|New York City, Ne...| 0.03463789785614938|
|Java Software Eng...|     Beta Search Inc|New York Ci

In [7]:
#select specific jobs
#df = spark.sql("SELECT * FROM match where job = 'computer vision engineer'")
#df = spark.sql("SELECT * FROM match where job = 'FPGA Engineer'")
df = spark.sql("SELECT * FROM match where job = 'Embedded Systems Engineer'")

df.show()

+--------------------+--------------------+--------------------+--------------------+
|                 job|             company|            location|          similarity|
+--------------------+--------------------+--------------------+--------------------+
|Embedded Systems ...|Odyssey Systems C...|Lexington, Massac...| 0.05704758412902608|
|Embedded Systems ...|  Blue Star Software| Chantilly, Virginia| 0.05577960602990476|
|Embedded Systems ...|Southwest Researc...|  San Antonio, Texas| 0.05390175504834788|
|Embedded Systems ...|Odyssey Systems C...|Lexington, Massac...|0.053412161939692275|
|Embedded Systems ...|Southwest Researc...|  San Antonio, Texas| 0.04839396554216487|
|Embedded Systems ...|                PSEG|Hancocks Bridge, ...|0.043340746078394075|
|Embedded Systems ...|US ARMY Ground Ve...|WARREN, Michigan ...|0.042577557649579885|
|Embedded Systems ...|Southwest Researc...|  San Antonio, Texas| 0.04255284363161199|
|Embedded Systems ...|          CVS Health|Monroeville