In [1]:
import csv
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Build up my corpus using the scraped data

Data was scraped using the "API_Caller.ipynb" notebook. You can find all the scraped data in the "collected_data" folder. 

In [2]:
# Path to the plain-text resume file that is read later on and matched
# against the job postings.

resumePath = "resumes/jacky.txt"

In [3]:
# Read every scraped job-postings CSV in "collected_data/" into one DataFrame.
#
# For each file a combined free-text column, "AllDescription", is built from
# the title, URI, department, organization, source file name, job summary and
# qualification summary.
#
# The per-file frames are collected in a list and concatenated once at the
# end: DataFrame.append was removed in pandas 2.0, and calling it inside the
# loop re-copied the whole accumulated frame on every iteration.

TEXT_COLUMNS = ["PositionTitle", "URI", "DepartmentName", "OrganizationName",
                "JobSummary", "QualificationSummary"]

frames = []

for f in os.listdir("collected_data/"):
    if (f.endswith(".csv")):

        print("Importing... ", f)

        filepath = os.path.join("collected_data", f)

        df = pd.read_csv(filepath, encoding = "ISO-8859-1")

        # Treat missing text as an empty string; otherwise a single NaN field
        # turns the whole concatenated description into NaN.
        text = {col: df[col].fillna("").astype(str) for col in TEXT_COLUMNS}

        df["AllDescription"] = text["PositionTitle"] + " | " +  \
                                text["URI"] + " | " + text["DepartmentName"]  + " | " + \
                                text["OrganizationName"] + " | " + f  + " | " + \
                                text["JobSummary"] + " | " + \
                                text["QualificationSummary"]

        frames.append(df)

        print("Success")
        print("================================================================")

# pd.concat raises on an empty list, so fall back to an empty frame when no
# CSV file was found (this matches the value the accumulator started with).
corpusDF = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()

Importing...  analysis_N_500_20171223141842.csv
Success
Importing...  analyst_N_239_20171223115544.csv
Success
Importing...  attorney_N_180_20171223141257.csv
Success
Importing...  businessintelligence_N_20_20171223115447.csv
Success
Importing...  computational_N_17_20171223115812.csv
Success
Importing...  covert_N_4_20171223141634.csv
Success
Importing...  cyber_N_76_20171223141650.csv
Success
Importing...  dataanalyst_N_96_20171223115356.csv
Success
Importing...  dataanalytics_N_19_20171223115427.csv
Success
Importing...  database_N_500_20171223120145.csv
Success
Importing...  dataengineer_N_101_20171223115828.csv
Success
Importing...  datamining_N_9_20171223120312.csv
Success
Importing...  datascience_N_346_20171223115408.csv
Success
Importing...  datascientist_N_28_20171223115414.csv
Success
Importing...  datavisualization_N_14_20171223120520.csv
Success
Importing...  data_N_500_20171223115517.csv
Success
Importing...  developer_N_6_20171223120053.csv
Success
Importing...  economis

Success
Importing...  technology_N_500_20171223120810.csv
Success


In [4]:
# Preview the first two rows of the combined postings frame.
corpusDF.head(2)

Unnamed: 0,ApplicationCloseDate,DepartmentName,JobGrade,JobSummary,MaxPay,MinPay,OfferingType,OrganizationName,PayType,PositionEndDate,PositionID,PositionLocation,PositionStartDate,PositionTitle,QualificationSummary,URI,AllDescription
0,2018-03-31,Department of the Air Force,GS,The mission of the United States Air Force is ...,155073.0,32844.0,Multiple Appointment Types,Air Force Personnel Center,Per Year,2018-03-31,AFPC-ACQEHA-10081352-0560,"Gunter AFB, Alabama|Edwards AFB, California|El...",2017-12-01,Budget Analysis,SPECIALIZED EXPERIENCE:\r\nGS-05: At least 3 y...,https://www.usajobs.gov:443/GetJob/ViewDetails...,Budget Analysis | https://www.usajobs.gov:443/...
1,2018-03-31,Department of the Air Force,GS,The mission of the United States Air Force is ...,155073.0,32844.0,Multiple Appointment Types,Air Force Personnel Center,Per Year,2018-03-31,AFPC-FMEDHA-10085353-0560,"Eielson AFB, Alaska|Elmendorf AFB, Alaska|Gunt...",2017-12-01,BUDGET ANALYSIS,SPECIALIZED EXPERIENCE:\r\nGS-05: At least 3 y...,https://www.usajobs.gov:443/GetJob/ViewDetails...,BUDGET ANALYSIS | https://www.usajobs.gov:443/...


In [5]:
# Number of rows loaded across all CSV files, before any de-duplication.
len(corpusDF)

14406

### There may be duplicate job postings (the same posting can show up in more than one CSV file). I'll need to remove the duplicates based on the job URI. 

In [6]:
# Collapse the postings to one row per job URI; after this step the frame is
# indexed by URI instead of having a "URI" column.
# NOTE(review): GroupBy.first() takes the first non-null value of each column
# within a group, so a de-duplicated row may combine fields from different
# source rows when some of them have missing values -- confirm this is fine.
corpusDF = corpusDF.groupby("URI").first()

# Number of unique postings that remain.
len(corpusDF)

5441

In [7]:
# Preview the de-duplicated frame (now indexed by URI).
corpusDF.head(2)

Unnamed: 0_level_0,ApplicationCloseDate,DepartmentName,JobGrade,JobSummary,MaxPay,MinPay,OfferingType,OrganizationName,PayType,PositionEndDate,PositionID,PositionLocation,PositionStartDate,PositionTitle,QualificationSummary,AllDescription
URI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
https://www.usajobs.gov:443/GetJob/ViewDetails/345265800,2018-02-15,Other Agencies and Independent Organizations,GS,Thisfile serves as a repository of resumes for...,82642.0,20527.0,Multiple Appointment Types,Office of Personnel Management,Per Year,2018-02-15,US OPM Veteran File,"San Francisco, California|Denver, Colorado|Was...",2017-02-16,U.S. OPM Veteran File,As positions become available and hiring offic...,U.S. OPM Veteran File | https://www.usajobs.go...
https://www.usajobs.gov:443/GetJob/ViewDetails/357639200,2018-01-03,Department of the Air Force,WG,Air Reserve Technicians are part of the regula...,26.18,22.45,Multiple Appointment Types,"Headquarters, Air Force Reserve Command",Per Hour,2018-01-03,SEU-SR-Elec Int Sys Mech,"Elmendorf AFB, Alaska|Maxwell AFB, Alabama|Lit...",2017-01-04,Electronic Integrated System Mechanic (Militar...,A specific length of experience or training is...,Electronic Integrated System Mechanic (Militar...


In [8]:
# Collect the combined description text of every posting into a plain Python
# list, in the frame's row order. Series.tolist() does this directly; the
# previous version called Series.apply only for the side effect of appending
# to a list and threw away the Series that apply returned.
corpus = corpusDF["AllDescription"].tolist()
len(corpus)

5441

In [9]:
# Spot-check one posting (list index 300), truncated to its first 1000 characters.
corpus[300][:1000]

'Physician (Emergency Medicine) | https://www.usajobs.gov:443/GetJob/ViewDetails/469718300 | Department of Veterans Affairs | Veterans Affairs, Veterans Health Administration | medical_N_500_20171223141417.csv | Salem VA Medical Center, an H-1B cap exempt, tertiary referral center is currently\r\nrecruiting interested Emergency Medicine practitioners to join its experienced\r\nstaff of physicians and health professionals as a staff physician in the\r\nDepartment of Medicine at the Medical Center. In addition to our continuing commitment to\r\nquality Veteran care, Salem VA Medical Center is also a teaching hospital and\r\nresearch center, maintaining both an active R&amp;D program encompassing multiple\r\nspecialty practices as well as three active medical school affiliations with\r\nthe University of Virginia School of Medicine, the Edward Via Virginia College\r\nof Osteopathic Medicine, and the Virginia Tech Carilion School of Medicine. The VA Medical Center\r\nof Salem, Virginia is 

## Prepend my resume onto the corpus list

In [10]:
# Read the whole resume as text. The context manager closes the file handle
# as soon as the read finishes (previously the handle was left open).
with open(resumePath, "r") as file:
    resume = file.read()

# Prepend to corpus so the resume is document 0.
# The guard keeps this cell idempotent: re-running it must not insert a second
# copy of the resume, which would shift every posting and make the resume its
# own best match.
if not corpus or corpus[0] != resume:
    corpus.insert(0, resume)

In [11]:
# Sanity check: the first corpus entry should now be the resume
# (show its first 1000 characters).

corpus[0][:1000]

'Relevant Coursework: Machine Learning, High Performance Computing, Predictive Analytics, Advanced Statistics, Natural Language Processing University of Michigan                                        Ann Arbor, MI Bachelor of Science: Industrial and Operations EngineeringDate of Graduation: April 2011Relevant Coursework: Software Development, Optimization Methods, Linear Statistical Models, Markov Process, Operations Modeling, Statistics ASQ Certified Six Sigma Black BeltLicense Number: 12771 WORK EXPERIENCE: Accenture                                                            Austin, TX  Senior Software Development Engineer                                                                       May 2014 – Present  ?Developed and maintained healthcare web applications (.NET MVC C#, HTML, Javascript, CSS, SQL Server) for the Texas Health and Human Services Commission (HHSC) to process Medicaid claims, patient forms, and physician inventory status. Collaborated with clients to translate b

## Build TF-IDF vectorizer and matrix

In [12]:
# TF-IDF over word uni-, bi- and tri-grams with English stop words removed.
#
# min_df is an absolute document count when given as an integer. Every term in
# the learned vocabulary occurs in at least one document, so min_df = 1 keeps
# exactly the same terms as the former min_df = 0, while also being accepted by
# current scikit-learn releases, whose parameter validation rejects an integer
# min_df below 1.
tf = TfidfVectorizer(analyzer = "word", 
                        ngram_range = (1, 3),
                        min_df = 1, 
                        stop_words = "english")

In [13]:
# Fit the vectorizer on the corpus and build the document-term matrix.
# Row i of the matrix corresponds to corpus[i], so row 0 is the resume that
# was inserted at the front of the list.
matrix = tf.fit_transform(corpus)

## Find similar jobs using cosine similarity

In [14]:
def find_similar(matrix, index, top_n = 10):
    """Return the rows of `matrix` that score highest against row `index`.

    Scores are the dot products computed by `linear_kernel` between the query
    row and every row of `matrix`. (For L2-normalised rows, which is what
    TfidfVectorizer produces by default, this equals cosine similarity.)

    Parameters
    ----------
    matrix : row-indexable matrix accepted by `linear_kernel`
    index : position of the query row
    top_n : maximum number of matches to return

    Returns
    -------
    list of (row_position, score) tuples ordered from highest to lowest
    score, with the query row itself left out.
    """
    query_row = matrix[index: index + 1]
    scores = linear_kernel(query_row, matrix).flatten()

    # Walk the rows from best to worst score and skip the query row itself.
    ranked = []
    for candidate in scores.argsort()[::-1]:
        if candidate != index:
            ranked.append(candidate)

    return [(candidate, scores[candidate]) for candidate in ranked[0:top_n]]

## Let's test this out

In [15]:
# Position of the query document in `corpus`; 0 is the resume.
k = 0

# Show the query document (at most its first 150000 characters).
print("RESUME:\n")
print(corpus[k][:150000])

print("\n============================================================================\n")
print("MATCHED JOBS: \n")
# find_similar returns (corpus position, score) pairs, best match first;
# print each score followed by the first 3000 characters of that posting.
for index, score in find_similar(matrix, k):
    print("\n")
    print(score, corpus[index][:3000])
    print("\n...next job...")

RESUME:

Relevant Coursework: Machine Learning, High Performance Computing, Predictive Analytics, Advanced Statistics, Natural Language Processing University of Michigan                                        Ann Arbor, MI Bachelor of Science: Industrial and Operations EngineeringDate of Graduation: April 2011Relevant Coursework: Software Development, Optimization Methods, Linear Statistical Models, Markov Process, Operations Modeling, Statistics ASQ Certified Six Sigma Black BeltLicense Number: 12771 WORK EXPERIENCE: Accenture                                                            Austin, TX  Senior Software Development Engineer                                                                       May 2014 – Present  ?Developed and maintained healthcare web applications (.NET MVC C#, HTML, Javascript, CSS, SQL Server) for the Texas Health and Human Services Commission (HHSC) to process Medicaid claims, patient forms, and physician inventory status. Collaborated with clients to tra