In [252]:
import csv
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Build up my corpus using the scraped data

In [253]:
# Path to the resume text file, relative to the notebook's working directory.
# A leading "/" would resolve against the filesystem root instead of the project folder.

resumePath = "resumes/jacky.txt"

In [254]:
# Read every scraped CSV of job postings into one DataFrame.
#
# Each file's frame is collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies all previously loaded rows on every iteration.

dataDir = "collected_data"
frames = []

for f in os.listdir(dataDir):
    if f.endswith(".csv"):
        print("Importing... ", f)
        filepath = os.path.join(dataDir, f)
        df = pd.read_csv(filepath, encoding = "ISO-8859-1")

        # One searchable text field per posting: title, department, organisation,
        # URI, source file name, summary and qualifications, separated by " | ".
        # Note: a missing value in any of these columns makes the whole
        # AllDescription value NaN for that row.
        df["AllDescription"] = df["PositionTitle"] + " | " +  df["DepartmentName"]  + " | " + \
                                df["OrganizationName"]  + " | " +  df["URI"] + " | " + f  + " | " + \
                                df["JobSummary"] + " | " + \
                                df["QualificationSummary"]

        frames.append(df)

        print("Success")
        print("================================================================")

# pd.concat raises on an empty list, so fall back to an empty frame when no CSV was found
corpusDF = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()

Importing...  analyst_N_239_20171223115544.csv
Success
Importing...  businessintelligence_N_20_20171223115447.csv
Success
Importing...  computational_N_17_20171223115812.csv
Success
Importing...  dataanalyst_N_96_20171223115356.csv
Success
Importing...  dataanalytics_N_19_20171223115427.csv
Success
Importing...  database_N_500_20171223120145.csv
Success
Importing...  dataengineer_N_101_20171223115828.csv
Success
Importing...  datamining_N_9_20171223120312.csv
Success
Importing...  datascience_N_346_20171223115408.csv
Success
Importing...  datascientist_N_28_20171223115414.csv
Success
Importing...  datavisualization_N_14_20171223120520.csv
Success
Importing...  data_N_500_20171223115517.csv
Success
Importing...  developer_N_6_20171223120053.csv
Success
Importing...  economist_N_28_20171223115942.csv
Success
Importing...  Engineer_N_288_20171223120400.csv
Success
Importing...  excel_N_103_20171223120652.csv
Success
Importing...  geospatial_N_33_20171223115329.csv
Success
Importing...  ha

In [255]:
# Preview the first two imported postings
corpusDF.head(n = 2)

Unnamed: 0,ApplicationCloseDate,DepartmentName,JobGrade,JobSummary,MaxPay,MinPay,OfferingType,OrganizationName,PayType,PositionEndDate,PositionID,PositionLocation,PositionStartDate,PositionTitle,QualificationSummary,URI,AllDescription
0,2018-04-02,Other Agencies and Independent Organizations,GS,Construction Analysts (Loss Verifiers)\r\nNeed...,52329.0,35359.0,"Excepted Service - Temporary NTE 6 months, but...",Small Business Administration,Per Year,2018-04-02,DVC-HH-01-2017,"Seattle, Washington|Los Angeles, California|Sa...",2017-10-02,Construction Analyst,Qualifying experience includes: Working for a ...,https://www.usajobs.gov:443/GetJob/ViewDetails...,Construction Analyst | Other Agencies and Inde...
1,2017-12-25,Legislative Branch,HS,The Financial Analyst is a senior level positi...,94158.0,74726.0,Permanent,House of Representatives,Per Year,2017-12-25,FIN-010-17,"Washington DC, District of Columbia",2017-12-11,Financial Analyst,Highly qualified candidates will be evaluated ...,https://www.usajobs.gov:443/GetJob/ViewDetails...,Financial Analyst | Legislative Branch | House...


In [256]:
# Number of rows imported, before de-duplication
corpusDF.shape[0]

7916

### The same job posting can appear in several of the collected files. I'll need to remove the duplicates based on the job URI.

In [257]:
# Collapse the postings to one row per URI. Grouping moves URI into the index,
# and first() keeps the first non-null value of every other column within each group.
corpusDF = (
    corpusDF
    .groupby(by = "URI")
    .first()
)

# Number of distinct postings left
corpusDF.shape[0]

3751

In [258]:
# Preview the first two postings after de-duplication
corpusDF.head(n = 2)

Unnamed: 0_level_0,ApplicationCloseDate,DepartmentName,JobGrade,JobSummary,MaxPay,MinPay,OfferingType,OrganizationName,PayType,PositionEndDate,PositionID,PositionLocation,PositionStartDate,PositionTitle,QualificationSummary,AllDescription
URI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
https://www.usajobs.gov:443/GetJob/ViewDetails/345265800,2018-02-15,Other Agencies and Independent Organizations,GS,Thisfile serves as a repository of resumes for...,82642.0,20527.0,Multiple Appointment Types,Office of Personnel Management,Per Year,2018-02-15,US OPM Veteran File,"San Francisco, California|Denver, Colorado|Was...",2017-02-16,U.S. OPM Veteran File,As positions become available and hiring offic...,U.S. OPM Veteran File | Other Agencies and Ind...
https://www.usajobs.gov:443/GetJob/ViewDetails/357639200,2018-01-03,Department of the Air Force,WG,Air Reserve Technicians are part of the regula...,26.18,22.45,Multiple Appointment Types,"Headquarters, Air Force Reserve Command",Per Hour,2018-01-03,SEU-SR-Elec Int Sys Mech,"Elmendorf AFB, Alaska|Maxwell AFB, Alabama|Lit...",2017-01-04,Electronic Integrated System Mechanic (Militar...,A specific length of experience or training is...,Electronic Integrated System Mechanic (Militar...


In [259]:
# Pull the combined description text out as a plain Python list, one entry per posting.
# tolist() does this directly; apply() with an appending lambda only worked through
# its side effect and built a discarded Series of None values along the way.
corpus = corpusDF["AllDescription"].tolist()
len(corpus)

3751

In [260]:
# Spot-check one posting: first 1000 characters of the entry at position 300
samplePosting = corpus[300]
samplePosting[:1000]

'DHA RECENT GRADUATES PROGRAM (INFORMATION TECHNOLOGY SPECIALIST) | Department of the Air Force | Air Force Personnel Center | https://www.usajobs.gov:443/GetJob/ViewDetails/479963100 | operations_N_500_20171223120113.csv | Any individual who was awarded a degree by an institution of higher\r\neducation from a public or other non-profit institutionnot more than two years before the date of appointment meets the\r\nbasic eligibility requirement to apply for the DHA Recent Graduates Program; OR Any individual who has completed a period of obligated service in a\r\nuniformed service of more than four years and was awarded a degree by an\r\ninstitution of higher education from a public or other non-profit institutionnot more than four years before the date of\r\nappointment meets the basic eligibility requirement to apply for the DHA Recent\r\nGraduates Program. YOUR TRAINING AND DEVELOPMENT\r\nCOVERS: &#183; Orientation program &#183; Mentorship throughout the\r\nProgram &#183; Individual

## Prepend my resume onto the corpus list

In [261]:
# Read the resume text; the context manager closes the file handle when done
with open("resumes/jacky.txt", "r") as resumeFile:
    resume = resumeFile.read()

# prepend to corpus
# (skipped when the resume is already first, so re-running this cell does not stack copies)
if not corpus or corpus[0] != resume:
    corpus.insert(0, resume)

In [262]:
# The resume should now sit at position 0 of the corpus; show its first 1000 characters

resumeEntry = corpus[0]
resumeEntry[:1000]

'Relevant Coursework: Machine Learning, High Performance Computing, Predictive Analytics, Advanced Statistics, Natural Language Processing University of Michigan                                        Ann Arbor, MI Bachelor of Science: Industrial and Operations EngineeringDate of Graduation: April 2011Relevant Coursework: Software Development, Optimization Methods, Linear Statistical Models, Markov Process, Operations Modeling, Statistics ASQ Certified Six Sigma Black BeltLicense Number: 12771 WORK EXPERIENCE: Accenture                                                            Austin, TX  Senior Software Development Engineer                                                                       May 2014 – Present  ?Developed and maintained healthcare web applications (.NET MVC C#, HTML, Javascript, CSS, SQL Server) for the Texas Health and Human Services Commission (HHSC) to process Medicaid claims, patient forms, and physician inventory status. Collaborated with clients to translate b

## Build TF-IDF vectorizer and matrix

In [263]:
# Word-level TF-IDF over unigrams, bigrams and trigrams, with English stop words removed.
# min_df = 1 keeps every term that occurs in at least one document, which is the same
# filtering the previous value 0 gave; recent scikit-learn releases reject an integer
# min_df below 1 during parameter validation.
tf = TfidfVectorizer(analyzer = "word",
                        ngram_range = (1, 3),
                        min_df = 1,
                        stop_words = "english")

In [264]:
# Fit the vectorizer on the corpus and get back the document-term matrix (one row per corpus entry)
matrix = tf.fit_transform(raw_documents = corpus)

## Find similar jobs using cosine similarity

In [265]:
def find_similar(matrix, index, top_n = 10):
    """Rank the other rows of ``matrix`` by similarity to row ``index``.

    The score is the dot product (``linear_kernel``) between row ``index`` and
    every row of ``matrix``. This equals cosine similarity when the rows are
    L2-normalised, as TfidfVectorizer output is by default.

    Parameters
    ----------
    matrix : document-term matrix, one row per document
    index : position of the query row in ``matrix``
    top_n : maximum number of matches to return

    Returns
    -------
    list of ``(row_position, score)`` tuples, highest score first, with the
    query row itself left out.
    """
    query_row = matrix[index: index + 1]
    scores = linear_kernel(query_row, matrix).flatten()

    # argsort is ascending, so reverse it to walk from best to worst match
    ranked_positions = scores.argsort()[::-1]

    matches = []
    for position in ranked_positions:
        if position == index:
            continue
        matches.append((position, scores[position]))

    return matches[0:top_n]

## Let's test this out

In [271]:
# Position of the query document in the corpus; 0 is where the resume was prepended
k = 0

separator = "\n============================================================================\n"

print("RESUME:\n")
print(corpus[k][:150000])

print(separator)
print("MATCHED JOBS: \n")

# Print each match's score followed by the first 3000 characters of its description
matches = find_similar(matrix, k)
for jobPosition, similarity in matches:
    print("\n")
    print(similarity, corpus[jobPosition][:3000])
    print("\n...next job...")

RESUME:

Relevant Coursework: Machine Learning, High Performance Computing, Predictive Analytics, Advanced Statistics, Natural Language Processing University of Michigan                                        Ann Arbor, MI Bachelor of Science: Industrial and Operations EngineeringDate of Graduation: April 2011Relevant Coursework: Software Development, Optimization Methods, Linear Statistical Models, Markov Process, Operations Modeling, Statistics ASQ Certified Six Sigma Black BeltLicense Number: 12771 WORK EXPERIENCE: Accenture                                                            Austin, TX  Senior Software Development Engineer                                                                       May 2014 – Present  ?Developed and maintained healthcare web applications (.NET MVC C#, HTML, Javascript, CSS, SQL Server) for the Texas Health and Human Services Commission (HHSC) to process Medicaid claims, patient forms, and physician inventory status. Collaborated with clients to tra