In [5]:
# Import packages
import pandas as pd
import glob
import json
import re
import collections
import numpy as np
import sparse_dot_topn.sparse_dot_topn as ct
import time
from functools import reduce
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix

In [7]:
# Load the resume dataset
# Root folder that holds the resume .txt files
folder = 'D:\\GWU\\Spring 2019\\DATS 6202\\Dataset\\resume_part1\\'
# Recursively collect every .txt file below the root folder
# (glob.glob already returns a list, so no extra comprehension is needed)
txt_pattern = folder + "**/*.txt"
files = glob.glob(txt_pattern, recursive=True)

In [10]:
# List the .txt files for the positions / locations of interest.
# The name of each file's parent folder is split on its last '_': the left part
# is compared against position names, the right part against location names.
def _folder_field(path, field):
    """Return one half of the parent-folder name of `path`.

    field=0 is the text before the last '_' (compared against positions),
    field=1 is the text after it (compared against locations).
    """
    return path.rsplit('\\',2)[1].rsplit('_',1)[field]


def _group_by_position(paths):
    """Group `paths` by position in a single pass, keeping input order within each group."""
    groups = {}
    for path in paths:
        groups.setdefault(_folder_field(path, 0), []).append(path)
    return groups


# Parse every path once instead of once per position
_all_by_position = _group_by_position(files)

files_ds = _all_by_position.get('Data Scientist', [])
print("All Data Scientist Resumes",len(files_ds))

files_DC = [path for path in files if _folder_field(path, 1) == 'Washington, DC']
print("Total Washington, DC Resumes",len(files_DC))

_dc_by_position = _group_by_position(files_DC)

files_DC_ds = _dc_by_position.get('Data Scientist', [])
print("DC Data Scientist Resumes",len(files_DC_ds))

files_DC_da = _dc_by_position.get('Database Administrator', [])
print("DC Database Administrator Resumes",len(files_DC_da))

files_DC_fa = _dc_by_position.get('Financial Analyst', [])
print("DC Financial Analyst Resumes",len(files_DC_fa))

files_DC_itm = _dc_by_position.get('IT Manager', [])
print("DC IT Manager",len(files_DC_itm))

files_DC_csa = _dc_by_position.get('Computer Systems Analyst', [])
print("DC Computer Systems Analyst",len(files_DC_csa))

files_DC_csad = _dc_by_position.get('Computer Systems Administrator', [])
print("DC Computer Systems Administrator",len(files_DC_csad))

files_da = _all_by_position.get('Database Administrator', [])
print("All Database Administrator Resumes",len(files_da))

files_csad = _all_by_position.get('Computer Systems Administrator', [])
print("All Computer Systems Administrator",len(files_csad))

files_im = _all_by_position.get('IT Manager', [])
print("All IT Manager",len(files_im))

All Data Scientist Resumes 18467
Total Washington, DC Resumes 214950
DC Data Scientist Resumes 1298
DC Database Administrator Resumes 5503
DC Financial Analyst Resumes 21058
DC IT Manager 19181
DC Computer Systems Analyst 3048
DC Computer Systems Administrator 2171
All Database Administrator Resumes 53613
All Computer Systems Administrator 20550
All IT Manager 257602


In [8]:
# Count how many files fall under each job category.
# The category is the part of the parent-folder name before its last '_'.
Category = pd.Series(
    [path.rsplit('\\',2)[1].rsplit('_',1)[0] for path in files]
).value_counts()
print(Category)

Accountant                        586003
Marketing Manager                 418090
Financial Manager                 400268
Financial Analyst                 341882
Management Analyst                281921
IT Manager                        257602
Business Operations Manager       186300
Construction Manager              160471
Loan Officer                      146084
Laboratory Technician             116424
Financial Advisor                 101380
Interpreter                        96441
HR Specialist                      91969
High School Teacher                60590
Compliance Officer                 56259
Database Administrator             53613
Computer Support Specialist        38760
Civil Engineer                     27964
Fundraiser                         26972
Computer Systems Analyst           26736
Information Security Analyst       22968
Computer Systems Administrator     20550
Data Scientist                     18467
Lawyer                             14032
Cost Estimator  

In [11]:
# Build the Data Scientist resume table: one parsed JSON document per file
resumes = []
for filename in files_ds:
    with open(filename, encoding="utf8") as f:
        # resume_dict holds the parsed JSON content of this file
        resume_dict = json.load(f)
    # append() is O(1); rebuilding the list with `resumes + [...]` copied it on every file
    resumes.append(resume_dict)

In [12]:
# Extract every job title found in the resumes' 'experience_history' entries
titles = []
p = []  # not used in the visible cells; kept in case another cell reads it
for resume in resumes:
    # Exact key lookup: a substring test on the key names could pass for a key
    # such as 'experience_history_x' and then fail on the real lookup.
    # `or []` also skips resumes whose history is missing or empty.
    for job in resume.get('experience_history') or []:
        if 'title' in job:
            titles.append(job['title'])

In [13]:
# Normalise every title to lowercase
titles = [title.lower() for title in titles]

In [14]:
# Get unique titles in first-seen order.
# Each title is stripped of leading/trailing whitespace BEFORE the duplicate
# check, so a padded title (" analyst") and its clean form ("analyst") collapse
# into one entry. dict.fromkeys keeps insertion order and deduplicates in O(n).
uniquetitle = list(dict.fromkeys(title.strip() for title in titles))

In [15]:
# Clean each unique title and split it into a list of parts.
# Reading the chained call from the innermost expression outwards, it:
#   1. lowercases the title;
#   2. removes a double-quoted span (pattern \"[^)]*\", so the span cannot
#      contain ')') together with one optional whitespace on each side;
#   3. removes a parenthesised span "(...)" with one optional whitespace on each side;
#   4. removes leftover "word)" fragments, then leftover "(word" fragments;
#   5. rewrites '&', '/', ' and ', ':', ';', '•', '.' and '-' as ', ';
#   6. deletes '"' and 'ø' characters;
#   7. splits the result on ', ', so cleaned_title is a list of lists.
# NOTE(review): .replace(' & ',', ') appears twice in a row, and one replace()
# call shows an empty search string -- it may originally have held a character
# that was lost when the notebook was exported. Confirm against the .ipynb.
cleaned_title=[]
for i in range(len(uniquetitle)):
        cleaned_title.append(re.sub(r'\(\w+\s?','',re.sub(r'\w+\)\s?','',re.sub(r'\s?\([^)]*\)\s?', '', re.sub(r'\s?\"[^)]*\"\s?', '', uniquetitle[i].lower())))).replace(' & ',', ').replace(' & ',', ').replace('&',', ').replace(' / ',', ').replace('/ ',', ').replace('/',', ').replace(' and ',', ').replace(':',', ').replace(';',', ').replace('•',', ').replace('"','').replace('.',', ').replace('ø','').replace('','').replace('-',', ').split(', '))

In [16]:
# Flatten the per-title lists into one list of stripped parts and drop the
# empty strings in the same pass (repeated list.remove('') rescans the list
# for every empty entry).
flat_list = [
    item.strip()
    for sublist in cleaned_title
    for item in sublist
    if item.strip() != ''
]

In [17]:
# Unique title parts, in first-seen order.
# dict.fromkeys deduplicates in linear time while preserving insertion order.
flat_list_uni = list(dict.fromkeys(flat_list))

In [18]:
# Character n-grams
def ngrams(string, n=3):
    """Return the overlapping character n-grams of `string`.

    Before slicing, every match of the pattern ``[,-./]|\\sBD`` is deleted:
    the character class covers the range ','..'.' (i.e. ',', '-', '.') plus
    '/', and the alternative removes "BD" preceded by one whitespace character.

    Args:
        string: text to split.
        n: length of each n-gram (default 3).

    Returns:
        List of substrings of length `n`, one per start position; empty when
        the cleaned text is shorter than `n` or `n` is not positive.
    """
    cleaned = re.sub(r'[,-./]|\sBD',r'', string)
    if n <= 0:
        return []
    window_count = len(cleaned) - n + 1
    return [cleaned[start:start + n] for start in range(window_count)]

In [19]:
# Build the TF-IDF matrix over character n-grams:
# `ngrams` is used as the analyzer, and each unique title part becomes one row.
vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
tf_idf_matrix = vectorizer.fit_transform(flat_list_uni)

In [20]:
# Print the position(s) of 'scientist' in the unique title list
for position, title in enumerate(flat_list_uni):
    if title == 'scientist':
        print(position)

245


In [21]:
# Print the TF-IDF row for 'scientist'.
# The row is looked up by value rather than hard-coded, so the cell stays
# correct when the dataset (and therefore the ordering) changes.
# tf_idf_matrix was fitted on flat_list_uni, so positions in the list and
# rows in the matrix line up.
scientist_row = flat_list_uni.index('scientist')
print(tf_idf_matrix[scientist_row])

  (0, 5053)	0.40092316243435583
  (0, 1469)	0.39989350433918536
  (0, 2869)	0.3912290470650775
  (0, 2076)	0.30426997984598536
  (0, 4014)	0.41031624629957697
  (0, 5441)	0.4108159672001371
  (0, 3045)	0.3107671704412755


In [22]:
# Cosine similarity, returned as a Compressed Sparse Row (CSR) matrix
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Multiply sparse matrices A and B through `ct.sparse_dot_topn`.

    Args:
        A: sparse matrix; converted to CSR. Its row count fixes the output rows.
        B: sparse matrix; converted to CSR. Its column count fixes the output columns.
        ntop: forwarded to `ct.sparse_dot_topn`; also sizes the output buffers
            (rows of A * ntop entries).
        lower_bound: forwarded to `ct.sparse_dot_topn`.
            # presumably results at or below this value are discarded -- confirm
            # against the sparse_dot_topn documentation.

    Returns:
        A scipy `csr_matrix` of shape (rows of A, columns of B) built from the
        buffers filled by `ct.sparse_dot_topn`.
    """
    # tocsr() is a no-op cost-wise when the input is already CSR
    left = A.tocsr()
    right = B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    index_type = np.int32
    capacity = n_rows * ntop

    # Output buffers that the compiled routine writes into
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    ct.sparse_dot_topn(
        n_rows, n_cols,
        np.asarray(left.indptr, dtype=index_type),
        np.asarray(left.indices, dtype=index_type),
        left.data,
        np.asarray(right.indptr, dtype=index_type),
        np.asarray(right.indices, dtype=index_type),
        right.data,
        ntop,
        lower_bound,
        out_indptr, out_indices, out_data)

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

In [23]:
# Time the similarity computation.
# ntop=100 and lower_bound=0.85 are passed to awesome_cossim_top
# (top 100 candidates per title, similarity threshold 0.85).
t1 = time.time()
matches = awesome_cossim_top(
    tf_idf_matrix,
    tf_idf_matrix.transpose(),
    ntop=100,
    lower_bound=0.85,
)
t = time.time() - t1
print("SELFTIMED:", t)

SELFTIMED: 5.957417011260986


In [25]:
# Create the matches dataframe with the similarity value of every pair
def get_matches_df(sparse_matrix, name_vector, top=10000):
    """Turn the non-zero entries of a sparse similarity matrix into a DataFrame.

    Args:
        sparse_matrix: scipy sparse matrix; entry (i, j) is the similarity
            between name_vector[i] and name_vector[j].
        name_vector: sequence of names indexed by matrix row/column number.
        top: maximum number of pairs to return. A falsy value (None or 0)
            returns every non-zero entry. Values larger than the number of
            non-zero entries are clamped instead of raising IndexError.

    Returns:
        DataFrame with columns 'left_side' (row name), 'right_side' (column
        name) and 'similairity' (matrix value), one row per pair.
    """
    # Take rows, columns and values from the same COO view and apply the same
    # non-zero mask to all three, so values stay aligned with their
    # coordinates even when the matrix stores explicit zeros.
    coo = sparse_matrix.tocoo()
    stored = coo.data != 0
    sparserows = coo.row[stored]
    sparsecols = coo.col[stored]
    values = coo.data[stored]

    available = sparsecols.size
    nr_matches = min(top, available) if top else available

    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)

    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = values[index]

    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similairity': similairity})

In [26]:
# Matched string similarity
# NOTE(review): top=5371 is hard-coded -- presumably the number of matches
# found in the original run; confirm, or pass top=None to keep every match.
matches_df = get_matches_df(matches, flat_list_uni, top=5371)
# Remove all exact matches. .copy() makes the filtered frame independent, so
# the column assignments in later cells do not trigger SettingWithCopyWarning.
matches_df = matches_df[matches_df['similairity'] < 0.99999].copy()
# Random preview (unseeded, so the rows shown differ between runs)
matches_df.sample(20,replace=True)


Unnamed: 0,left_side,right_side,similairity
3688,systems programmer,senior systems programmer,0.863102
268,programmer analyst,data programmer analyst,0.878023
1996,competitive intelligence,competitive intelligence analyst,0.923674
928,mathematics,of mathematics,0.886275
3217,product development,product development scientist,0.858567
2596,design engineer,data design engineer,0.876545
1598,machine learning,senior machine learning,0.867457
3093,data reviewer,data review,0.854878
3492,business data analyst,business data analyst intern,0.868709
2644,quality control chemist,quality control senior chemist,0.852782


In [28]:
# Save the matched-similarity table as a CSV file (row index omitted)
matches_df.to_csv(
    'D:\\GWU\\Spring 2019\\DATS 6202\\Python\\Job & Resume\\matches.csv',
    index=False,
)

In [29]:
# Count how often each title part occurs in the flattened list
counter = collections.Counter()
counter.update(flat_list)

In [30]:
# Use the counts as frequencies: build a (Titles, Count) table sorted by
# descending count, then convert it to a {title: count} dictionary
kk = pd.DataFrame({
    'Titles': list(counter.keys()),
    'Count': list(counter.values()),
})
kk = kk.sort_values(by=['Count'], ascending=False)
kk_dict = kk.set_index('Titles').to_dict()['Count']

In [36]:
# Look up the frequency of the title on each side of every match.
# Series.map is a direct per-column lookup in kk_dict ({title: count}); a
# frame-wide replace() would scan every column against every dictionary key.
# A title missing from kk_dict becomes NaN instead of staying a string.
new_match = matches_df.assign(
    left_side=matches_df['left_side'].map(kk_dict),
    right_side=matches_df['right_side'].map(kk_dict),
)
new_match['Difference'] = new_match['left_side'] - new_match['right_side']
matches_df['left_side_value'] = new_match['left_side']
matches_df['right_side_value'] = new_match['right_side']
# True where the right-hand title is more frequent than the left-hand one
idx = (new_match['Difference'] < 0)

In [37]:
# Put the more frequent title on the left side and the less frequent one on the right.
# `aa` is another name for matches_df (not a copy), so the swaps below also
# change matches_df.
aa = matches_df
for first_col, second_col in (('left_side', 'right_side'),
                              ('left_side_value', 'right_side_value')):
    # .values drops the column labels so the two columns really trade places
    aa.loc[idx, [first_col, second_col]] = aa.loc[idx, [second_col, first_col]].values
aa['Difference'] = aa['left_side_value'] - aa['right_side_value']

In [38]:
# Left-join every unique title (as 'right_side') onto the match table, then
# sort by the frequency of the matched left-side title
df_uniquetitle = (
    pd.DataFrame({'right_side': flat_list_uni})
    .merge(aa, how='left', left_on='right_side', right_on='right_side')
    .sort_values(by=['left_side_value'])
)

In [39]:
# Keep only the rows without missing values and convert them to a
# {right_side: left_side} dictionary. When a right_side title occurs in
# several rows, the last row wins.
df_uniquetitle_notna = df_uniquetitle.dropna()
kkk_dict = df_uniquetitle_notna.set_index('right_side')['left_side'].to_dict()

In [52]:
# Use the dictionary to replace titles in the flattened unique list with their
# matched counterpart, which narrows down the set of unique titles
df_uniquetitle_new = pd.DataFrame({'right_side': flat_list_uni})
df_uniquetitle_new = df_uniquetitle_new.replace(kkk_dict)
# Order-preserving dedup in linear time (the reduce/list-concatenation form
# rescanned and copied the list for every element)
uniquetitle_new = list(dict.fromkeys(df_uniquetitle_new['right_side']))

In [55]:
print("Total new uniquetitles: ",len(uniquetitle_new))

Total new uniquetitles:  22285
