# Pre-Processing Resume Text Column to Prepare for matching - first iteration

In [3]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
%matplotlib inline

import re
import datetime
from datetime import date
from time import strptime

import RAKE as rake
import operator


###############################################################################################
## Working on Resume data
###############################################################################################

In [38]:
# First reading my resume csv
resume = pd.read_csv('wip/resume_sorted5.csv')

In [39]:
#initial info
resume.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14428 entries, 0 to 14427
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   index                   14428 non-null  int64 
 1   Resume_title            14428 non-null  object
 2   City                    14428 non-null  object
 3   location                14428 non-null  int64 
 4   Description             14428 non-null  object
 5   work_experiences        14428 non-null  object
 6   Educations              14428 non-null  object
 7   Skills                  14428 non-null  object
 8   Links                   14428 non-null  object
 9   Certificates            14428 non-null  object
 10  Additional Information  14428 non-null  object
 11  is_grad                 14428 non-null  int64 
 12  is_postgrad             14428 non-null  int64 
 13  is_doc                  14428 non-null  int64 
 14  edu_unknown             14428 non-null  int64 
 15  Co

#########################################################################################################
## To match resume with jobs, I need to have similar 20 vectors, that I created to train my Doc2Vec model for jobs. 

### For training my jobs model, I picked  text data from :
* job title
* job description
* skills
* industry

### So for training my resume model, I need similar text, thus picking:
* Resume_title
* Resume description 
* skills
* Additional Information


#########################################################################################################

In [40]:
resume['Resume_title'] = resume['Resume_title'].str.lower()
resume['Skills']=resume['Skills'].str.lower()
resume['Description'] = resume['Description'].str.lower()
resume['Additional Information'] = resume['Additional Information'].str.lower()

In [41]:
resume['Description'].replace('none', ' ',inplace=True)
resume['Additional Information'].replace('none', ' ',inplace=True)

In [43]:
df_resume = resume[['resume_id','Resume_title' ]]
df_resume['resume_combo'] = resume['Resume_title'] +" " + resume['Description'] +" " + resume['Skills'] + " "+resume['Additional Information'] + " "+resume['experience_desc']
df_resume.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,resume_id,Resume_title,resume_combo
0,0,java developer,"java developer to prove myself dedicated, wort..."
1,1,software developer,software developer working as software develop...
2,2,java developer,java developer looking for a challenging caree...
3,3,seeking innovative and challenging career assi...,seeking innovative and challenging career assi...
4,4,java developer,java developer ['project: hr payroll systems...


In [44]:
docs = df_resume['resume_combo']
docs_sample = docs.head(10)
docs_sample

0    java developer to prove myself dedicated, wort...
1    software developer working as software develop...
2    java developer looking for a challenging caree...
3    seeking innovative and challenging career assi...
4    java developer   ['project: hr payroll systems...
5    java developer   ['java']   ['have the potenti...
6    java developer to secure a challenging positio...
7    searching job for java developer   ['c++', ' h...
8    mca / with 3 years of development experience •...
9    java developer attain the position of 'java de...
Name: resume_combo, dtype: object

In [45]:
#Import all the dependencies
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
set(stopwords.words('english'))

import string

import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shail\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('ã¯æ’ëœ')
stopwords.append('\n')
stopwords.append('•')
#Transforms words to TFIDF
vectorizer = TfidfVectorizer(stop_words = stopwords)

index = 0
keys = {}

for rem in df_resume.itertuples() :
    key = rem[1]
    keys[key] = index
    index += 1

#Fit the vectorizer to the data
vectorizer.fit(df_resume['resume_combo'].fillna(''))

#Transform the data
tfidf_scores = vectorizer.transform(df_resume['resume_combo'].fillna(''))

print(tfidf_scores.shape)
print(df_resume.shape)

  'stop_words.' % sorted(inconsistent))


(14428, 70688)
(14428, 3)


In [48]:
test = pd.DataFrame(tfidf_scores.toarray(), columns = vectorizer.get_feature_names())

In [49]:
test.head()

Unnamed: 0,00,000,0000,00089765,00089805,000webhostapp,001,002,003,00353,...,õle,øcreated,ǁǁǁǁǁǁ,ηadoop,τrain,τοοls,чєαr,ﬁled,ﬁnancial,ﬁxing
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Creating my Stopword list
#### As seen there are so many unwanted tokens like numbers,ïƒ¼ etc , I need to add them in "stop words" list to train model

In [50]:
#getting list of all tokens
word_list = test.columns.tolist()

In [51]:
##Getting a list of unwanted words as s_words and adding to stopwords
s_words =[]
for word in word_list:
    #print(word)
    if re.search("^\W|^\d",word):
        s_words.append(word)

In [52]:
s_words.append('')        
from nltk.corpus import stopwords
stopword_set = set(stopwords.words('english'))
stopword_set = list(stopword_set)
stopword_set.extend(s_words)

In [53]:
def preprocess(text):
    stop_words = stopword_set
    #0. split words by whitespace
    text = text.split()
    
    
    # 1. lower case
    text = [word.lower() for word in text]
    
    # 2. remove punctuations
    punc_table = str.maketrans('','',string.punctuation)
    text = [word.translate(punc_table) for word in text]
    
    # 3. remove stop words
    text = [word for word in text if word not in stop_words]
    
    return text

In [54]:
tokenized_doc = []
doc = df_resume['resume_combo']
#doc = docs_sample
for d in doc:
    tokenized_doc.append(preprocess(d))
#tokenized_doc

In [55]:
# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]

In [56]:
num_doc = len(tagged_data)
num_doc
#confirm length (should be 14428)
len(tokenized_doc)

14428

In [58]:
from gensim.test.utils import get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec

class EpochSaver(CallbackAny2Vec):

    def __init__(self, path_prefix):
        self.path_prefix = path_prefix
        self.epoch = 0

    def on_epoch_end(self, model):
        output_path = get_tmpfile('{}_epoch{}.model'.format(self.path_prefix, self.epoch))
        model.save(output_path)
        self.epoch += 1

In [59]:
class EpochLogger(CallbackAny2Vec):
    
    def __init__(self):
        self.epoch = 0
        
    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1

In [61]:
## Load saved doc2vec model
model= Doc2Vec.load("Model/my_doc2vec.model")

In [62]:
## Get vector value
vec = np.empty([14428,20])

for k,i in enumerate(tokenized_doc):
    
    #print(i)
    vector = model.infer_vector(i)
    vec[k] = vector
    #vec = np.append(vector)
    #vecf = np.append(vec,vector)

# reshape into 2D
new_arr = np.reshape(vec,(-1,20))

In [64]:
rng = range(1, 21)
vec_df = pd.DataFrame(new_arr, columns=['vec_' + str(i) for i in rng])

In [65]:
vec_df.head(5)

Unnamed: 0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,vec_10,vec_11,vec_12,vec_13,vec_14,vec_15,vec_16,vec_17,vec_18,vec_19,vec_20
0,-3.145642,-0.40938,0.70116,-0.938745,0.585239,3.585946,-0.120781,0.111222,1.644105,2.184981,-2.117909,-0.08543,-2.877392,0.239383,-1.582871,1.435642,-1.05145,1.960831,1.786694,-2.375981
1,-0.786235,-1.306011,-1.383107,-1.669708,0.832136,1.84979,0.178872,-1.736894,0.741685,1.553933,-2.916478,-0.712572,-0.502129,-0.849293,0.435406,0.33933,0.060282,-0.415035,3.203696,-3.607635
2,-2.747642,-1.721797,-0.910322,-0.77595,1.472325,2.455998,-0.85215,-0.150517,0.844202,1.380623,-1.018832,0.777981,-1.977556,0.853214,-1.281344,2.195391,0.800305,1.078035,2.1669,-2.658121
3,-1.77177,-1.37585,-0.475922,-0.784473,-0.37724,1.596389,1.09422,-0.253642,0.468265,2.149588,-1.234415,0.295536,-2.615532,0.115959,-2.044196,-0.769109,-0.716604,1.145388,3.452934,-1.008162
4,-0.097372,-1.405603,-0.801234,-0.248921,-0.376417,-0.15705,-0.29044,-1.440582,-0.169669,1.190537,-0.291407,-1.0805,-2.950497,0.031693,0.119182,-0.883555,0.178819,-0.858324,1.239632,-0.043914


In [66]:
con_resume = pd.concat([resume, vec_df], axis=1)
con_resume.to_csv('wip/con_resume.csv', index=False)

In [44]:
#con_resume.info()

In [49]:
tokenized_doc = []
#doc = df_resume['resume_combo']
doc = docs_sample
for d in doc:
    tokenized_doc.append(preprocess(d))
#tokenized_doc

In [50]:
# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]

In [51]:
num_doc = len(tagged_data)
num_doc
#confirm length (should be 38941)
len(tokenized_doc)

10

In [58]:
## Load saved doc2vec model
model= Doc2Vec.load("my_doc2vec.model")

## Get vector value
vec = np.empty([10,20])

for k,i in enumerate(tokenized_doc):
    
    #print(i)
    vector = model.infer_vector(i)
    vec[k] = vector
    #vec = np.append(vector)
    #vecf = np.append(vec,vector)

# reshape into 2D
new_arr = np.reshape(vec,(-1,20))

In [None]:
test = np.array([[1,2,3],[4,5,6]])
test[0]

In [61]:
new_arr[0]

array([-3.14492106, -0.41021681,  0.70149601, -0.93887955,  0.58496076,
        3.58589458, -0.12033088,  0.11019378,  1.64519656,  2.18371987,
       -2.11720061, -0.08485675, -2.87654066,  0.24021174, -1.58367932,
        1.43522847, -1.05121636,  1.96061814,  1.78778028, -2.37729073])

In [62]:
rng = range(1, 21)
vec_df = pd.DataFrame(new_arr, columns=['vec_' + str(i) for i in rng])

In [63]:
vec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 20 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   vec_1   10 non-null     float64
 1   vec_2   10 non-null     float64
 2   vec_3   10 non-null     float64
 3   vec_4   10 non-null     float64
 4   vec_5   10 non-null     float64
 5   vec_6   10 non-null     float64
 6   vec_7   10 non-null     float64
 7   vec_8   10 non-null     float64
 8   vec_9   10 non-null     float64
 9   vec_10  10 non-null     float64
 10  vec_11  10 non-null     float64
 11  vec_12  10 non-null     float64
 12  vec_13  10 non-null     float64
 13  vec_14  10 non-null     float64
 14  vec_15  10 non-null     float64
 15  vec_16  10 non-null     float64
 16  vec_17  10 non-null     float64
 17  vec_18  10 non-null     float64
 18  vec_19  10 non-null     float64
 19  vec_20  10 non-null     float64
dtypes: float64(20)
memory usage: 1.7 KB


In [35]:
r1.to_csv('test_r.csv',index=False)

In [24]:
r1 = resume.head(10)

In [36]:
# read each work experience
resume['work_experiences'] = resume['work_experiences'].str.lower()

resume_all_desc = []
for index, rows in resume.iterrows():
    #print('#@#@#@#@#@@#@#@#@#@##@@#@#@@##@#@#@#@#@#@##@#@#@##@#@@#@#@#')
    #print(f'resume no. {index}')
    resume_desc= []
    #pick work experience col and read it as JSON 
    
    work = resume['work_experiences'][index]
    try: result_work = eval(work)
    except: continue
    #print(f'resume  :  {index}')
    #read description to match with job
    
    for i in result_work:    
        w_title_n = (result_work[0][0]['wtitle:'])            
        w_company= (result_work[i][1]['wcompany:'])
#         resume_desc.append(w_company) 
        w_city= (result_work[i][2]['wcity:'])
        w_state= (result_work[i][3]['wstate:'])
        w_duration= (result_work[i][4]['wduration:'])
           
        w_descr= (result_work[i][5]['wdescr:'])
        if  (w_descr == 'none'):
            continue
        #print(w_descr)
        #print('**************')
        resume_desc.append(w_descr + '')   
        
    #print(resume_desc)
    resume_all_desc.append(resume_desc)
#print(resume_test)
resume['experience_desc'] = resume_all_desc

In [37]:
#resume.to_csv('wip/resume_sorted5.csv',index=False)