# Pre-Processing Resume Text Column to Prepare for Matching - final 

In [1]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
%matplotlib inline

import re
import datetime
from datetime import date
from time import strptime

import RAKE as rake
import operator


###############################################################################################
## Working on Resume data
###############################################################################################

In [2]:
# First reading my resume csv
resume = pd.read_csv('wip/resume_sorted6.csv')

In [3]:
#initial info
resume.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14428 entries, 0 to 14427
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   index                   14428 non-null  int64 
 1   Resume_title            14428 non-null  object
 2   City                    14428 non-null  object
 3   location                14428 non-null  int64 
 4   Description             14428 non-null  object
 5   work_experiences        14428 non-null  object
 6   Educations              14428 non-null  object
 7   Skills                  14428 non-null  object
 8   Links                   14428 non-null  object
 9   Certificates            14428 non-null  object
 10  Additional Information  14428 non-null  object
 11  is_grad                 14428 non-null  int64 
 12  is_postgrad             14428 non-null  int64 
 13  is_doc                  14428 non-null  int64 
 14  edu_unknown             14428 non-null  int64 
 15  Co

#########################################################################################################
## To match resume with jobs, I need to have similar 20 vectors, that I created to train my Doc2Vec model for jobs. 

### For training my jobs model, I picked  text data from :
* job title
* job description
* skills
* industry

### So for training my resume model, I need similar text, thus picking:
* Resume_title
* Resume description 
* skills
* Additional Information


#########################################################################################################

In [40]:
resume['Resume_title'] = resume['Resume_title'].str.lower()
resume['Skills']=resume['Skills'].str.lower()
resume['Description'] = resume['Description'].str.lower()
resume['Additional Information'] = resume['Additional Information'].str.lower()

In [41]:
resume['Description'].replace('none', ' ',inplace=True)
resume['Additional Information'].replace('none', ' ',inplace=True)

In [5]:
df_resume = resume[['resume_id','Resume_title' ]]
df_resume['resume_combo'] = resume['Resume_title'] +" " + resume['Description'] +" " + resume['Skills'] + " "+resume['Additional Information'] + " "+resume['experience_desc']
df_resume.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,resume_id,Resume_title,resume_combo
0,0,java developer,"java developer to prove myself dedicated, wort..."
1,1,software developer,software developer working as software develop...
2,2,java developer,java developer looking for a challenging caree...
3,3,seeking innovative and challenging career assi...,seeking innovative and challenging career assi...
4,4,java developer,java developer ['project: hr payroll systems...


In [6]:
docs = df_resume['resume_combo']
docs_sample = docs.head(10)
docs_sample

0    java developer to prove myself dedicated, wort...
1    software developer working as software develop...
2    java developer looking for a challenging caree...
3    seeking innovative and challenging career assi...
4    java developer   ['project: hr payroll systems...
5    java developer   ['java']   ['have the potenti...
6    java developer to secure a challenging positio...
7    searching job for java developer   ['c++', ' h...
8    mca / with 3 years of development experience •...
9    java developer attain the position of 'java de...
Name: resume_combo, dtype: object

In [7]:
#Import all the dependencies
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
set(stopwords.words('english'))

import string

import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shail\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('ã¯æ’ëœ')
stopwords.append('\n')
stopwords.append('•')
#Transforms words to TFIDF
vectorizer = TfidfVectorizer(stop_words = stopwords)

index = 0
keys = {}

for rem in df_resume.itertuples() :
    key = rem[1]
    keys[key] = index
    index += 1

#Fit the vectorizer to the data
vectorizer.fit(df_resume['resume_combo'].fillna(''))

#Transform the data
tfidf_scores = vectorizer.transform(df_resume['resume_combo'].fillna(''))

print(tfidf_scores.shape)
print(df_resume.shape)

  'stop_words.' % sorted(inconsistent))


(14428, 70688)
(14428, 3)


In [10]:
test = pd.DataFrame(tfidf_scores.toarray(), columns = vectorizer.get_feature_names())

In [11]:
test.head()

Unnamed: 0,00,000,0000,00089765,00089805,000webhostapp,001,002,003,00353,...,õle,øcreated,ǁǁǁǁǁǁ,ηadoop,τrain,τοοls,чєαr,ﬁled,ﬁnancial,ﬁxing
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Creating my Stopword list
#### As seen there are so many unwanted tokens like numbers,ïƒ¼ etc , I need to add them in "stop words" list to train model

In [12]:
#getting list of all tokens
word_list = test.columns.tolist()

In [13]:
##Getting a list of unwanted words as s_words and adding to stopwords
s_words =[]
for word in word_list:
    #print(word)
    if re.search("^\W|^\d",word):
        s_words.append(word)

In [14]:
s_words.append('')        
from nltk.corpus import stopwords
stopword_set = set(stopwords.words('english'))
stopword_set = list(stopword_set)
stopword_set.extend(s_words)

In [15]:
def preprocess(text):
    stop_words = stopword_set
    #0. split words by whitespace
    text = text.split()
    
    
    # 1. lower case
    text = [word.lower() for word in text]
    
    # 2. remove punctuations
    punc_table = str.maketrans('','',string.punctuation)
    text = [word.translate(punc_table) for word in text]
    
    # 3. remove stop words
    text = [word for word in text if word not in stop_words]
    
    return text

In [16]:
tokenized_doc = []
doc = df_resume['resume_combo']
#doc = docs_sample
for d in doc:
    tokenized_doc.append(preprocess(d))
#tokenized_doc

In [17]:
# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]

In [18]:
num_doc = len(tagged_data)
num_doc
#confirm length (should be 14428)
len(tokenized_doc)

14428

In [21]:
## Load saved doc2vec model
model= Doc2Vec.load("Model/my_doc2vec_v2.model")

In [24]:
## Get vector value
vec = np.empty([14428,20])

for k,i in enumerate(tokenized_doc):
    
    #print(i)
    vector = model.infer_vector(i)
    vec[k] = vector

# reshape into 2D
new_arr = np.reshape(vec,(-1,20))

In [25]:
rng = range(1, 21)
vec_df = pd.DataFrame(new_arr, columns=['vec_' + str(i) for i in rng])

In [26]:
vec_df.head(5)

Unnamed: 0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,vec_10,vec_11,vec_12,vec_13,vec_14,vec_15,vec_16,vec_17,vec_18,vec_19,vec_20
0,3.003397,1.462391,-0.732206,2.032145,-3.291425,1.626622,1.269785,-1.303818,-1.78169,-3.893606,0.582851,-2.39043,0.612694,4.274847,-1.641325,1.098874,-0.534998,0.338975,-2.081308,-3.480031
1,3.969832,-1.478794,-1.997424,1.502539,-3.507508,2.108994,-0.38664,1.494396,0.454764,-2.268685,-1.505257,-2.332494,-0.431022,1.431269,-0.896382,-0.267269,1.433352,0.438305,-0.992093,-0.096142
2,1.442701,0.011723,-2.126506,0.655804,-3.984513,0.792035,1.317094,-0.69671,-1.563318,-3.040591,-0.367393,-3.774975,-1.183595,2.456486,-1.270981,2.475039,-1.99011,0.130853,-0.589791,-2.782936
3,1.803033,-0.120398,-1.159959,0.066225,-3.522508,1.321965,-0.756211,-0.24901,-0.074644,-2.314389,0.557041,-3.887409,-1.070027,3.894971,-0.957399,-0.952996,-0.824266,0.038712,1.194561,-1.206788
4,-0.434019,0.551527,-1.531551,-0.767032,-0.514473,0.286549,-0.563888,0.310748,0.457921,-1.334632,0.18315,-0.547834,1.218995,0.536182,0.995981,-0.87473,-0.138916,0.882186,-0.129402,-1.793177


In [27]:
# concatenate and safe the resume csv file
con_resume_1 = pd.concat([resume, vec_df], axis=1)
con_resume_1.to_csv('wip/con_resume_1.csv', index=False)