Companies often receive thousands of resumes for each job posting and employ dedicated screening officers to shortlist qualified candidates. Finding suitable candidates for an open role in a database of thousands of resumes can be a tough task. Automated resume categorization can speed up the candidate selection process. Such automation can greatly ease the tedious process of fair screening and shortlisting of the right candidates, and aid quick decision-making.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib.gridspec import GridSpec
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud

#### Downloading the data

In [None]:
#@title Download the data
# Fetch the resume dataset CSV into the current working directory (wget runs quietly).
!wget -qq https://cdn.iisc.talentsprint.com/CDS/Datasets/UpdatedResumeDataSet.csv

Read the `UpdatedResumeDataSet.csv` file into a DataFrame

In [None]:
# Load the downloaded CSV; later cells use its 'Category' and 'Resume' columns.
df = pd.read_csv('UpdatedResumeDataSet.csv', encoding='utf-8')
# Preview the first rows.
df.head()

### Pre-processing and EDA

Display  all the categories of resumes and their counts in the dataset



In [None]:
# Display the distinct resume categories present in the 'Category' column
print(df['Category'].unique())

In [None]:
# Display each distinct resume category together with the number of records belonging to it
print(df['Category'].value_counts())

Create the count plot of different categories

In [None]:
# Horizontal count plot: one bar per resume category, bar length = number of resumes.
plt.figure(figsize=(15,15))
# NOTE(review): this rotates the x-axis tick labels, but the categories are placed
# on the y-axis below (y="Category") — confirm whether the rotation is still wanted.
plt.xticks(rotation=90)
sns.countplot(y="Category", data=df)

Create a pie plot depicting the percentage of resume distributions category-wise

In [None]:
# Number of resumes per category and the matching category labels.
targetCounts = df['Category'].value_counts()
targetLabels = targetCounts.index

# Large figure; the pie is drawn in the top-right cell of a 2x2 grid.
plt.figure(1, figsize=(25,25))
the_grid = GridSpec(2, 2)

# One colour per category, sampled evenly across the whole 'coolwarm' range.
# np.linspace(0, 1) on its own returns 50 samples (its default `num`), so with
# fewer than 50 categories only the leading part of the colormap would be used.
cmap = plt.get_cmap('coolwarm')
colors = [cmap(i) for i in np.linspace(0, 1, len(targetCounts))]
plt.subplot(the_grid[0, 1], aspect=1, title='RESUME CATEGORY DISTRIBUTION')

# Each wedge is annotated with its percentage share, to one decimal place.
source_pie = plt.pie(targetCounts, labels=targetLabels, autopct='%1.1f%%', shadow=True, colors=colors)
plt.show()

Convert all the `Resume` text to lower case 




In [None]:
# Convert all characters to lowercase (this overwrites the original 'Resume' column)
df['Resume']=df['Resume'].str.lower()
# Print the lowercased column to confirm the result
print(df['Resume'])

Cleaning Resume

Define a function to clean the resume text


In [None]:
import re

# Patterns are compiled once and written as raw strings so that `\S` / `\s`
# are regex escapes rather than (invalid) Python string escapes.
_URL_RE = re.compile(r'http\S+\s*')
# Word boundaries keep "cc" from being cut out of words such as "account".
_RT_CC_RE = re.compile(r'\b(?:RT|cc)\b')
_HASHTAG_RE = re.compile(r'#\S+')
_MENTION_RE = re.compile(r'@\S+')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""))
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')
_WHITESPACE_RE = re.compile(r'\s+')


def cleanResume(resumeText):
    """Return ``resumeText`` with noise removed for text vectorisation.

    The following are stripped, in order: URLs, the standalone tokens
    ``RT`` and ``cc``, hashtags, @-mentions, ASCII punctuation and
    non-ASCII characters. Runs of whitespace are then collapsed to a
    single space. Leading/trailing whitespace is collapsed but not
    stripped, so the result may start or end with one space.

    Args:
        resumeText: The resume text to clean (a ``str``).

    Returns:
        The cleaned text as a ``str``.
    """
    resumeText = _URL_RE.sub(' ', resumeText)  # remove URLs
    resumeText = _RT_CC_RE.sub(' ', resumeText)  # remove standalone RT and cc
    resumeText = _HASHTAG_RE.sub('', resumeText)  # remove hashtags
    resumeText = _MENTION_RE.sub('  ', resumeText)  # remove mentions
    resumeText = _PUNCTUATION_RE.sub(' ', resumeText)  # remove punctuations
    resumeText = _NON_ASCII_RE.sub(' ', resumeText)  # remove non-ASCII characters
    resumeText = _WHITESPACE_RE.sub(' ', resumeText)  # remove extra whitespace
    return resumeText

In [None]:
# Run the cleaner over every resume and keep the result in a new column,
# leaving the 'Resume' column as it is.
df['cleaned_resume'] = df['Resume'].apply(cleanResume)
# Spot-check one cleaned resume (row label 31).
print(df['cleaned_resume'][31])

In [None]:
# Preview the data after cleaning; it now includes the 'cleaned_resume' column
df.head()

In [None]:
# Number of whitespace-separated words in each cleaned resume.
sent_lens = [len(text.split()) for text in df.cleaned_resume]

# How many resumes were measured, and the word count of the longest one.
print(len(sent_lens))
print(max(sent_lens))

In [None]:
# Show the lowercased resume and its cleaned version side by side for row label 100
df["Resume"][100] ,  df["cleaned_resume"][100]

### Stop Words Removal

Use the `nltk` package to remove stop words and find the most common words in the `cleaned_resume` column

In [None]:
# Stop words: NLTK's English list plus the two extra tokens `` and ''
# (presumably quote tokens produced by the tokenizer — confirm).
oneSetOfStopWords = set(stopwords.words('english')) | {'``', "''"}
oneSetOfStopWords

In [None]:
# Most common words across ALL cleaned resumes.
# The previous version looped over a hard-coded range(0, 160), which ignored
# every resume after the first 160 and raised IndexError on smaller datasets.
totalWords = []
Sentences = df['cleaned_resume'].values

# The column already holds cleaned text, so it is tokenised directly
# instead of being passed through cleanResume a second time.
for cleanedText in Sentences:
    requiredWords = nltk.word_tokenize(cleanedText)
    totalWords.extend(
        word for word in requiredWords
        if word not in oneSetOfStopWords and word not in string.punctuation
    )

# Join with a space so the last word of one resume cannot fuse with the
# first word of the next (this text feeds the word cloud).
cleanedSentences = " ".join(Sentences)

# Frequency table of the remaining words; report the 50 most frequent.
wordfreqdist = nltk.FreqDist(totalWords)
mostcommon = wordfreqdist.most_common(50)
print(mostcommon)

In [None]:
# Build a word cloud (default settings) from the combined cleaned resume text.
wc = WordCloud()
wc.generate(cleanedSentences)

# Draw it on a large canvas with the axes hidden.
plt.figure(figsize=(15,15))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

Convert the categorical variable `Category` to a numerical feature and make a different column, which can be treated as the target variable

In [None]:
# Encode each category name as an integer id and store the ids in a separate
# column, which serves as the target variable.
labelencoder = LabelEncoder()
labelencoder.fit(df["Category"])
df["Category_Labelled"] = labelencoder.transform(df["Category"])

# Preview the data with the new 'Category_Labelled' column.
df.head()

### Feature Extraction

Convert the cleaned resume text to feature vectors by applying `TfidfVectorizer`. The label-encoded category made above is used as the target.



In [None]:
# Model inputs: the cleaned resume text; targets: the integer-encoded categories.
Text = df['cleaned_resume'].values
op_labels = df['Category_Labelled'].values

# Learn a TF-IDF vocabulary capped at 1500 terms and vectorise the text in one step.
# NOTE(review): the vectoriser is fitted on every row before the train/test split
# further down, so its statistics include the held-out resumes — confirm this is acceptable.
word_vectorizer = TfidfVectorizer(max_features = 1500)
features = word_vectorizer.fit_transform(Text)

In [None]:
# List the vocabulary terms selected by the vectoriser.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (it returns an array of strings).
word_vectorizer.get_feature_names_out()

## Naive Bayes Classifier

Split the data into train and test sets. Apply Naive Bayes Classifier (MultinomialNB)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    op_labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Multinomial Naive Bayes classifier with its default settings
clf = MultinomialNB()

In [None]:
# Train the classifier on the training split
clf.fit(X_train, y_train)

In [None]:
# Accuracy measured on the same data the model was trained on.
train_accuracy = clf.score(X_train, y_train)
print(f'Accuracy of NaiveBayes Classifier on training set: {train_accuracy:.2f}')

In [None]:
# Predict the encoded category of each resume in the test split
pred = clf.predict(X_test)

In [None]:
# Classification report comparing the test labels with the predictions.
report = metrics.classification_report(y_test, pred)
print(f"\n Classification report for classifier {clf}:\n{report}\n")