Installing required packages

In [None]:
!pip install flask-ngrok

In [None]:
!pip install werkzeug

In [None]:
!pip install python-docx

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
%cd /content/gdrive/MyDrive/Resume_Screening/

Loading the model and word vectorizer

In [None]:
import pickle

In [None]:
# Load the pickled classifier (described here as an SGDClassifier) from
# 'model.pkl' in the current working directory and bind it to `model`.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a model.pkl that comes from a trusted source.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
# Load the pickled text vectorizer from 'vectorizer.pkl' in the current working
# directory and bind it to `word_vectorizer`.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a vectorizer.pkl that comes from a trusted source.
with open('vectorizer.pkl', 'rb') as f:
    word_vectorizer = pickle.load(f)

Categories 

In [None]:
# Human-readable category names, looked up by position: process_folder() uses the
# column indices produced by np.argsort(model.predict_proba(...)) as indices into
# this list, so the order of the entries is load-bearing.
# NOTE(review): presumably this matches the label encoding used at training time -- confirm.
categories = ["Advocate","Arts","Automation Testing","Blockchain","Business Analyst","Civil Engineer","Data Science","Database","DevOps Engineer","DotNet Developer","ETL Developer","Electrical Engineering","HR","Hadoop","Health and fitness","Java Developer","Mechanical Engineer","Network Security Engineer","Operations Manager","PMO","Python Developer","SAP Developer","Sales","Testing","Web Designing"]

Functions used in routes.

In [None]:
#function to check whether the given filename has one of the allowed extensions.
def allowed_file(filename):
    """Tell whether ``filename`` carries an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last '.', lower-cased before the
    lookup. A name that contains no '.' at all is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    return dot == '.' and extension.lower() in ALLOWED_EXTENSIONS


In [None]:
#function to unzip file and load it onto google drive.
def unzip(upload_folder=None):
  """Extract every zip archive found in the upload folder, then delete the archive.

  Args:
    upload_folder: directory to scan and extract into. When omitted, the
      module-level UPLOAD_FOLDER is used (the behaviour existing callers get).

  Raises:
    zipfile.BadZipFile: if an entry ending in ".zip" is not a valid archive.
      The offending file is still removed so it cannot break later calls.
  """
  if upload_folder is None:
    upload_folder = UPLOAD_FOLDER
  extension = ".zip"

  for item in os.listdir(upload_folder):
    # allowed_file() accepts the extension case-insensitively, so "X.ZIP" must
    # be recognised here as well or the upload would never be unpacked.
    if not item.lower().endswith(extension):
      continue
    file_name = os.path.join(upload_folder, item)
    try:
      # NOTE(review): the archive is user-supplied; extractall() trusts its
      # member list, so very large archives are not guarded against here.
      with zipfile.ZipFile(file_name) as zip_ref:
        zip_ref.extractall(upload_folder)
    finally:
      # The archive is only a transport container: drop it whether or not
      # extraction succeeded, so a corrupt upload does not poison the folder.
      os.remove(file_name)

In [None]:
#function to get the text from resumes.
def getText(filename):
  """Return the text of a .docx file: each paragraph's text, joined by newlines."""
  document = docx.Document(filename)
  return '\n'.join(paragraph.text for paragraph in document.paragraphs)

In [None]:
#function to clean resumes.
def cleaned_data(resume_data):
    """Normalise raw resume text with a fixed sequence of regex substitutions.

    The steps run in this order, each on the output of the previous one:
    URLs, the literal tokens "RT"/"cc", hashtags, mentions, ASCII punctuation,
    non-ASCII characters, whitespace runs, and finally digit runs.

    Args:
        resume_data: the resume text as a single string.

    Returns:
        The cleaned string.

    Notes:
        * The "RT|cc" pattern is not word-bounded, so "account" becomes
          "a ount". Digits are removed after whitespace is collapsed, so a
          removed number can leave consecutive spaces. Both quirks are kept
          on purpose. NOTE(review): presumably the pickled vectorizer was
          fitted on text cleaned exactly this way -- confirm before changing.
        * Patterns are raw strings: the previous plain literals relied on
          invalid escape sequences, which newer Python versions warn about.
    """
    substitutions = (
        (r'http\S+\s*', ' '),      # URLs (and the whitespace that follows them)
        (r'RT|cc', ' '),           # RT and cc
        (r'#\S+', ''),             # hashtags
        (r'@\S+', '  '),           # mentions
        ('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' '),  # punctuation
        (r'[^\x00-\x7f]', ' '),    # non-ASCII characters
        (r'\s+', ' '),             # runs of whitespace
        (r'[0-9]+', ' '),          # numbers
    )
    for pattern, replacement in substitutions:
        resume_data = re.sub(pattern, replacement, resume_data)
    return resume_data

In [None]:
#function which sends the data through model and returns required data.
def process_folder(categories, directory='/content/gdrive/MyDrive/Resume_Screening/Kaggle/Resumes'):
  """Classify every .docx resume in ``directory`` and collect the results.

  Args:
    categories: list of category names; column index ``j`` of
      ``model.predict_proba`` is translated to ``categories[j]``.
    directory: folder holding the resumes. Defaults to the location the upload
      flow has always used.

  Returns:
    A tuple ``(new_resume_data, pie_chart, cleaned_resumes)`` where
    * ``new_resume_data`` maps a running index to
      ``(three_category_names, raw_text, filename)``,
    * ``pie_chart`` maps a category name to how many times it appeared among
      the three retained categories of any resume,
    * ``cleaned_resumes`` is the list of cleaned texts, in the same order.
  """
  # Only .docx documents can be opened by getText(). Archives routinely carry
  # extras (".DS_Store", Word "~$" lock files, other formats) that would
  # otherwise abort the whole request. Sorting makes the output order stable,
  # since os.listdir() gives no ordering guarantee.
  filenames = sorted(
      name for name in os.listdir(directory)
      if name.lower().endswith('.docx') and not name.startswith('~$')
  )
  resumes = [getText(os.path.join(directory, name)) for name in filenames]

  cleaned_resumes = []
  new_resume_data = {}
  pie_chart = {}
  for count, resume in enumerate(resumes):
    # cleaning the resume text
    cleaned_resume = cleaned_data(resume)
    cleaned_resumes.append(cleaned_resume)
    # getting the feature vector for the cleaned text.
    word_features = word_vectorizer.transform([cleaned_resume])
    # getting the probabilities for each category.
    probabilities = model.predict_proba(word_features)
    # argsort is ascending, so the last three columns are the three highest
    # probabilities, ordered from the lowest of the three to the highest.
    best_three = np.argsort(probabilities, axis=1)[:, -3:].tolist()

    best_three_list = []
    for row in best_three:
      for j in row:
        # converting the encoded labels to the actual labels.
        label = categories[j]
        best_three_list.append(label)
        pie_chart[label] = pie_chart.get(label, 0) + 1

    new_resume_data[count] = (best_three_list, resume, filenames[count])
  return new_resume_data, pie_chart, cleaned_resumes

In [None]:
#function for implementing search feature
def search(requested_categories, new_resume_data):
  """Group resumes by how many of the requested categories they were assigned.

  Args:
    requested_categories: category names the user asked for.
    new_resume_data: mapping (or sequence) indexed 0..n-1 whose entries are
      ``(category_names, raw_text, filename)``.

  Returns:
    ``(one_match, two_matches, three_matches)``: three lists of
    ``[category_names, raw_text, filename]`` entries. A resume is placed in the
    list for its match count, and only match counts from 1 up to
    ``len(requested_categories)`` are considered.
  """
  matched_one, matched_two, matched_three = [], [], []
  # Match counts 3 and 2 have their own lists; any other count being looked
  # for is collected in the single-match list.
  buckets = {3: matched_three, 2: matched_two}

  for needed in range(len(requested_categories), 0, -1):
    bucket = buckets.get(needed, matched_one)
    for idx in range(len(new_resume_data)):
      entry = new_resume_data[idx]
      hits = sum(1 for label in entry[0] if label in requested_categories)
      if hits == needed:
        bucket.append([entry[0], entry[1], entry[2]])

  return matched_one, matched_two, matched_three

In [None]:
import nltk
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer 
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Fetch the NLTK stop-word corpus; wordCloud_string() reads it through
# nltk.corpus.stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Fetch the WordNet corpus; wordCloud_string() lemmatizes tokens with
# WordNetLemmatizer.
nltk.download('wordnet')

In [None]:
#function to build the text string from which the word cloud is generated.
def wordCloud_string(cleaned_resumes):
  """Build the space-separated text that the word cloud is generated from.

  All resumes are lower-cased and combined, split into word tokens, stripped
  of English stop words, lemmatized, and purely numeric tokens are dropped.

  Args:
    cleaned_resumes: list of cleaned resume strings.

  Returns:
    A single string of the remaining lemmas separated by spaces (empty when
    there is nothing left).
  """
  # Join with a space: plain concatenation fused the last word of one resume
  # with the first word of the next into a single bogus token.
  corpus = ' '.join(resume.lower() for resume in cleaned_resumes)

  # creating the tokenizer (raw string: '\w' is not a valid str escape).
  tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
  tokens = tokenizer.tokenize(corpus)

  # A set makes the per-token stop-word check constant time.
  stop_words = set(nltk.corpus.stopwords.words('english'))

  lemmatizer = WordNetLemmatizer()
  lemmatized_words = [
      lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
  ]

  # converting into string, leaving out tokens made only of digits
  return ' '.join(word for word in lemmatized_words if not word.isdigit())

In [None]:
!pip install python-docx

Importing required packages.

In [None]:
import base64
import os
import io
import re
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import os,zipfile,docx
from flask_ngrok import run_with_ngrok
from flask import Flask, flash, request, redirect, url_for, render_template
from flask import send_from_directory
from werkzeug.utils import secure_filename
UPLOAD_FOLDER = '/content/gdrive/MyDrive/Resume_Screening/Kaggle'
ALLOWED_EXTENSIONS = {'zip'}
from flask import session

# Flask application serving the templates and static assets stored on Drive.
app = Flask(__name__, template_folder='/content/gdrive/MyDrive/Resume_Screening/web_templates', static_folder = '/content/gdrive/MyDrive/Resume_Screening/static')
# The key signs session cookies, so it must not be a constant committed with the
# notebook. Take it from the FLASK_SECRET_KEY environment variable when set;
# otherwise generate a random key for this run (cookies from earlier runs then
# simply stop validating).
app.secret_key = os.environ.get('FLASK_SECRET_KEY') or os.urandom(32)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
run_with_ngrok(app)   #starts ngrok when the app is run

#home route.
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Show the upload form (GET) or accept a zip of resumes (POST).

    On a successful POST the archive is saved into the upload folder, unpacked
    with unzip(), and the client is redirected to the index page. Every
    rejected upload gets an explicit message with HTTP status 400.
    """
    if request.method != 'POST':
        return render_template('home.html')

    # check if the post request has the file part
    if 'file' not in request.files:
        return '<h1>No file part</h1>', 400
    file = request.files['file']
    # If the user does not select a file, the browser submits an
    # empty file without a filename.
    if file.filename == '':
        return '<h1>No file selected</h1>', 400
    # Previously a wrong file type fell through and silently re-rendered the form.
    if not allowed_file(file.filename):
        return '<h1>Only .zip files are allowed</h1>', 400

    filename = secure_filename(file.filename)
    # secure_filename() can strip a name down to something without the ".zip"
    # extension (e.g. names made only of non-ASCII characters); unzip() would
    # then never pick the file up, so store it under a neutral name instead.
    if not allowed_file(filename):
        filename = 'upload.zip'

    saved_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(saved_path)
    try:
        unzip()
    except zipfile.BadZipFile:
        # Do not leave a broken archive behind: it would fail every later unzip().
        if os.path.exists(saved_path):
            os.remove(saved_path)
        return '<h1>Uploaded file is not a valid zip archive</h1>', 400
    return redirect(url_for('index'))

#index route -> lists all the features.
@app.route('/index',methods=['GET', 'POST'])
def index():
  """Feature menu: redirect to the view matching the pressed button, else show the menu."""
  if request.method == 'POST':
    # Button caption submitted by the form -> name of the view to redirect to.
    destinations = {
        'Bar Graph': 'bar_graph',
        'Pie Chart': 'pie_chart',
        'Word Cloud': 'wordcloud',
        'Search': 'action_select',
    }
    endpoint = destinations.get(request.form['submit_button'])
    if endpoint is not None:
      return redirect(url_for(endpoint))

  return render_template('index.html')

#select route for search
@app.route('/action_select',  methods=['GET', 'POST'])
def action_select():
  """Category search: show the selection form (GET) or the matching resumes (POST).

  The form posts three fields, cat1..cat3, each holding an index into
  ``all_categories``; index 0 is the empty placeholder meaning "no choice".
  Matching resumes are rendered grouped by how many of the chosen categories
  they were assigned.
  """
  # Position 0 is a placeholder so that real categories start at index 1.
  all_categories = [None] + list(categories)

  if request.method == 'POST':
    requested_categories1 = []
    for field in ('cat1', 'cat2', 'cat3'):
      # Form data is untrusted: a missing or non-numeric value is treated as
      # "no choice" instead of raising, and indices outside the list
      # (including negatives, which would wrap around) are ignored.
      choice = request.form.get(field, default=0, type=int)
      if 0 < choice < len(all_categories):
        requested_categories1.append(all_categories[choice])

    new_resumes, pc, cr = process_folder(categories)
    # search() returns (one match, two matches, three matches).
    req1, req2, req3 = search(requested_categories1, new_resumes)

    return render_template("skills_display.html", allthree = req3, n1 = len(req3), onlytwo = req2, n2 = len(req2), justone = req1, n3 = len(req1))


  return render_template("select.html", categories = all_categories, n = len(all_categories))

#route for displaying pie chart.
@app.route('/pie_chart')
def pie_chart():
  """Render a pie chart of the category counts as an inline base64 PNG <img> tag."""
  new_resumes, pc, cr = process_folder(categories)
  # Materialise the dict views: the plotting calls expect plain sequences.
  counts = list(pc.values())
  labels = list(pc.keys())

  # Work on an explicit figure and always close it: pyplot keeps every figure
  # alive until closed, so each request used to leak one.
  fig, ax = plt.subplots(figsize=(15,15))
  try:
    ax.pie(counts)
    ax.set_title('Category Pie Chart')
    ax.legend(labels = labels)
    img = io.BytesIO()
    fig.savefig(img, format='png')
  finally:
    plt.close(fig)

  plot_url = base64.b64encode(img.getvalue()).decode()
  return '<img src="data:image/png;base64,{}">'.format(plot_url)

#route for displaying bar graph
@app.route('/bar_graph')
def bar_graph():
  """Render a bar chart of the category counts as an inline base64 PNG <img> tag."""
  nr, pc, cr = process_folder(categories)
  keys = list(pc.keys())
  # counts as floats, in the same order as keys
  vals = [float(pc[k]) for k in keys]

  # Work on an explicit figure and always close it: pyplot keeps every figure
  # alive until closed, so each request used to leak one.
  fig, ax = plt.subplots(figsize=(15,15))
  try:
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title('Category Count')
    sns.barplot(x=keys, y=vals, ax=ax)
    # Write each count just above its bar; zip() stops at the shorter sequence
    # so a mismatch between patches and values cannot raise IndexError.
    for patch, value in zip(ax.patches, vals):
      height = patch.get_height()
      ax.text(patch.get_x()+patch.get_width()/2., height + 0.1, value, ha="center")
    img = io.BytesIO()
    fig.savefig(img, format='png')
  finally:
    plt.close(fig)

  plot_url = base64.b64encode(img.getvalue()).decode()
  return '<img src="data:image/png;base64,{}">'.format(plot_url)

#route for displaying word cloud.
@app.route('/wordcloud')
def wordcloud():
  """Render a word cloud of all resume text as an inline base64 PNG <img> tag."""
  nr, pc, cleaned_resumes = process_folder(categories)
  output_str = wordCloud_string(cleaned_resumes)
  # WordCloud.generate() raises when it is given no words at all (e.g. no
  # resumes uploaded yet), so answer with a message instead of an error page.
  if not output_str.strip():
    return '<h1>No resume text available for a word cloud</h1>'

  cloud = WordCloud(background_color = 'black', max_words = 100, width = 1500, height = 1500).generate(output_str)

  # Work on an explicit figure and always close it: pyplot keeps every figure
  # alive until closed, so each request used to leak one.
  fig, ax = plt.subplots(figsize=(15,15))
  try:
    ax.imshow(cloud)
    ax.set_title('Resume Text WordCloud (100 Words)')
    ax.axis('off')
    img = io.BytesIO()
    fig.savefig(img, format='png')
  finally:
    plt.close(fig)

  plot_url = base64.b64encode(img.getvalue()).decode()
  return '<img src="data:image/png;base64,{}">'.format(plot_url)

# Start the Flask server. run_with_ngrok(app) was applied when the app was
# created, so ngrok is started as well when the app runs.
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://565a951d0880.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [05/Aug/2021 05:27:35] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Aug/2021 05:27:35] "[37mGET /static/styles/home.css HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Aug/2021 05:27:35] "[37mGET /static/styles/8619.jpg HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Aug/2021 05:27:36] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [05/Aug/2021 05:27:50] "[32mPOST / HTTP/1.1[0m" 302 -
127.0.0.1 - - [05/Aug/2021 05:27:50] "[37mGET /index HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Aug/2021 05:27:51] "[37mGET /static/styles/index.css HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Aug/2021 05:28:00] "[32mPOST /index HTTP/1.1[0m" 302 -
127.0.0.1 - - [05/Aug/2021 05:28:07] "[37mGET /bar_graph HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Aug/2021 05:28:11] "[32mPOST /index HTTP/1.1[0m" 302 -
127.0.0.1 - - [05/Aug/2021 05:28:34] "[37mGET /wordcloud HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Aug/2021 05:28:54] "[37mGET /wordcloud HTTP/1.1[0m" 200 -
127.0.0.1 - - [05/Aug/2021 05:29:01] "[32mPOST /