In [1]:
#linguistics
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from sentence_transformers import SentenceTransformer, util

# math tools
import statistics
import pandas as pd
import pathlib as pl
import matplotlib.pyplot as plt

#visual and file handling tools
import pickle
import fitz
import requests
from bs4.element import Tag
from textblob import TextBlob
from bs4 import BeautifulSoup
from googlesearch import search
from rich.markdown import Markdown

#system tools
import ast
import random
import re
import os
from rich import print as prt
import clipboard as cb

#data
from train_data.cleaned_train_dataset_2 import dataset

In [2]:
class utils:
    """Helpers for preparing, inspecting and converting spaCy NER training data.

    Every helper is a static method: call it on the class itself, e.g.
    ``utils.prepareData(train_data)``. No instance is required.
    """

    @staticmethod
    def prepareData(train_data):
        """Convert entity tuples to lists and return the converted data.

        Args:
            train_data: iterable of ``(text, annot)`` pairs where
                ``annot['entities']`` holds ``(start, end, label)`` triples.

        Returns:
            A list of ``[text, annot]`` pairs whose ``'entities'`` value is a
            list of ``[start, end, label]`` lists.
        """
        data = []
        for text, annot in train_data:
            entities = [[strt, end, lbl] for strt, end, lbl in annot['entities']]
            # Build a fresh dict instead of overwriting annot['entities'], so
            # the caller's dataset is not modified as a side effect.
            data.append([text, {**annot, 'entities': entities}])
        return data

    @staticmethod
    def renderData(data, n=0, serve=False):
        """Visualise the entities of training sample ``n`` using displacy.

        Args:
            data: sequence of ``(text, annotations)`` pairs.
            n: index of the sample to show.
            serve: when true call ``displacy.serve``, otherwise
                ``displacy.render``.
        """
        nlp = spacy.blank('en')
        text, annotations = data[n]
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annotations['entities']:
            span = doc.char_span(start, end, label=label)
            # char_span yields None when no span can be built from the offsets.
            if span is not None:
                ents.append(span)
        doc.ents = ents
        if serve:
            displacy.serve(doc, style='ent')
        else:
            displacy.render(doc, style='ent')

    @staticmethod
    def _hasCleanBoundaries(span_text):
        """Return True if the text is non-blank and has no leading/trailing space."""
        if not span_text or span_text.isspace():
            return False
        return span_text[0] != ' ' and span_text[-1] != ' '

    @staticmethod
    def remove_whitespace_entities(doc):
        """Drop entities whose text is only whitespace and return the doc."""
        doc.ents = [ent for ent in doc.ents if not ent.text.isspace()]
        return doc

    @staticmethod
    def v2Tov3Converter(data, filename="train", rigrousFilter=False):
        """Convert a v2-style training dataset to a v3 ``.spacy`` file via DocBin.

        Args:
            data: iterable of ``(text, annotations)`` pairs where
                ``annotations['entities']`` holds ``(start, end, label)``.
            filename: output path without the ``.spacy`` extension.
            rigrousFilter: when true, spans that are blank or start/end with a
                space are dropped and the remaining spans are passed through
                ``filter_spans`` to get rid of span errors.

        Returns:
            The list of docs that were written to ``<filename>.spacy``.
        """
        nlp = spacy.blank("en")
        db = DocBin()  # DocBin will store the example documents
        for text, annotations in data:
            doc = nlp.make_doc(text)
            ents = []
            for start, end, label in annotations['entities']:
                span = doc.char_span(start, end, label=label)
                if span is None:
                    continue
                # Each span is appended exactly once. The previous version
                # appended unconditionally after its checks, which duplicated
                # spans and made its per-character alphanumeric test a no-op;
                # that test is not enforced here because it would discard
                # every multi-word entity.
                if rigrousFilter and not utils._hasCleanBoundaries(span.text):
                    continue
                ents.append(span)

            if rigrousFilter:
                # Filter once per document, over the complete span list.
                ents = filter_spans(ents)

            doc.ents = ents
            doc = utils.remove_whitespace_entities(doc)
            db.add(doc)
        filename = filename + ".spacy"
        db.to_disk(filename)
        return list(db.get_docs(nlp.vocab))

    @staticmethod
    def initializeConfig():
        """Initialize the config.cfg file for NER training (overwrites it)."""
        os.system("spacy init config --lang en --pipeline ner config.cfg --force")

    @staticmethod
    def trainModel():
        """Train the model from config.cfg and save it in ./trained_model/.

        Note that ./train.spacy is passed as both the train and the dev set.
        """
        os.system("spacy train config.cfg --output ./trained_model/ --paths.train ./train.spacy --paths.dev ./train.spacy")

    @staticmethod
    def loadTrainData():
        """Load and return the pickled object in train_data/train_data.pkl."""
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open('train_data/train_data.pkl', 'rb') as handle:
            return pickle.load(handle)

    @staticmethod
    def prepareDocSpansHtml(doc):
        """Mark every annotated span of a sample with coloured HTML.

        Each span is wrapped in ``<span style="color: ...;">*`` ... ``*</span>``
        with colours alternating green, red, green, ... The raw string is
        copied to the clipboard.

        Args:
            doc: a ``(text, annotations)`` pair with ``annotations['entities']``
                holding ``(start, end, label)`` triples.

        Returns:
            A ``rich.markdown.Markdown`` object built from the marked-up text.
        """
        text = doc[0]
        annotations = doc[1]['entities']
        md = ""
        switcher = 0
        # Iterate one position past the end so a span ending at len(text) is closed.
        for i in range(len(text)+1):
            for j in annotations:
                if j[0] == i:
                    if switcher:
                        md+='<span style="color: red;">*'
                    else:
                        md+='<span style="color: green;">*'
                    switcher = not switcher
                if j[1] == i:
                    md+='*</span>'
            if i<len(text):
                md+=text[i]
        cb.copy(md)
        return Markdown(md)

    @staticmethod
    def printSpans(data):
        """Print the text and label of every entity span of one sample."""
        for i in data[1]['entities']:
            prt(data[0][i[0]: i[1]], " :", i[2])

    @staticmethod
    def printErrorSpans(dataset):
        """Print spans that start/end with a space or have zero length.

        After the per-span report, the indices of all samples containing a
        span with a leading or trailing space are printed as one list.
        """
        flagged = []
        for idx, sample in enumerate(dataset):
            for start, end, *_ in sample[1]['entities']:
                span = sample[0][start:end]
                if len(span) < 1:
                    prt("[bold green]0 length span[/bold green] - " ,idx, sample[0])
                elif span[0] == ' ' or span[-1] == ' ':
                    prt(span, idx)
                    if idx not in flagged:
                        flagged.append(idx)
        prt(flagged)

## Loading and testing the trained model

In [276]:
# Build `forbes_companies`: the lower-cased first word of every company name
# in the Forbes CSV, de-duplicated while keeping the file order.
path = pl.Path('../meta_data/forbes_2000/forbes_2000.csv')

# Rows labelled 0-6 are discarded before the names are read.
forbes_table = pd.read_csv(path).drop(index=list(range(7)))

# Column index 2 is relabelled 'company name' (presumably it holds the
# company names) so it can be selected by a stable label.
col = list(forbes_table.columns)
col[2] = 'company name'
forbes_table.columns = col

# dict.fromkeys removes duplicates without disturbing first-seen order.
forbes_companies = list(dict.fromkeys(
    name.lower().split(" ")[0] for name in forbes_table['company name']
))

In [201]:
class webScrapper:
    """Fetch descriptive text from the web for a search query.

    All helpers are static methods and are called on the class itself.
    """

    @staticmethod
    def getGoogleSearchResults(query)->list:
        """Return the result URLs of a Google search for ``query`` as a list."""
        return list(search(query, tld="co.in", num=1, stop=1, pause=0.5))

    @staticmethod
    def _joinLongStrings(strings, min_length=100):
        """Concatenate the strings longer than ``min_length``, each followed by a space."""
        return "".join(piece + " " for piece in strings if len(piece) > min_length)

    @staticmethod
    def getPageText(url, timeout=10, min_length=100):
        """Download a page and return its longer text fragments as one string.

        Args:
            url: address of the page to fetch.
            timeout: seconds to wait for the server before ``requests`` raises;
                without it a stalled server would block the call indefinitely.
            min_length: only text fragments longer than this many characters
                are kept.

        Returns:
            The kept fragments, each followed by a single space.
        """
        result = requests.get(url, timeout=timeout)
        # Name the parser explicitly so the parse does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(result.content, 'html.parser')

        # Script and style bodies are not page text; remove them first.
        for tag in soup(['script', 'style']):
            tag.decompose()
        return webScrapper._joinLongStrings(soup.stripped_strings, min_length)

    @staticmethod
    def getText(query):
        """Return the page text of the first search result for ``query``.

        Raises:
            IndexError: if the search returned no results.
        """
        search_res = webScrapper.getGoogleSearchResults(query)
        if not search_res:
            raise IndexError(f"no search results for query: {query!r}")
        return webScrapper.getPageText(search_res[0])

In [281]:
class score_parameter:
    """Scoring heuristics computed from the labelled entity spans of a resume.

    ``spans`` arguments are dictionaries that map an entity label to a list of
    span texts.
    """

    def __init__(self):
        self.scemantic_model = SentenceTransformer('stsb-roberta-large')

    def getScemanticSimilarity(self, sentence1, sentence2):
        """Return the cosine similarity of the two sentences' embeddings."""
        # encode sentences to get their embeddings
        embedding1 = self.scemantic_model.encode(sentence1, convert_to_tensor=True)
        embedding2 = self.scemantic_model.encode(sentence2, convert_to_tensor=True)

        # compute similarity scores of two embeddings
        cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
        return cosine_scores.item()

    def getResumeFitness(self, spans, jdtext):
        """Score how well the resume's designations match a job description.

        For at most the first five 'designation' spans, text is fetched with
        ``webScrapper.getText`` and compared to ``jdtext``.

        Returns:
            The mean similarity multiplied by 100, or 0 when there are no
            designations.
        """
        designations = spans.get('designation', [])[:5]
        scores = [
            self.getScemanticSimilarity(sentence1=webScrapper.getText(title), sentence2=jdtext)
            for title in designations
        ]
        if scores:
            return statistics.mean(scores)*100
        return 0

    def getConfidenceScore(self, text):
        """Return the TextBlob sentiment of ``text``."""
        return TextBlob(text).sentiment

    def jobSteadynessScore(self, spans):
        """Score total work duration against an optimum of five years.

        The first 'total experience' span is used when present: its first
        number is read as years and its second, if any, as months. Otherwise
        every 'experience duration' span contributes the difference between
        the first two four-digit years it contains.

        Returns:
            ``work_duration / 5 * 100`` capped at 100; 0 when no duration
            could be parsed.
        """
        # longer one stays in one company->better it is
        optimum_duration = 5
        work_duration = 0

        if spans.get('total experience'):
            found = re.findall(r'\d+(?:\.\d+)?', spans['total experience'][0])
            numbers = [float(value) for value in found]
            if len(numbers) > 1:
                work_duration = numbers[0] + numbers[1]/12
            elif numbers:
                work_duration = numbers[0]

        elif 'experience duration' in spans:
            for entry in spans['experience duration']:
                # Only stand-alone 4-digit groups count as years, so day or
                # month digits in the same string are not mistaken for them.
                years = [int(year) for year in re.findall(r'(?<!\d)\d{4}(?!\d)', entry)]
                if len(years) >= 2:
                    work_duration += abs(years[0]-years[1])

        # One clamp covers every case, including a score of exactly 100.
        # The unreachable random fallback for a zero score was removed: zero
        # has always been returned, and the score stays deterministic.
        return min((work_duration/optimum_duration)*100, 100)

    def orgProfileScore(self, spans, companies=None):
        """Return the number of forbes companies worked at.

        A company counts once when any of its space-separated words,
        lower-cased, is in ``companies``.

        Args:
            spans: entity spans; the 'companies worked at' label is read and a
                missing label scores 0.
            companies: iterable of lower-cased company keywords. Defaults to
                the notebook-level ``forbes_companies`` list.
        """
        if companies is None:
            companies = forbes_companies
        known = set(companies)
        return sum(
            1
            for org in spans.get('companies worked at', [])
            if any(word.lower() in known for word in org.split(" "))
        )

    def certificationScore(self, spans):
        """Return the number of 'certification' spans (0 when absent)."""
        return len(spans.get('certification', []))

    def winsScore(self, spans):
        """Return the number of 'wins' spans (0 when absent)."""
        return len(spans.get('wins', []))



In [211]:
# Entity label -> CSS colour, and the label whitelist that goes with it.
# Presumably passed as `options=` to displacy — not used in the visible cells.
colors = {'companies worked at': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
options = dict(ents=['companies worked at'], colors=colors)

class tools:
    """File-loading helpers for PDFs and the training texts.

    All helpers are static methods, so they behave the same whether called on
    ``tools`` or on an instance of a subclass.
    """

    @staticmethod
    def loadPdfs(dir='../meta_data/resume and jd/resumes'):
        """Return the ``*.pdf`` paths directly inside ``dir``, sorted.

        Sorting matters because callers pick files by position, and glob
        order is not guaranteed to be stable.
        """
        return sorted(pl.Path(dir).glob("*.pdf"))

    @staticmethod
    def extractTextFromPdf(path, pdf_number):
        """Return the text of one PDF with newlines replaced by spaces.

        Args:
            path: sequence of PDF paths, e.g. the result of ``loadPdfs``.
            pdf_number: index into ``path`` of the file to read.
        """
        text = ''
        # The context manager closes the document even if extraction fails.
        with fitz.open(path[pdf_number]) as pdf:
            for page in pdf:
                # PyMuPDF renamed Page.getText to Page.get_text; prefer the
                # new name and fall back to the old one on older releases.
                extract = getattr(page, 'get_text', None) or page.getText
                text += str(extract())
        return text.replace('\n', ' ')

    @staticmethod
    def loadTextData():
        """Return the first element of every item in the imported ``dataset``."""
        return [item[0] for item in dataset]


class linguistics(tools):
    """Run the trained NER model on resume text and score the result."""

    def __init__(self, last=True) -> None:
        """Load the trained pipeline and the scoring helpers.

        Args:
            last: load ``./trained_model/model-last`` when true, otherwise
                ``./trained_model/model-best``.
        """
        if last:
            model_dir = './trained_model/model-last'
        else:
            model_dir = './trained_model/model-best'
        self.model = spacy.load(model_dir)
        self.parameters = score_parameter()

    def render(self, text):
        """Render the entities the model finds in ``text`` with displacy."""
        displacy.render(self.model(text), style='ent')

    def getUniqueSpans(self, text)->dict:
        """Return the distinct entity texts found in ``text``, grouped by label.

        Returns:
            A dict mapping each entity label to a list of its unique span
            texts, in order of first appearance.
        """
        doc = self.model(text)
        labels = dict()
        for ent in doc.ents:
            labels.setdefault(ent.label_, []).append(ent.text)
        # dict.fromkeys de-duplicates while keeping first-seen order. A set
        # would reorder the texts between runs, which changes any score that
        # reads the first span(s) of a label.
        return {label: list(dict.fromkeys(texts)) for label, texts in labels.items()}

    def printEntitySpansAndLabels(self, text):
        """Print every entity the model finds in ``text`` with its label."""
        doc = self.model(text)
        for ent in doc.ents:
            print(ent, " | ", ent.label_)

    def getAllScores(self, text, jdtext)->dict:
        """Compute and print every score for one resume.

        Each score is computed independently; when one fails, its failure
        message is printed and the remaining scores are still computed.

        Args:
            text: the resume text.
            jdtext: the job-description text used for the fitness score.

        Returns:
            A dict mapping each score name to its value, or to ``None`` when
            that score could not be calculated.
        """
        spans = self.getUniqueSpans(text)
        params = self.parameters
        # (result key, printed label, failure message, zero-argument scorer)
        steps = [
            ('resume fitness', "resume fitness score: ",
             'failed to calculate resume fitness score',
             lambda: params.getResumeFitness(spans, jdtext)),
            ('organization profile', "organization profile score: ",
             'failed to calculate resume profile score',
             lambda: params.orgProfileScore(spans)),
            ('job steadyness', "job steadyness score: ",
             'failed to calculate resume job steadyness score',
             lambda: params.jobSteadynessScore(spans)),
            ('wins', "wins score: ",
             'failed to calculate resume wins score',
             lambda: params.winsScore(spans)),
            ('certification', "certification score: ",
             'failed to calculate resume certification score',
             lambda: params.certificationScore(spans)),
            ('confidence level', "confidence level: ",
             'failed to calculate resume confidence level',
             lambda: params.getConfidenceScore(text)),
        ]

        scores = {}
        for key, label, failure_message, compute in steps:
            scores[key] = None
            # Catch Exception rather than everything, so that interrupting
            # the kernel (KeyboardInterrupt) still stops the run.
            try:
                scores[key] = compute()
                prt(label, scores[key])
            except Exception:
                prt(failure_message)
        return scores
    

In [279]:
# Load the trained pipeline (model-last, the default) together with its scorers.
model = linguistics()
# Texts of the imported training dataset; used below as sample resume input.
text_files = tools.loadTextData()

# Job-description PDFs: collect the files in the folder and extract the first one.
jdPath = "../meta_data/resume and jd/jd"
pdfs = tools.loadPdfs(jdPath)
jdtext = tools.extractTextFromPdf(pdfs, 0)

In [280]:
# Score the second dataset text against the PDF at index 2 of `pdfs`.
# NOTE(review): `pdfs` was built from the job-description folder, and `jdtext`
# (extracted from index 0) is not used here — confirm which job description
# is intended.
model.getAllScores(text_files[1], tools.extractTextFromPdf(pdfs, 2))