# Env

In [2]:
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

In [375]:
import io
page = ""
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as fh:
        # iterate over all pages of PDF document
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            # creating a resoure manager
            resource_manager = PDFResourceManager()
            
            # create a file handle
            fake_file_handle = io.StringIO()
            
            # creating a text converter object
            converter = TextConverter(
                                resource_manager, 
                                fake_file_handle, 
                                codec='utf-8', 
                                laparams=LAParams()
                        )

            # creating a page interpreter
            page_interpreter = PDFPageInterpreter(
                                resource_manager, 
                                converter
                            )

            # process current page
            page_interpreter.process_page(page)
            
            # extract text
            text = fake_file_handle.getvalue()
            yield text

            # close open handles
            converter.close()
            fake_file_handle.close()

# calling above function and extracting text
file_path = "C:/Users/DS/Desktop/Text Mining/gen-cover-letter-master/cover_letters/HAMDANI HAMZA - CV.pdf"
for page in extract_text_from_pdf(file_path):
    page += ' ' + page

In [376]:
page

'H A M D A N I   H A M Z A\nE T U D I A N T   D ATA   S C I E N C E\n\nCosmos 3 Rue Ibnou Hazem Nr 21 \nAppt 3 La Villette, Hay Mohammadi.\n20250, Casablanca, Maroc\n\n\uf0e0\n\nha.h.hamdani@gmail.com\n\nE D U C AT I O N\n\nC O N TA C T\n\n\uf095\n\n+212 679 450 336\n\n\uf08c\n\nlinkedin.com/hamza-hamdani\n\n2018\n2020\n\n2016\n2014\n\n2018\n2017\n\nAvril\nMai\n2016\n\nMASTER SPÉCIALISÉ - DATA SCIENCE \n(EN COURS).\nUCA : Faculté des Sciences Semlalia - Marrakech, Maroc.\n\nDUT - MULTIMÉDIA ET CONCEPTION \nWEB.\nUAE : Ecole Normale Supérieure - Tétouan, Maroc.\n\nE X P E R I E N C E S\n\nFREELANCE, CASABLANCA.\nDesign Graphique ( Logos, Affiches, CV, ... ).\nDéveloppeur Laravel, Symfony, Java et Mobile ( Ionic & \nAngularJs ).\n\nASSOCIATION HANANE, TÉTOUAN.\nStage PFE.\nDéveloppement du site web de l’association.\nConception d’une base de données.\nWebdesign (maquettes, typographie, mouvements \ngraphiques, charte graphique).\nDéveloppement front : HTML5 + CSS3 + JQuery\nDéveloppement

In [377]:
import docx2txt

file_path = "C:/Users/DS/Desktop/Text Mining/gen-cover-letter-master/cover_letters/BENLAARAJ-Oumaima.docx"

def extract_text_from_doc(doc_path):
    temp = docx2txt.process(doc_path)
    text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
    return ' '.join(text)

wordtext = extract_text_from_doc(file_path)

In [378]:
wordtext

"BENLAARAJ Oumaima                                                                                                                                                                                                         Age: 23 years old C.I.N: EE 723085 GSM: 0665760443   E-mail: benlaarajoum@gmail.com                    Education 2018-2020 :  Cadi\xa0Ayyad\xa0University - Faculty of Sciences Semlalia, Marrakech                        specialized master in data science 2017-2018: Mohammed V University -National School of Computer Science and Systems Analysis, Rabat.                       Research Master, 'Data Sciences and Big Data' 2016-2017: Hassan 1st University -Faculty of Science and Techniques, Settat.                       Bachelor, 'Big Data: Management and Analysis of Big Data' 2013-2015: Specialized Institute of Management and Computer Science, Marrakech                        Specialized Technician Diploma 2012-2013: HASSAN 2 high school, Marrakech.                      Bacca

# Extract Names

In [379]:
import spacy
from spacy.matcher import Matcher

# load pre-trained model
nlp = spacy.load('./model5')

# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

def extract_name(resume_text):
    nlp_text = nlp(resume_text)
    
    # First name and Last name are always Proper Nouns
    pattern = [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]]
    
    matcher.add('NAME', None, *pattern)
    
    matches = matcher(nlp_text)
    
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return [span.text]

In [381]:
extract_name(wordtext)

['BENLAARAJ Oumaima']

# Extract Phone Number

In [207]:
import re

def extract_mobile_number(text):
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), text)

    if phone:
        number = ''.join(phone[0])
        if len(number) > 10:
            return ['+' + number]
        else:
            return [number]

In [382]:
extract_mobile_number(wordtext)

['066576044']

# Extract Email Adress

In [208]:
import re

def extract_email(email):
    email = re.findall("([^@|\s]+@[^@]+\.[^@|\s]+)", email)
    if email:
        try:
            return [email[0].split()[0].strip(';')]
        except IndexError:
            return None

In [383]:
extract_email(wordtext)

['benlaarajoum@gmail.com']

# Extract Skills

In [45]:
# NER
# Collect Skills Data

In [387]:
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# load pre-trained model
nlp = spacy.load('model5')
# noun_chunks = nlp.noun_chunks


def convertTuple(tup): 
    str =  ' '.join(tup) 
    return str

def extract_skills(resume_text):
    nlp_text = nlp(resume_text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    tokens = word_tokenize(resume_text)
    bigrams = ngrams(tokens,2)
    trigrams = ngrams(tokens,3)
        
    # reading the csv file    
    skills = []
    with open("skills.txt") as file_in:
        for line in file_in:
            skills.append(line.replace("\n", ""))
                
    skillset = []
    # check for one-grams (example: python)
    for token in tokens:
        if token in skills:
            skillset.append(token)
        elif token.lower() in skills:
            skillset.append(token)
        elif token.upper() in skills:
            skillset.append(token)

            
    # check for bi-grams and tri-grams (example: machine learning)
    for token in bigrams:
        token = convertTuple(token)
        if token in skills:
            skillset.append(token)
        elif token.lower() in skills:
            skillset.append(token)
        elif token.upper() in skills:
            skillset.append(token)
                            
                
    for token in trigrams:
        token = convertTuple(token)
        if token in skills:
            skillset.append(token)
        elif token.lower() in skills:
            skillset.append(token)
        elif token.upper() in skills:
            skillset.append(token)
                            
    return [i.capitalize() for i in set([i.lower() for i in skillset])]

In [390]:
extract_skills(wordtext)

['Ai', 'Big data', 'Database', 'Data mining', 'Mathematics']

# Extract Universities

In [393]:
import spacy
from spacy.matcher import Matcher

# load pre-trained model
nlp = spacy.load('en_core_web_sm')

# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

def extract_universities(resume_text):
    nlp_text = nlp(resume_text)
    
    # First name and Last name are always Proper Nouns
    pattern = [[{'POS': 'PROPN'}, {'POS': 'ORG'}]]
    
    matcher.add('UNI', None, *pattern)
    
    matches = matcher(nlp_text)
    
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return [span.text]

In [395]:
extract_universities(wordtext)

In [396]:
import json

with open("data2.json","r") as file:
    data = json.load(file)

In [397]:
data

[['Bachelor of Commerce (Honors) in Human Resources Management, HKBU',
  {'entities': [[0, 20, 'TITLE']]}],
 ['Pétrole du Maghreb', {'entities': [[0, 18, 'ORG']]}],
 ['March 2010 – Present KPMG (Singapore)',
  {'entities': [[0, 20, 'DATE'], [21, 25, 'ORG']]}],
 ['aouzal.imane1994@gmail.com', {'entities': [[0, 26, 'EMAIL']]}],
 ['0700351466', {'entities': [[0, 10, 'PHONE']]}],
 ['(Office national des chemins de fer)', {'entities': [[1, 35, 'ORG']]}],
 ['(Douja Promotion Addoha)', {'entities': [[1, 23, 'ORG']]}],
 ['Master  March 2019 – September 2019 Rabat – Morocco',
  {'entities': [[8, 35, 'DATE'], [36, 51, 'CITY']]}],
 ['Languages:\tCantonese, Mandarin and English',
  {'entities': [[11, 20, 'LANGUAGE'],
    [22, 30, 'LANGUAGE'],
    [35, 42, 'LANGUAGE']]}],
 ['Mobile No.\t0176775531', {'entities': [[11, 21, 'PHONE']]}],
 ['1994', {'entities': [[0, 4, 'DATE']]}],
 ['Hor Kew Private Limited, a subsidiary of Hor Kew Corporation Limited',
  {'entities': [[0, 23, 'ORG'], [41, 68, 'ORG']]}

In [408]:
for row in data:
    try:
        if row[1]['entities'][0][2] == "EMAIL" or row[1]['entities'][0][2] == "PHONE" or row[1]['entities'][0][2] == "ORG"  or row[1]['entities'][0][2] == "DATE" or row[1]['entities'][0][2] == "INSTITUTION" or row[1]['entities'][0][2] == "LANGUAGE" or row[1]['entities'][0][2] == "CITY":
            print(row)
            data.remove(row)
    except:
        pass
    

['Languages:\tCantonese, Mandarin and English', {'entities': [[11, 20, 'LANGUAGE'], [22, 30, 'LANGUAGE'], [35, 42, 'LANGUAGE']]}]
['Janus Henderson Investors, Hong Kong Aug 2015 – Present', {'entities': [[37, 55, 'DATE']]}]
['ASSOCIATION Maroc Générations', {'entities': [[0, 29, 'ORG']]}]
['Temasek Polytechnic\tApr 2009 – Apr 2012', {'entities': [[0, 19, 'INSTITUTION'], [20, 39, 'DATE']]}]
['SETTAT', {'entities': [[0, 6, 'CITY']]}]
['Telephone +852 6177 1146 (HK)', {'entities': [[10, 24, 'PHONE']]}]
['Association bassin de Safi', {'entities': [[0, 26, 'ORG']]}]
['Period : Sep’13 – Dec’15', {'entities': [[9, 24, 'DATE']]}]
['( Citco Fund Services Singapore Pte Ltd )', {'entities': [[2, 39, 'ORG']]}]
['MARTIL', {'entities': [[0, 6, 'CITY']]}]
['2017 - Asset Protection Using Offshore Companies Certificate, Udemy.com online learning platform', {'entities': [[0, 4, 'DATE'], [30, 60, 'TITLE']]}]
['Sep 2015 - Jun 2016\t\tRockwills Trustee Berhad', {'entities': [[0, 19, 'DATE'], [21, 45, 'ORG'

In [410]:
len(data)

416

In [313]:
import random

TRAIN_DATA = data
nlp1 = spacy.load("en_core_web_sm")

if 'ner' not in nlp.pipe_names:
    ner = nlp1.create_pipe('ner')
    nlp1.add_pipe(ner, last=True)
else:
    ner = nlp1.get_pipe("ner")
    
for _,annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2].upper())
    
other_pipes = [pipe for pipe in nlp1.pipe_names if pipe != 'ner']
with nlp1.disable_pipes(*other_pipes):
    optimizer = nlp1.begin_training()
    for itn in range(10):
        print("Starting iteration :",itn+1)
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp1.update([text], [annotations], sgd=optimizer, drop=0.2, losses=losses)
        print(losses)

Starting iteration : 1
{'ner': 2885.2769186037335}
Starting iteration : 2
{'ner': 2616.460268894662}
Starting iteration : 3


KeyboardInterrupt: 

In [24]:
from spacy import displacy

displacy.render(nlp1(str(wordtext)), jupyter=True, style='ent')

NameError: name 'nlp1' is not defined

# Extraxt Education

In [400]:
import re
import spacy
from nltk.corpus import stopwords

# load pre-trained model
nlp = spacy.load('model5')

# Grad all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education Degrees
EDUCATION = [
            'BACCALAUREATE', 'BACCALAUREAT', 'LICENCE PROFESSIONELLE', 'LICENCE FONDAMENTALE', 'LP', 'LF'
            'BE','B.E.', 'B.E', 'BS', 'B.S', 'BACHELOR', "BACHELOR'S", 'M'
            'ME', 'M.E', 'M.E.', 'MS', 'M.S', 'MASTER', "MASTER'S"
            'BTECH', 'B.TECH', 'M.TECH', 'MTECH', 'DUT', 'DEUG', 'BTS', 'DTS',
            'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII'
            ]

def extract_education(resume_text):
    nlp_text = nlp(resume_text)

    # Sentence Tokenizer
    nlp_text = [sent.string.strip() for sent in nlp_text.sents]

    edu = {}
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Replace all special symbols
            tex = re.sub(r'[?|$|.|!|,]', r'', tex)
            if tex.upper() in EDUCATION and tex not in STOPWORDS:
                edu[tex] = text + nlp_text[index + 1]

    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(re.compile(r'(((20|19)(\d{2})))'), edu[key])
        if year:
            education.append((key, ''.join(year[0])))
        else:
            education.append(key)
    return education

In [401]:
extract_education(wordtext)

[('master', '2017'), ('Master', '2016'), ('Bachelor', '2013'), 'Baccalaureate']

In [243]:
def del_nones(dfObj):
    for col in dfObj:
        dfObj[col] = dfObj[col].astype(str).str.replace(u'None',u'--')
        dfObj[col] = dfObj[col].astype(str).str.replace(u'nan',u'--')
    return dfObj

# Final Test

In [402]:
def all_together(file_path):
    if file_path.endswith('.pdf'):
        for page in extract_text_from_pdf(file_path):
            page += ' '+page
    elif file_path.endswith('.docx'):
        page = extract_text_from_doc(file_path)
        
    name, phone, email, edu, skills = extract_name(page), extract_mobile_number(page), extract_email(page), extract_education(page), extract_skills(page)

    if name == None:
        name = '--'
    if phone == None:
        phone = '--'
    if email == None:
        email = '--'
    if edu == None:
        edu = '--'
    if skills == None:
        skills = '--'
    
    dfObj = pd.DataFrame([name, phone, email, edu, skills], index=['Name', 'Phone Number', 'Email Adress', 'Diploma', 'Skills']).T
    
    return del_nones(dfObj), page, [name, phone, email, edu, skills]

df, page, listall = all_together(file_path_doc)
# dfObj = pd.DataFrame(all_together(file_path_doc), index=['Name', 'Phone Number', 'Email Adress', 'Education', 'Skills']).T
df

Unnamed: 0,Name,Phone Number,Email Adress,Diploma,Skills
0,BENLAARAJ Oumaima,066576044,benlaarajoum@gmail.com,"('master', '2017')",Ai
1,--,--,--,"('Master', '2016')",Big data
2,--,--,--,"('Bachelor', '2013')",Database
3,--,--,--,Baccalaureate,Data mining
4,--,--,--,--,Mathematics


In [307]:
file_path_pdf = "C:/Users/DS/Desktop/Text Mining/gen-cover-letter-master/cover_letters/CV-HAMDANI-HAMZA.pdf"
file_path_doc = "C:/Users/DS/Desktop/Text Mining/gen-cover-letter-master/cover_letters/BENLAARAJ-Oumaima.docx"

listall = []
df, page, listall = all_together(file_path_pdf)
df

Unnamed: 0,Name,Phone Number,Email Adress,Education,Skills
0,H A,-,ha.h.hamdani@gmail.com,MASTER,Machine learning
1,--,-,--,DUT,R
2,--,--,--,X,Python
3,--,--,--,LP,Data mining
4,--,--,--,BACCALAUREAT,Mysql
5,--,--,--,--,Cnn
6,--,--,--,--,Html
7,--,--,--,--,Java
8,--,--,--,--,Oracle
9,--,--,--,--,Php


In [289]:
import spacy
from spacy import displacy

nlpp = spacy.load('model5')

displacy.render(nlpp(str(page)), jupyter=True, style='ent')

In [368]:
import pandas as pd

doc_to_test=nlpp(page)
d={}
for ent in doc_to_test.ents:
    d[ent.label_]=[]
for ent in doc_to_test.ents:
    d[ent.label_].append(ent.text)

title = []
inst = []
newlist = []
for val in set(d['TITLE']):
    title.append(val)
for val in set(d['INSTITUTION']):
    inst.append(val)

df['Title'] = pd.Series(title)
df['Institution'] = pd.Series(inst)
del_nones(df)

Unnamed: 0,Name,Phone Number,Email Adress,Diploma,Skills,Diploma Title,Institution
0,BENLAARAJ Oumaima,066576044,benlaarajoum@gmail.com,"('master', '2017')",Ai,Research Master,-National School of Computer Science and Syste...
1,--,--,--,"('Master', '2016')",Big data,specialized master in data science,Specialized Institute of Management and Comput...
2,--,--,--,"('Bachelor', '2013')",Database,Data Sciences and Big Data,HASSAN 2 high school
3,--,--,--,Baccalaureate,Data mining,Specialized Technician Diploma,-Faculty of Science and Techniques
4,--,--,--,--,Mathematics,Internships in call centers,--


In [411]:
def ner_assist(df, page):
    nlp_model = spacy.load('model5')
    doc_to_test = nlpp(page)
    d={}
    entities = []
    for ent in doc_to_test.ents:
        d[ent.label_] = []
        entities.append(ent.label_)
    for ent in doc_to_test.ents:
        d[ent.label_].append(ent.text)

    title = []
    inst = []
    skills = []
    newlist = []

    if "TITLE" in entities:
        for val in set(d["TITLE"]):
            title.append(val)
    if "INSTITUTION" in entities:
        for val in set(d["INSTITUTION"]):
            inst.append(val)

    df['Diploma Title'] = pd.Series(title)
    df['Institution'] = pd.Series(inst)
    
    return del_nones(df)

In [412]:
cv_path = "C:/Users/DS/Desktop/Text Mining/gen-cover-letter-master/cover_letters/Cv_Mouad.pdf"
cover_path = "C:/Users/DS/Desktop/Text Mining/gen-cover-letter-master/cover_letters/Cover Letter-Babbleset-Shaun Grist.pdf"

def Analyse_CV_coverletter(cv, cover):
    df_cv, page_cv, listall = all_together(cv)
    df_cv = ner_assist(df_cv, page_cv)
    
    df_cover, page_cover, listall = all_together(cover)
    df_cover = ner_assist(df_cover, page_cover)
    return pd.concat([df_cv, df_cover])

Analyse_CV_coverletter(cv_path, cover_path)

Unnamed: 0,Name,Phone Number,Email Adress,Diploma,Skills,Diploma Title,Institution
0,Master Spécialisé,-,mouadmarmouchi@gmail.com,Master,Machine learning,Baccalauréat en,ESTS :
1,--,-,--,MS,Nltk,"Master Spécialisé, Sciences des Données",Lycée Charif Al Idrissi- Safi
2,--,--,--,--,Pl/sql,DUT-,Faculté des Sciences Semlalia
3,--,--,--,--,Mysql,Licence Professionelle - Métiers de,Ecole Supérieure de Téchnologie
4,--,--,--,--,Css,--,FSSM :
5,--,--,--,--,Python,--,--
6,--,--,--,--,Keras,--,--
7,--,--,--,--,Deep learning,--,--
8,--,--,--,--,Html,--,--
9,--,--,--,--,Java,--,--
