In [7]:
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

from spacy import displacy
from wordcloud import WordCloud
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])

import warnings 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.5.15-cp310-cp310-win_amd64.whl.metadata (41 kB)
     -------------------------------------- 42.0/42.0 kB 503.7 kB/s eta 0:00:00
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 1.5/1.5 MB 2.6 MB/s eta 0:00:00
Downloading regex-2024.5.15-cp310-cp310-win_amd64.whl (268 kB)
   ---------------------------------------- 269.0/269.0 kB 5.5 MB/s eta 0:00:00
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2024.5.15
Note: you may need to restart the kernel to use updated packages.


In [8]:
data = pd.read_csv("../../survey-into-datasets/data/clean-data/Cleaned_data.csv")

In [9]:
data

Unnamed: 0.1,Unnamed: 0,Category,Resume,Cleaned_Resume
0,0,Data Science,Skills * Programming Languages: Python (pandas...,Skills Programming Languages Python pandas num...
1,1,Data Science,Education Details \nMay 2013 to May 2017 B.E ...,Education Details May 2013 to May 2017 BE UITR...
2,2,Data Science,"Areas of Interest Deep Learning, Control Syste...",Areas of Interest Deep Learning Control System...
3,3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,Skills R Python SAP HANA Tableau SAP HANA SQL ...
4,4,Data Science,"Education Details \n MCA YMCAUST, Faridabad...",Education Details MCA YMCAUST Faridabad Haryan...
...,...,...,...,...
11019,11019,DotNet Developer,"Technical Skills â¢ Languages: C#, ASP .NET M...",Technical Skills Languages C ASP NET MVC HTML ...
11020,11020,DotNet Developer,Education Details \nJanuary 2014 Education De...,Education Details January 2014 Education Detai...
11021,11021,DotNet Developer,"Technologies ASP.NET, MVC 3.0/4.0/5.0, Unit Te...",Technologies ASPNET MVC 304050 Unit Testing En...
11022,11022,DotNet Developer,"Technical Skills CATEGORY SKILLS Language C, C...",Technical Skills CATEGORY SKILLS Language C C ...


In [10]:
data['Cleaned_Resume'] = data['Cleaned_Resume'].str.lower()

In [12]:
spacy.cli.download("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
nlp = spacy.load("en_core_web_lg")

In [6]:
doc = nlp(data.Cleaned_Resume[1])
displacy.render(doc, style="ent", jupyter=True)

In [15]:
# The jobzilla skill dataset is jsonl file containing different skills that can be used to create spaCy entity_ruler.
# The data set contains label and pattern-> diferent words used to descibe skills in various resume.
skill_pattern_path = "skill_patterns.jsonl"

In [16]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [17]:
# Add entity_ruler to the pipeline if it's not already present
if "entity_ruler" not in nlp.pipe_names:
    ruler = nlp.add_pipe("entity_ruler", after="ner")
else:
    ruler = nlp.get_pipe("entity_ruler")

In [18]:
# Load skill patterns into the entity_ruler
ruler.from_disk(skill_pattern_path)

<spacy.pipeline.entityruler.EntityRuler at 0x162694602c0>

In [19]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [20]:
def get_skills(text):
    doc = nlp(text)
    myset = []
    subset = []
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            subset.append(ent.text)
    myset.append(subset)
    return subset


def unique_skills(x):
    return list(set(x))

In [21]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,Category,Resume,Cleaned_Resume
0,0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...
1,1,Data Science,Education Details \nMay 2013 to May 2017 B.E ...,education details may 2013 to may 2017 be uitr...
2,2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas of interest deep learning control system...
3,3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...
4,4,Data Science,"Education Details \n MCA YMCAUST, Faridabad...",education details mca ymcaust faridabad haryan...


In [22]:
data["skills"] = data.head(200)["Cleaned_Resume"].str.lower().apply(get_skills)
data["skills"] = data.head(200)["skills"].apply(unique_skills)
data.head()

Unnamed: 0.1,Unnamed: 0,Category,Resume,Cleaned_Resume,skills
0,0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...,"[time series, security, database, plotly, acco..."
1,1,Data Science,Education Details \nMay 2013 to May 2017 B.E ...,education details may 2013 to may 2017 be uitr...,"[time series, outlier, feature selection, kera..."
2,2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas of interest deep learning control system...,"[segmentation, windows, languages, jupyter not..."
3,3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...,"[time series, segmentation, data processing, a..."
4,4,Data Science,"Education Details \n MCA YMCAUST, Faridabad...",education details mca ymcaust faridabad haryan...,"[data analysis, data science, data structure, ..."


In [23]:
doc = nlp(data.Cleaned_Resume[2])
displacy.render(doc, style="ent", jupyter=True)

In [None]:
import tika
tika.initVM()
from tika import parser
import os
import re

In [22]:
from tika import parser

def parse_pdf_with_tika(pdf_path):
  parsed_pdf = parser.from_file(pdf_path)

  if parsed_pdf:
    return parsed_pdf["content"]
  else:
    return None

In [None]:
# Example usage
pdf_file_path = "data/cv-raw-data/CV.pdf"  # Test avec Mon CV
extracted_text = parse_pdf_with_tika(pdf_file_path)

if extracted_text:
  print("Extracted Text:")
  print(extracted_text)
else:
  print("Failed to extract text from PDF.")

In [25]:
# Dummy data
extracted_text = """John Doe

Contact Information:

Email: john.doe@example.com
Phone: +1 (555) 123-4567
LinkedIn: linkedin.com/in/johndoe
GitHub: github.com/johndoe
Address: 1234 Elm Street, Apt 101, Springfield, IL 62704
Professional Summary:

Highly skilled Software Engineer with 8+ years of experience in software development, specializing in web and mobile application development. Proficient in multiple programming languages, including Python, JavaScript, and Java. Adept at leading development teams, optimizing system performance, and implementing innovative solutions to complex problems. Seeking to leverage technical expertise and leadership skills to contribute to a dynamic development team.

Professional Experience:

Senior Software Engineer
XYZ Inc, Springfield, IL
June 2020 – Present

Lead the development and implementation of a new e-commerce platform, resulting in a 30% increase in online sales.
Designed and implemented RESTful APIs to enhance functionality and integrate third-party services.
Optimized database queries and improved system performance, reducing load times by 40%.
Mentored junior developers, conducted code reviews, and provided technical guidance.
Software Engineer
ABC Corp, Chicago, IL
May 2015 – May 2020

Developed and maintained web applications using Python, Django, and JavaScript frameworks.
Led a team of developers to migrate legacy systems to a modern architecture, enhancing scalability and maintainability.
Collaborated with cross-functional teams to gather requirements and deliver solutions that met business needs.
Implemented automated testing and CI/CD pipelines, improving code quality and deployment efficiency.
Junior Software Developer
Tech Solutions, Evanston, IL
July 2014 – April 2015

Assisted in the development of mobile applications using Java and Android SDK.
Wrote clean, maintainable code and performed debugging and troubleshooting to resolve issues.
Participated in team meetings and contributed to project planning and design discussions.
Developed unit tests to ensure code quality and reliability.
Education:

Master of Science in Computer Science
University of Illinois at Urbana-Champaign
Graduated: May 2014

Bachelor of Science in Computer Science
University of Illinois at Urbana-Champaign
Graduated: May 2012

Technical Skills:

Programming Languages: Python, JavaScript, Java, C++
Web Development: HTML, CSS, Django, Flask, React, Angular
Databases: MySQL, PostgreSQL, MongoDB
Tools & Technologies: Git, Docker, Kubernetes, Jenkins, AWS
Agile Methodologies, Test-Driven Development (TDD)
Certifications:

AWS Certified Solutions Architect – Associate
Certified ScrumMaster (CSM)
Projects:

E-Commerce Platform Development

Led the end-to-end development of a scalable e-commerce platform using Django and React.
Integrated payment gateways and third-party APIs to enhance functionality.
Mobile Health App

Developed a mobile health application using Java and Android SDK, enabling users to track their health metrics and receive personalized insights.
Professional Affiliations:

Member, Association for Computing Machinery (ACM)
Member, IEEE Computer Society
Languages:

English (Native)
Spanish (Professional Proficiency)
"""

In [26]:
extracted_text = extracted_text.replace("\n"," ")
extracted_text = extracted_text.replace("[^a-zA-Z0-9]", " ");  
re.sub('\W+','', extracted_text)
extracted_text = extracted_text.lower()
extracted_text = extracted_text.strip()
print(extracted_text)

john doe  contact information:  email: john.doe@example.com phone: +1 (555) 123-4567 linkedin: linkedin.com/in/johndoe github: github.com/johndoe address: 1234 elm street, apt 101, springfield, il 62704 professional summary:  highly skilled software engineer with 8+ years of experience in software development, specializing in web and mobile application development. proficient in multiple programming languages, including python, javascript, and java. adept at leading development teams, optimizing system performance, and implementing innovative solutions to complex problems. seeking to leverage technical expertise and leadership skills to contribute to a dynamic development team.  professional experience:  senior software engineer xyz inc, springfield, il june 2020 – present  lead the development and implementation of a new e-commerce platform, resulting in a 30% increase in online sales. designed and implemented restful apis to enhance functionality and integrate third-party services. o

In [27]:
doc = nlp(extracted_text)

In [28]:
displacy.render(doc, style="ent", jupyter=True)

### Using re

In [29]:
import re

def extract_contact_number_from_resume(text):
    contact_number = None

    # Use regex pattern to find a potential contact number
    pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
#     pattern = r"\+62\s?\d{3}\s?\d{4}\s?\d{4}"
    match = re.search(pattern, text)
    if match:
        contact_number = match.group()

    return contact_number

In [30]:
extract_contact_number_from_resume(extracted_text)

'1 (555) 123-4567'

In [31]:
import re

def extract_email_from_resume(text):
    email = None

    # Use regex pattern to find a potential email address
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    match = re.search(pattern, text)
    if match:
        email = match.group()

    return email

In [32]:
extract_email_from_resume(extracted_text)

'john.doe@example.com'

In [33]:
import re

def extract_skills_from_resume(text, skills_list):
    skills = []

    for skill in skills_list:
        pattern = r"\b{}\b".format(re.escape(skill))
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            skills.append(skill)

    return skills

In [34]:
extract_skills_from_resume(extracted_text, ['Python', 'Data Analysis', 'Machine Learning', 'Communication', 'Project Management', 'Deep Learning', 'SQL', 'Tableau'])

['Python']

In [36]:
import re

def extract_education_from_resume(text):
    education = []

    # List of education keywords to match against
    education_keywords = ['Bsc', 'B. Pharmacy', 'B Pharmacy', 'Msc', 'M. Pharmacy', 'Ph.D', 'Bachelor', 'Master', 'Undergraduate']

    for keyword in education_keywords:
        pattern = r"(?i)\b{}\b".format(re.escape(keyword))
        match = re.search(pattern, text)
        if match:
            education.append(match.group())

    return education

In [37]:
extract_education_from_resume(extracted_text)

['bachelor', 'master']

In [38]:
import spacy
from spacy.matcher import Matcher

def extract_name(resume_text):
    nlp = spacy.load('en_core_web_lg')
    matcher = Matcher(nlp.vocab)

    # Define name patterns
    patterns = [
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name, Middle name, and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]  # First name, Middle name, Middle name, and Last name
        # Add more patterns as needed
    ]

    for pattern in patterns:
        matcher.add('NAME', patterns=[pattern])

    doc = nlp(resume_text)
    matches = matcher(doc)

    # Function to merge overlapping matches
    def merge_matches(matches, doc):
        merged_spans = []
        for match_id, start, end in matches:
            span = doc[start:end]
            if not merged_spans or merged_spans[-1].end <= span.start:
                merged_spans.append(span)
            else:
                # Merge with the previous span if they overlap
                merged_spans[-1] = doc[merged_spans[-1].start:span.end]
        return merged_spans

    # Merge overlapping matches
    merged_matches = merge_matches(matches, doc)

    for span in merged_matches:
        return span.text

    return None

In [40]:
extract_name(extracted_text)

'john doe'