Spacy Train Dataset

In [None]:
import srsly
import typer
import warnings
from pathlib import Path

import spacy
from spacy.tokens import DocBin

def convert(lang: str, TRAIN_DATA, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in TRAIN_DATA:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                warnings.warn(msg)
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)


convert("en", TRAIN_DATA, "/content/train.spacy")
convert("en", TRAIN_DATA, "/content/valid.spacy")

In [None]:
!pip install spacy-transformers

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py

In [None]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from en-core-web-trf==3.7.3)
  Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_tokenizers-0.0.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl (236 kB)
[2K   [90m━━━━━━━━━━

In [None]:
!python -m spacy init fill-config /content/base_config.cfg /content/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy debug data config.cfg

[1m
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: en
Training pipeline: tok2vec, ner
50000 training docs
50000 evaluation docs
[38;5;3m⚠ 50000 training examples also in evaluation data[0m
[1m
[38;5;4mℹ 2917000 total word(s) in the data (522 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m
[38;5;4mℹ 5 label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[38;5;2m✔ No entities crossing sentence boundaries[0m
[1m
[38;5;2m✔ 6 checks passed[0m


In [None]:
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./valid.spacy

[38;5;4mℹ No output directory provided[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     31.27    0.07    0.22    0.04    0.00
  0     200        422.10   1412.85   95.52   95.74   95.30    0.96
  0     400        446.02    187.21   99.35   99.52   99.19    0.99
  0     600        107.48     41.60   99.63   99.76   99.49    1.00
  0     800         22.51     19.01   99.65   99.80   99.51    1.00
  0    1000         84.91     70.16   99.65   99.80   99.51    1.00
  0    1200         77.03     65.73   99.14   99.22   99.06    0.99
  0    1400        112.41     61.15   99.65   99.80   99.51    1.00
  0    1600        466.92     79.78   99.38   99.52   99.24    0.99
  0    1800        519.38    111.04   99.64   99.80 

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!mv /content/model-best /content/drive/MyDrive/Umar

In [None]:
!pip install spacy-transformers

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py

In [None]:
import spacy
import numpy as np
from spacy import displacy

# Load the trained SpaCy model
trained_nlp = spacy.load("/content/drive/MyDrive/Umar/model-best")

# Define job description and resume
job = """Looking for a web developer with expertise in python programming."""
resume = """I am experienced web developer with skills Python, Java and Tensorflow."""

# Process job description and resume
job_description = trained_nlp(job)
resume_summary = trained_nlp(resume)

# Function to extract unique entities of all labels
def get_unique_entities(doc):
    unique_entities = set()
    for ent in doc.ents:
        unique_entities.add(ent.text.lower())  # Use lowercase for consistency
    return list(unique_entities)

# Extract unique entities from job and resume (all entity types)
job_entities = get_unique_entities(job_description)
resume_entities = get_unique_entities(resume_summary)

# Display entities with displaCy for visualization
print("Job Description Entities (Visualized):")
displacy.render(job_description, style="ent", jupyter=True)

print("\nResume Summary Entities (Visualized):")
displacy.render(resume_summary, style="ent", jupyter=True)

# Print unique entities
print("\nUnique Job Entities (All Types):", job_entities)
print("Unique Resume Entities (All Types):", resume_entities)


Job Description Entities (Visualized):



Resume Summary Entities (Visualized):



Unique Job Entities (All Types): ['expertise', 'web developer']
Unique Resume Entities (All Types): ['skills python', 'tensorflow']


.

.

.


**Cosine Similarity**

In [None]:
# Use these entities for cosine similarity, focusing on all entities for broader matching
def compute_entity_similarity(job_ents, resume_ents, nlp):
    if not job_ents or not resume_ents:
        return 0.0, [], []  # Return 0 similarity if no entities

    # Get vectors for all entities
    job_ent_vecs = [nlp(ent).vector for ent in job_ents]
    resume_ent_vecs = [nlp(ent).vector for ent in resume_ents]

    # Compute average vectors for all entities
    job_avg_vec = np.mean(job_ent_vecs, axis=0)
    resume_avg_vec = np.mean(resume_ent_vecs, axis=0)

    # Calculate cosine similarity
    if np.linalg.norm(job_avg_vec) == 0 or np.linalg.norm(resume_avg_vec) == 0:
        return 0.0, job_ents, resume_ents  # Handle zero vectors
    cos_sim_score = np.dot(resume_avg_vec, job_avg_vec) / (
        np.linalg.norm(resume_avg_vec) * np.linalg.norm(job_avg_vec)
    )

    return cos_sim_score, job_ents, resume_ents

# Compute and print cosine similarity for all entities
similarity_score, unique_job_ents, unique_resume_ents = compute_entity_similarity(job_entities, resume_entities, trained_nlp)
print(f"\nCosine Similarity between all entities: {similarity_score:.4f}")

# Semantic interpretation based on similarity threshold
if similarity_score > 0.5:
    print("Semantic Match: High similarity detected for entities.")
else:
    print("Semantic Mismatch: Low similarity between entities.")


Cosine Similarity between all entities: 0.7839
Semantic Match: High similarity detected for entities.


.

.

.

.

**Semantic analysis**

In [None]:
!python -m spacy download en_core_web_sm spacy


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
from spacy.tokens import DocBin
import random

nlp_pretrained = spacy.load("en_core_web_sm")
print(len(TRAIN_DATA))
TRAIN_DATA = TRAIN_DATA
def augment_with_dependencies(data):
    i=0
    augmented_data = []
    for text, annot in data:
        print(i)
        i+=1
        doc = nlp_pretrained(text)
        entities = annot["entities"]
        deps = [(token.i, token.dep_, token.head.i) for token in doc]
        augmented_data.append((text, {"entities": entities, "deps": deps}))
    return augmented_data

TRAIN_DATA_WITH_DEPS = augment_with_dependencies(TRAIN_DATA)

random.seed(42)
random.shuffle(TRAIN_DATA_WITH_DEPS)
train_size = int(0.8 * len(TRAIN_DATA_WITH_DEPS))
train_data = TRAIN_DATA_WITH_DEPS[:train_size]
valid_data = TRAIN_DATA_WITH_DEPS[train_size:]

nlp = spacy.blank("en")
doc_bin_train = DocBin()
doc_bin_valid = DocBin()

for text, annot in train_data:
    doc = nlp.make_doc(text)
    doc.ents = [doc.char_span(start, end, label) for start, end, label in annot["entities"]]
    for token_idx, dep, head_idx in annot["deps"]:
        doc[token_idx].dep_ = dep
        doc[token_idx].head = doc[head_idx]
    doc_bin_train.add(doc)

for text, annot in valid_data:
    doc = nlp.make_doc(text)
    doc.ents = [doc.char_span(start, end, label) for start, end, label in annot["entities"]]
    for token_idx, dep, head_idx in annot["deps"]:
        doc[token_idx].dep_ = dep
        doc[token_idx].head = doc[head_idx]
    doc_bin_valid.add(doc)

doc_bin_train.to_disk("/content/semantics_train.spacy")
doc_bin_valid.to_disk("/content/semantics_valid.spacy")

print("Dataset for semantic analysis created and saved as .spacy files.")

50000
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275


KeyboardInterrupt: 

In [None]:
import spacy
from spacy.tokens import DocBin

doc_bin = DocBin().from_disk("/content/semantics_train.spacy")
nlp = spacy.blank("en")
docs = list(doc_bin.get_docs(nlp.vocab))

for doc in docs[:2]:  # First 5 examples
    print("Text:", doc.text)
    print("Entities:", [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents])
    print("Dependencies:")
    for token in doc:
        print(f" - {token.text}: dep={token.dep_}, head={token.head.text}")

Text: Full-time role in Dublin . Serious applicants only! Looking for an Scala Developer with Diploma in Physics degree and 5.5 years knowledge of Unity . Requirements: Expertise in Unity , conversion optimization skills . 📩 Email resume and portfolio to careers@digital.io by 05-11-2025 with subject "Scala Developer" .
Entities: [('Dublin', 18, 24, 'LOC'), ('Scala Developer', 67, 82, 'JOBTITLE'), ('Diploma in Physics', 88, 106, 'EDUCATION'), ('5.5 years', 118, 127, 'EXPERIENCE'), ('Unity', 141, 146, 'TECH'), ('Unity', 176, 181, 'TECH')]
Dependencies:
 - Full: dep=amod, head=time
 - -: dep=punct, head=time
 - time: dep=compound, head=role
 - role: dep=ROOT, head=role
 - in: dep=prep, head=role
 - Dublin: dep=pobj, head=in
 - .: dep=punct, head=role
 - Serious: dep=amod, head=applicants
 - applicants: dep=ROOT, head=applicants
 - only: dep=advmod, head=applicants
 - !: dep=punct, head=applicants
 - Looking: dep=ROOT, head=Looking
 - for: dep=prep, head=Looking
 - an: dep=det, head=Develo

In [None]:
!pip install spacy-lookups-data



In [None]:
!python -m spacy init fill-config /content/semantics_base_config.cfg /content/semantics_config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/semantics_config.cfg
You can now add your data and train your pipeline:
python -m spacy train semantics_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy init fill-config /content/semantics_base_config.cfg semantics_config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
semantics_config.cfg
You can now add your data and train your pipeline:
python -m spacy train semantics_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy debug data semantics_config.cfg

[1m
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: en
Training pipeline: tok2vec, parser
40000 training docs
10000 evaluation docs
[38;5;2m✔ No overlap between training and evaluation data[0m
[1m
[38;5;4mℹ 2334743 total word(s) in the data (522 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m
[38;5;4mℹ Found 219990 sentence(s) with an average length of 10.6 words.[0m
[38;5;4mℹ 29 label(s) in train data[0m
[38;5;4mℹ 29 label(s) in projectivized train data[0m
[1m
[38;5;2m✔ 3 checks passed[0m


In [None]:
!python -m spacy train /content/semantics_config.cfg --output ./semantics_model --verbose

[38;5;2m✔ Created output directory: semantics_model[0m
[38;5;4mℹ Saving to output directory: semantics_model[0m
[38;5;4mℹ Using CPU[0m
[1m
[2025-03-02 23:55:59,299] [INFO] Set up nlp object from config
[2025-03-02 23:55:59,457] [DEBUG] Loading corpus from path: /content/semantics_valid.spacy
[2025-03-02 23:55:59,460] [DEBUG] Loading corpus from path: /content/semantics_train.spacy
[2025-03-02 23:55:59,460] [INFO] Pipeline: ['tok2vec', 'parser']
[2025-03-02 23:55:59,468] [DEBUG] Loading lookups from spacy-lookups-data: ['lexeme_norm']
[2025-03-02 23:55:59,485] [INFO] Added vocab lookups: lexeme_norm
[2025-03-02 23:55:59,486] [INFO] Created vocabulary
[2025-03-02 23:56:00,533] [INFO] Added vectors: en_core_web_sm
[2025-03-02 23:56:00,534] [INFO] Finished initializing nlp object
[2025-03-02 23:57:01,189] [INFO] Initialized pipeline components: ['tok2vec', 'parser']
[38;5;2m✔ Initialized pipeline[0m
[1m
[2025-03-02 23:57:01,219] [DEBUG] Loading corpus from path: /content/semantic

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!mv /content/semantics_model /content/drive/MyDrive/Umar

In [None]:
import spacy
import numpy as np
from spacy import displacy

# Load NER + similarity model
nlp_ner = spacy.load("/content/drive/MyDrive/Umar/model-best")

# Load semantic analysis model (if trained with parser)
nlp_semantic = spacy.load("/content/drive/MyDrive/Umar/semantics_model/model-best")

# Resume and job description
resume_text = """
Jane Smith
Data Scientist
Los Angeles, CA 90001
jane.smith@email.com | (555) 123-4567

Professional Summary
Accomplished Data Scientist with 4 years of experience in Python, R, and TensorFlow, based in Los Angeles. Proficient in building machine learning models and analyzing big data with Hadoop. Seeking to leverage expertise in data-driven solutions for innovative projects.

Education
Master of Science in Data Science
University of California, Berkeley, CA
Graduated: May 2019

Bachelor of Science in Statistics
Harvard University, Cambridge, MA
Graduated: May 2017

Professional Experience
Senior Data Scientist
Amazon, Seattle, WA
July 2020 - Present
- Developed predictive models using Python and TensorFlow, increasing accuracy by 15%.
- Analyzed large datasets with Hadoop and Spark, reducing processing time by 20%.
- Collaborated with teams in Los Angeles and New York to deploy AI solutions on AWS.

Data Analyst
Google, Mountain View, CA
June 2017 - June 2020
- Built data pipelines using R and SQL, improving reporting efficiency by 25%.
- Utilized PyTorch for deep learning projects, achieving 90% model performance.
- Worked remotely from San Francisco for 1.5 years, focusing on Python-based analytics.

Technical Skills
Programming Languages: Python, R, JavaScript, SQL
Frameworks/Libraries: TensorFlow, PyTorch, Hadoop, Spark
Databases: PostgreSQL, MongoDB
Cloud Platforms: AWS, Azure
Tools: Git, Docker, Jupyter Notebook

Certifications
Certified Machine Learning Engineer
TensorFlow, Issued: August 2021

Google Data Analytics Certificate
Google, Issued: March 2020

Languages
English (Native), Spanish (Intermediate)

Projects
Customer Sentiment Analysis Tool
- Developed with Python and TensorFlow, hosted on AWS, completed in Los Angeles, June 2022.
- Integrated R scripts for statistical analysis, improving prediction accuracy by 18%.

Big Data Processing Pipeline
- Built using Hadoop and Spark, deployed in Seattle, completed in December 2021.
- Achieved 30% performance boost with PyTorch optimization.
"""

job_text = """
Remote full-time position. Urgent hire needed! Looking for a Data Scientist with a Master of Science degree and 3+ years of experience in Python and TensorFlow, based in Los Angeles. Requirements: Expertise in Python, strong analytical skills with Hadoop, and proficiency in Spark. 📩 Apply by emailing your resume to careers@company.com with subject "Data Scientist" by 15-11-2025.
"""

# Process with NER model
resume_doc_ner = nlp_ner(resume_text)
job_doc_ner = nlp_ner(job_text)

# Function to extract unique entities of all labels
def get_unique_entities(doc):
    unique_entities = set()
    for ent in doc.ents:
        unique_entities.add(ent.text.lower())  # Use lowercase for consistency
    return list(unique_entities)

# Extract unique entities
job_entities = get_unique_entities(job_doc_ner)
resume_entities = get_unique_entities(resume_doc_ner)

# Display entities with displaCy
print("Job Description Entities (Visualized):")
displacy.render(job_doc_ner, style="ent", jupyter=True)

print("\nResume Entities (Visualized):")
displacy.render(resume_doc_ner, style="ent", jupyter=True)

print("\nUnique Job Entities (All Types):", job_entities)
print("Unique Resume Entities (All Types):", resume_entities)

# Compute cosine similarity for all entities (focusing on TECH for relevance)
def compute_entity_similarity(job_ents, resume_ents, nlp):
    if not job_ents or not resume_ents:
        return 0.0, [], []
    job_ent_vecs = [nlp(ent).vector for ent in job_ents]
    resume_ent_vecs = [nlp(ent).vector for ent in resume_ents]
    job_avg_vec = np.mean(job_ent_vecs, axis=0)
    resume_avg_vec = np.mean(resume_ent_vecs, axis=0)
    if np.linalg.norm(job_avg_vec) == 0 or np.linalg.norm(resume_avg_vec) == 0:
        return 0.0, job_ents, resume_ents
    cos_sim_score = np.dot(resume_avg_vec, job_avg_vec) / (
        np.linalg.norm(resume_avg_vec) * np.linalg.norm(job_avg_vec)
    )
    return cos_sim_score, job_ents, resume_ents

similarity_score, _, _ = compute_entity_similarity(job_entities, resume_entities, nlp_ner)
print(f"\nCosine Similarity between all entities: {similarity_score:.4f}")
if similarity_score > 0.5:
    print("Semantic Match: High similarity detected for entities.")
else:
    print("Semantic Mismatch: Low similarity between entities.")

# Semantic analysis with parser (if available)
if "parser" in nlp_semantic.pipe_names:
    resume_doc_sem = nlp_semantic(resume_text)
    job_doc_sem = nlp_semantic(job_text)

    print("\nResume Semantic Relationships:")
    for sent in resume_doc_sem.sents:
        for token in sent:
            if token.text.lower() in [ent.lower() for ent in job_entities + resume_entities if ent.lower() in ["python", "tensorflow", "hadoop", "spark", "r"]]:  # Focus on TECH
                print(f" - Token: {token.text}, Dependency: {token.dep_}, Head: {token.head.text}")

    print("\nJob Description Semantic Relationships:")
    for sent in job_doc_sem.sents:
        for token in sent:
            if token.text.lower() in [ent.lower() for ent in job_entities + resume_entities if ent.lower() in ["python", "tensorflow", "hadoop", "spark"]]:  # Focus on TECH
                print(f" - Token: {token.text}, Dependency: {token.dep_}, Head: {token.head.text}")

Job Description Entities (Visualized):



Resume Entities (Visualized):



Unique Job Entities (All Types): ['hadoop', 'spark', 'los angeles', 'tensorflow', 'a master of science', '3+ years', 'python', 'data scientist']
Unique Resume Entities (All Types): ['javascript', 'teams', 'certified machine', 'docker', 'azure', 'data processing', 'native)', 'jane smith', 'hadoop', '90% model', 'education\n', 'new york to', 'jupyter notebook', 'sql,', 'statistics', 'berkeley', 'seattle', 'tensorflow,', 'harvard university', 'professional experience\nsenior data scientist', 'bachelor of science', 'los angeles', 'tensorflow', 'r', 'intermediate', 'google data', 'google', 'master of science', '4 years', 'accomplished data', 'spark', 'aws', '1.5 years', 'projects\ncustomer sentiment', 'python', 'university of california,', 'certifications\n', 'professional summary', 'december 2021', 'amazon', 'data', 'pytorch']

Cosine Similarity between all entities: 0.9694
Semantic Match: High similarity detected for entities.

Resume Semantic Relationships:
 - Token: Python, Dependency: