In [3]:
pip install gradio PyMuPDF

Collecting gradio
  Downloading gradio-5.25.2-py3-none-any.whl.metadata (16 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloadin

In [4]:
from google.colab import drive
# Drop any existing mount first so the mount below starts from a clean state
# (a no-op when Drive is not mounted yet).
drive.flush_and_unmount()
# Mount Google Drive; later cells read the vocabulary pickle and the saved Keras
# model from paths under /content/drive.
drive.mount('/content/drive')

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [5]:
# Lemmatizing the text
# Importing the required libraries
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize

In [6]:
# Download the NLTK resources used by the lemmatization helpers below.
nltk.download('wordnet')                 # WordNet dictionary
nltk.download('omw-1.4')                 # Multilingual WordNet support
nltk.download('averaged_perceptron_tagger')  # POS tagger
# Tokenizer data — presumably what nltk.sent_tokenize / nltk.word_tokenize rely on.
nltk.download('punkt')
# NOTE(review): 'punkt_tab' and 'averaged_perceptron_tagger_eng' look like the
# resource names newer NLTK releases expect; both spellings are fetched here.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [7]:
def remove_punctuations(sentence):
    r"""Return ``sentence`` with punctuation removed.

    Every character that is neither a word character (``\w``: letters, digits,
    underscore) nor whitespace is deleted; all remaining characters keep their
    original order.
    """
    import re

    not_word_or_space = r'[^\w\s]'
    return re.sub(not_word_or_space, '', sentence)

In [8]:
def preprocess(sentence):
  """Normalise raw text: lower-case it, then strip punctuation."""
  lowered = sentence.lower()
  return remove_punctuations(lowered)

In [9]:
# Given a single word, return its part of speech as a WordNet POS constant
def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other tag falls back to ``wordnet.NOUN``.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag both resolve to NOUN.
    return wordnet.NOUN

In [10]:
# Lemmatizing the sentence
def lemmatize_sentence(sentence):
    """Return the lemmas of ``sentence`` as one flat list of tokens.

    The text is lower-cased and stripped of punctuation, split into sentences
    and then into words; each word is lemmatized with the POS reported by
    ``get_wordnet_pos``. Lemmas from all sentences are concatenated in order.
    """
    lemmatizer = WordNetLemmatizer()
    cleaned = preprocess(sentence)

    lemmas = []
    for chunk in nltk.sent_tokenize(cleaned):
        # preprocess() already removed punctuation; this second pass is kept
        # as-is and is a no-op on already-clean text.
        tokens = nltk.word_tokenize(remove_punctuations(chunk))
        for token in tokens:
            lemmas.append(lemmatizer.lemmatize(token, pos=get_wordnet_pos(token)))
    return lemmas

In [14]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
class PositionalEncoding(layers.Layer):
    """Layer that adds a fixed sinusoidal position table to its input.

    The table is built once in ``__init__`` for ``max_len`` positions and
    ``d_model`` channels. Its channel layout is the sine of the even-indexed
    angle columns followed by the cosine of the odd-indexed ones (concatenated,
    not interleaved).
    """

    def __init__(self, max_len, d_model):
        super().__init__()
        positions = tf.cast(tf.range(max_len)[:, tf.newaxis], tf.float32)
        # Channel indices are cast to float32 so the floor-division and the
        # exponent stay in the same dtype as tf.pow's result.
        channels = tf.cast(tf.range(d_model)[tf.newaxis, :], tf.float32)
        exponents = (2 * (channels // 2)) / tf.cast(d_model, tf.float32)
        angle_rads = positions * (1 / tf.pow(10000., exponents))

        sin_part = tf.math.sin(angle_rads[:, 0::2])
        cos_part = tf.math.cos(angle_rads[:, 1::2])
        table = tf.concat([sin_part, cos_part], axis=-1)
        # Leading axis of size 1 lets the table broadcast over the batch.
        self.pos_encoding = table[tf.newaxis, ...]

    def call(self, x):
        """Add the first ``tf.shape(x)[1]`` rows of the table to ``x``."""
        seq_len = tf.shape(x)[1]
        return x + self.pos_encoding[:, :seq_len, :]

In [15]:
class PositionalEncoding_1(layers.Layer):
    """Layer that adds an interleaved sinusoidal position table to its input.

    Even channel indices hold the sine of the angle and odd channel indices the
    cosine. The table covers ``max_len`` positions and ``d_model`` channels and
    is computed once at construction.
    """

    def __init__(self, max_len, d_model):
        super().__init__()
        positions = tf.range(max_len, dtype=tf.float32)[:, tf.newaxis]
        channels = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]

        exponents = (2 * (channels // 2)) / tf.cast(d_model, tf.float32)
        angle_rads = positions * (1 / tf.pow(10000.0, exponents))

        # Select sin for even channel indices and cos for odd ones.
        even_channel = tf.equal(channels % 2, 0)
        table = tf.where(even_channel, tf.sin(angle_rads), tf.cos(angle_rads))
        # Leading axis of size 1 lets the table broadcast over the batch.
        self.pos_encoding = table[tf.newaxis, ...]

    def call(self, x):
        """Add the first ``tf.shape(x)[1]`` rows of the table to ``x``."""
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

In [None]:
def TransformerEncoderClassifier(vocab_size, d_model, num_heads, d_ff, num_layers, num_classes, max_len):
    """Build a Transformer-encoder classifier as a functional Keras model.

    Args:
        vocab_size: Size of the token embedding table.
        d_model: Width of the embeddings and of every encoder block.
        num_heads: Number of attention heads; each head uses ``d_model // num_heads`` as key_dim.
        d_ff: Hidden width of the position-wise feed-forward sub-layer.
        num_layers: Number of encoder blocks stacked.
        num_classes: Number of output logits.
        max_len: Number of positions in the positional-encoding table.

    Returns:
        A ``models.Model`` taking a batch of token-id sequences and returning
        ``[logits, encoder_output]``: the classifier logits computed from the
        average-pooled sequence, and the per-token encoder output.
    """
    inputs = layers.Input(shape=(None,))
    x = layers.Embedding(vocab_size, d_model)(inputs)
    x = PositionalEncoding(max_len, d_model)(x)

    for _ in range(num_layers):
        # Self-attention sub-layer. The attention output is added back to its
        # input (residual connection) before normalisation; previously it
        # replaced x outright, unlike the feed-forward sub-layer below and
        # unlike TransformerEncoderClassifier_1.
        attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)(x, x)
        x = layers.Add()([x, attn_output])
        x = layers.LayerNormalization()(x)

        # Position-wise feed-forward sub-layer with its residual connection.
        ff_output = layers.Dense(d_ff, activation='relu')(x)
        ff_output = layers.Dense(d_model)(ff_output)
        x = layers.Add()([x, ff_output])
        x = layers.LayerNormalization()(x)

    encoder_output = x
    pooled = layers.GlobalAveragePooling1D()(encoder_output)
    logits = layers.Dense(num_classes)(pooled)

    return models.Model(inputs=inputs, outputs=[logits, encoder_output])

In [16]:
def TransformerEncoderClassifier_1(vocab_size, d_model, num_heads, d_ff, num_layers, num_classes, max_len):
    """Build the named-layer Transformer-encoder classifier.

    Each of the ``num_layers`` blocks is self-attention followed by a two-layer
    feed-forward network; both sub-layers are wrapped in a residual Add and a
    LayerNormalization. The returned model outputs ``[logits, encoder_output]``,
    where the logits come from the average-pooled encoder output.
    """
    token_input = layers.Input(shape=(None,), name="input_tokens")
    hidden = layers.Embedding(vocab_size, d_model, name="token_embedding")(token_input)
    hidden = PositionalEncoding_1(max_len, d_model)(hidden)

    head_dim = d_model // num_heads
    for block in range(num_layers):
        # Self-attention sub-layer (query and value are both `hidden`).
        attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=head_dim, name=f"mha_{block}")
        attended = attention(hidden, hidden)
        hidden = layers.Add()([hidden, attended])
        hidden = layers.LayerNormalization(name=f"attn_norm_{block}")(hidden)

        # Feed-forward sub-layer: expand to d_ff, project back to d_model.
        expanded = layers.Dense(d_ff, activation='relu', name=f"ffn_{block}_1")(hidden)
        projected = layers.Dense(d_model, name=f"ffn_{block}_2")(expanded)
        hidden = layers.Add()([hidden, projected])
        hidden = layers.LayerNormalization(name=f"ffn_norm_{block}")(hidden)

    encoder_output = hidden
    pooled_output = layers.GlobalAveragePooling1D(name="global_avg_pool")(encoder_output)
    logits = layers.Dense(num_classes, name="classifier_logits")(pooled_output)

    return models.Model(
        inputs=token_input,
        outputs=[logits, encoder_output],
        name="TransformerEncoderClassifier_1",
    )

In [18]:
import pickle

def load_variable(pkl_variable):
  """Return the object stored in the pickle file at path ``pkl_variable``.

  NOTE: unpickling can execute arbitrary code, so only point this at files
  you trust.
  """
  # Pickle data is binary, hence 'rb'.
  with open(pkl_variable, 'rb') as handle:
    return pickle.load(handle)

In [19]:
# Word -> token-id mapping read from Drive; get_token_id() looks words up in it
# and uses len(vocab) as the id for unknown words and padding.
vocab = load_variable('/content/drive/MyDrive/NLP/vocab.pkl')

In [20]:
def get_token_id(vocab, lemmatize_sentence, max_size):
  """Convert a list of lemmas into a fixed-width batch of token ids.

  Args:
    vocab: Mapping from word to integer token id.
    lemmatize_sentence: Iterable of lemmatized words. (The parameter name
      shadows the ``lemmatize_sentence`` function inside this body only.)
    max_size: Exact number of ids in the returned row.

  Returns:
    ``tf.constant`` holding a single row of exactly ``max_size`` ids. Words
    missing from ``vocab`` map to ``len(vocab)``, which is also the padding id.
  """
  unk_id = len(vocab)  # shared id for out-of-vocabulary words and padding
  token_id = [vocab.get(word, unk_id) for word in lemmatize_sentence]
  # Truncate over-long inputs: downstream code picks the encoder by the exact
  # sequence width, so the row must never exceed max_size.
  token_id = token_id[:max_size]
  # Right-pad short inputs up to max_size.
  token_id.extend([unk_id] * (max_size - len(token_id)))
  return tf.constant([token_id])

In [None]:
# NOTE(review): scratch cell — it was never executed, and sentence_1 / sentence_2
# are not defined at notebook scope in the visible cells (they only exist as
# locals inside similarity_score / read_pdf), so this fails on a fresh kernel.
# The widths 45 and 105 match the ones used in similarity_score().
token_id_1 = get_token_id(vocab, sentence_1, 45)
token_id_2 = get_token_id(vocab, sentence_2, 105)

In [21]:
def get_embeddings(vocab, token_id):
    """Return per-token encoder outputs for ``token_id``.

    A new ``TransformerEncoderClassifier`` is constructed on every call (sized
    from ``vocab`` and from the width of ``token_id``), and only its second
    output — the encoder output — is predicted.
    """
    classifier = TransformerEncoderClassifier(
        vocab_size=2*len(vocab),
        d_model=256,
        num_heads=4,
        d_ff=128,
        num_layers=2,
        num_classes=2,
        max_len=5*token_id.shape[1],
    )
    # The classifier outputs [logits, encoder_output]; keep the latter only.
    _, encoder_output = classifier.output
    encoder_only = tf.keras.Model(inputs=classifier.input, outputs=encoder_output)
    return encoder_only.predict(token_id)

In [22]:
# Encoder selected by get_embeddings_1() for inputs that are exactly 45 tokens wide.
# The chained assignment also binds the global name `model` to this same object.
# NOTE(review): no weights are loaded in the visible cells, so this encoder
# appears to run with its initial (random) weights — confirm that is intended.
# NOTE(review): vocab_size is hard-coded; get_token_id() emits ids up to
# len(vocab), so this assumes len(vocab) < 2000 — TODO confirm.
model_1 = model = TransformerEncoderClassifier_1(
        vocab_size=2000,
        d_model=256,
        num_heads=4,
        d_ff=128,
        num_layers=4,
        num_classes=2,
        max_len=5 * 45
    )

In [23]:
# Encoder selected by get_embeddings_1() for every input that is not 45 tokens wide.
# The chained assignment rebinds the global name `model` to this object.
# NOTE(review): this is a separate instance from model_1 and no weights are
# loaded in the visible cells, so the two encoders presumably do not share
# weights — confirm that is intended.
# NOTE(review): vocab_size is hard-coded; assumes len(vocab) < 2000 — TODO confirm.
model_2 = model = TransformerEncoderClassifier_1(
        vocab_size=2000,
        d_model=256,
        num_heads=4,
        d_ff=128,
        num_layers=4,
        num_classes=2,
        max_len=5 * 105
    )

In [24]:
def get_embeddings_1(vocab, token_id):
    """Return the average-pooled encoder embedding for ``token_id``.

    Inputs exactly 45 tokens wide go through ``model_1``; every other width
    goes through ``model_2``. ``vocab`` is accepted but not used here.
    """
    model = model_1 if token_id.shape[1] == 45 else model_2
    # Tap the pooled output; the full sequence would be the encoder output instead.
    pooled_output = model.get_layer("global_avg_pool").output
    embedding_model = tf.keras.Model(inputs=model.input, outputs=pooled_output)
    return embedding_model.predict(token_id)

In [57]:
def load_my_model():
  """Load and return the saved Keras model from Google Drive.

  ``safe_mode`` is passed as the boolean ``False``. The previous value was the
  string ``'False'``, which is a non-empty string and therefore truthy, so the
  intended setting never took effect.

  NOTE: with safe mode disabled, loading may deserialize Python lambdas stored
  in the file, so only load model files you trust.
  """
  model = tf.keras.models.load_model('/content/drive/MyDrive/NLP/NLP/model_10_v1.keras', safe_mode=False)
  return model

In [58]:
def stack_tensors(tensor_list):
    """Stack the tensors in ``tensor_list`` with ``tf.stack`` and return the result."""
    return tf.stack(tensor_list)

In [59]:
def stack_tensors_1(emb, total_size, d_model_val):
  """Stack ``emb`` and reshape the result to ``[total_size, d_model_val]``."""
  stacked = stack_tensors(emb)
  target_shape = [total_size, d_model_val]
  return tf.reshape(stacked, target_shape)

In [66]:
def get_score(resume_emb_stack_1, job_emb_stack_1, check):
    """Return ``1 - prediction`` of the saved model for a resume/job embedding pair.

    The model is loaded through ``load_my_model()`` on every call. ``check`` is
    accepted but not used.
    """
    scorer = load_my_model()
    prediction = scorer.predict([resume_emb_stack_1, job_emb_stack_1])
    return 1 - prediction

In [63]:
def extract_data(resume_data):
  """Build a one-paragraph skills summary from raw resume text.

  The text between the headings "Technical Skills" and "Extracurricular
  Activities" is split into bullet items (lines starting with '•'); each item
  of the form "Category: values" becomes one dictionary entry.

  Args:
    resume_data: Full text of the resume.

  Returns:
    A sentence combining the 'Programming Languages', 'Web Technologies' and
    'Engineering Software' entries.

  Raises:
    ValueError: If the Technical Skills section cannot be located.
    KeyError: If any of the three required categories is missing.
  """
  import re

  pattern = r"Technical Skills(.*?)Extracurricular Activities"
  matches = re.search(pattern, resume_data, re.DOTALL)
  if matches is None:
    # Fail with a clear message instead of an unbound-variable error further down.
    raise ValueError("Technical Skills section not found.")
  technical_skills = matches.group(1).strip()

  # Split only on newlines directly followed by a bullet, so wrapped lines stay
  # attached to the item they belong to.
  skills_list = re.split(r'\n(?=•)', technical_skills)
  cleaned_text = [re.sub(r'\s+', ' ', item).replace('•', '').strip() for item in skills_list]

  result_dict = {}
  for item in cleaned_text:
    if ':' not in item:
      # Not a "Category: values" line (e.g. stray text before the first bullet).
      continue
    key, value = item.split(':', 1)
    result_dict[key.strip()] = value.strip()

  print(result_dict)

  required = ('Programming Languages', 'Web Technologies', 'Engineering Software')
  missing = [name for name in required if name not in result_dict]
  if missing:
    raise KeyError("Missing skill categories in resume: " + ", ".join(missing))

  final_text = "Proficient in " + result_dict['Programming Languages'] + ". Knowns Web Technologies which includes " + result_dict["Web Technologies"] + ". Familiar with " + result_dict["Engineering Software"]
  return final_text

In [64]:
def similarity_score(sentence_1, sentence_2):
  """Score how well a resume summary matches a job description.

  Both texts are lemmatized, converted to token ids (45 wide for the resume,
  105 wide for the job description), embedded with ``get_embeddings_1`` and
  passed to ``get_score``.

  Args:
    sentence_1: Resume summary text.
    sentence_2: Job description text; its last character is read as ``check``.

  Returns:
    Whatever ``get_score`` returns for the two embeddings.
  """
  # The last character of the job description is forwarded to get_score().
  # NOTE(review): that character is not removed from sentence_2, so it is still
  # part of the text that gets lemmatized below — confirm this matches how the
  # saved model's inputs were prepared.
  check = sentence_2[-1]

  sentence_1 = lemmatize_sentence(sentence_1)
  sentence_2 = lemmatize_sentence(sentence_2)

  token_id_1 = get_token_id(vocab, sentence_1, 45)
  token_id_2 = get_token_id(vocab, sentence_2, 105)

  emb_1 = get_embeddings_1(vocab, token_id_1)
  emb_2 = get_embeddings_1(vocab, token_id_2)

  # The embeddings go to the scorer directly; the stacked copies computed here
  # before were never used, and the function-local TensorFlow imports were
  # redundant with the notebook-level ones.
  return get_score(emb_1, emb_2, check)

In [73]:
import gradio as gr
import fitz  # PyMuPDF

# Function to read the PDF file using PyMuPDF
def read_pdf(file):
    """Gradio handler: parse an uploaded resume PDF and score it against the job text.

    Args:
        file: Value supplied by the ``gr.File`` input; its ``.name`` attribute is
            used as the path when present, otherwise the value itself.

    Returns:
        A 2-tuple matching the interface's two outputs: the parsed skills
        sentence and a 0/1 match flag (1 when the score is >= 0.5). On any
        failure the tuple is ``("Error reading PDF: ...", "")`` so that both
        outputs still receive a value.
    """
    if file is None:
        return "Error reading PDF: no file was uploaded", ""

    try:
        pdf_path = getattr(file, "name", file)

        # The context manager closes the document even if extraction fails.
        with fitz.open(pdf_path) as doc:
            resume_data = "".join(page.get_text() for page in doc)

        sentence_1 = extract_data(resume_data)
        sentence_2 = 'Requred a software developer who is proficient in C, Java, Python, JavaScript, PHP and related languages. Knowns Web Technologies which includes HTML, CSS, Django. Familiar with Visual Studio, GitHub, PyCharm, IntelliJ, MySQL, GNU 8085 Simulator.T'
        # sentence_2 = 'As a Personal Trainer, you will design personalized fitness programs that help clients achieve their physical health goals. Your role involves motivating clients to push their limits, providing expert advice on exercise techniques, and offering nutritional guidance. You will work with individuals at different fitness levels, providing support and encouragement to help them improve their strength, endurance, and overall well-being. The role demands a passion for fitness, excellent interpersonal skills, and the ability to inspire others. You will also stay up-to-date with the latest trends in health and fitness to ensure that your training methods remain effective and innovative.F'

        matching_score = similarity_score(sentence_1, sentence_2)
        matching = 1 if matching_score[0][0] >= 0.5 else 0

        return sentence_1, matching

    except Exception as e:
        # Two values are returned because the interface declares two outputs.
        return f"Error reading PDF: {e}", ""

# Build the Gradio UI: one PDF upload in, two text boxes out. read_pdf returns
# (parsed skills sentence, 0/1 match flag), mapped to the outputs in that order.
iface = gr.Interface(
    fn=read_pdf,  # Handler invoked with the uploaded file
    inputs=gr.File(label="Upload a PDF File"),  # File input for uploading PDF
    outputs = [
        gr.Textbox(label="Parsed Text"),  # Sentence built from the resume's skills section
        gr.Textbox(label="Similairty")  # 1 if the match score is >= 0.5, else 0
    ]
)

# Launch the Gradio app (starts the server as a side effect of running this cell)
iface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://50796f8f213085d5d6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


