In [144]:
import os
import fitz
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

In [131]:
def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    text = ''
    for page in doc:
        text += page.get_text()
    return text

In [132]:
def get_labels(text):
    if re.search(r'education', text, re.IGNORECASE):
        return 1
    elif re.search(r'experience', text, re.IGNORECASE):
        return 2
    elif re.search(r'skills', text, re.IGNORECASE):
        return 3
    else:
        return 0

In [133]:
def process_data(folder_path):
    texts = []
    labels = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path) and file_path.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
            label = get_labels(text)
            if label != 0:
                texts.append(text)
                labels.append(label)
    return texts, labels

In [134]:
pdf_folder = 'data/ALL RESUMES'
texts, labels = process_data(pdf_folder)

In [135]:
max_words = 1000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 13573 unique tokens.


In [136]:
max_sequence_length = max(len(seq) for seq in sequences)
data = pad_sequences(sequences, maxlen=max_sequence_length)
dataset = pd.read_csv('data/ALL RESUMES/ALL-RESUMES.csv')
labels = dataset.iloc[:, -1].values

In [137]:
# Convert integer labels to one-hot encoded labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(labels)
labels = le.transform(labels)

In [138]:
print(labels)

[ 0  0  0  0  0  0  0  0  0  1  1  1  1  1  1  1  1  1  1  2  2  2  2  2
  2  2  2  2  2  3  3  3  3  3  3  3  3  3  3  4  4  4  4  4  4  4  4  4
  4  5  5  5  5  5  5  5  5  5  5  6  6  6  6  6  6  6  6  6  6  8  8  8
  8  8  8  8  8  8  8  7  7  7  7  7  7  7  7  7  7  9  9  9  9  9  9  9
  9  9  9 10 10 10 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 11 11 12
 12 12 12 12 12 12 12 12 12 13 13 13 13 13 13 13 13 13 13 14 14 14 14 14
 14 14 14 14 14 15 15 15 15 15 15 15 15 15 15 16 16 16 16 16 16 16 16 16
 16 17 17 17 17 17 17 17 17 17 17 19 19 19 19 19 19 19 19 19 19 18 18 18
 18 18 18 18 18 18 18 20 20 20 20 20 20 20 20 20 20 21 21 21 21 21 21 21
 21 21 21 22 22 22 22 22 22 22 22 22 22 23 23 23 23 23 23 23 23 23 23 24]


In [139]:
embedding_dim = 100
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_sequence_length),
    LSTM(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # 1 unit for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [140]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 5166, 100)         100000    
                                                                 
 lstm_11 (LSTM)              (None, 64)                42240     
                                                                 
 dense_21 (Dense)            (None, 64)                4160      
                                                                 
 dropout_11 (Dropout)        (None, 64)                0         
                                                                 
 dense_22 (Dense)            (None, 1)                 65        
                                                                 
Total params: 146465 (572.13 KB)
Trainable params: 146465 (572.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [141]:
# Train the model
epochs = 10
batch_size = 32
model.fit(data, labels, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x249c7555d10>

In [146]:
y_pred = model.predict(data)



In [147]:
accuracy_score(labels, y_pred)

0.041666666666666664