In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import joblib

In [2]:
import os
# Sanity check: later cells read and write through relative paths
# ("../data/...", "../models/..."), which resolve against this directory.
os.getcwd()

'e:\\ResumeMatcher\\resume_matcher_ai\\notebooks'

In [3]:
# Load the labelled resume / job-description pairs.
df = pd.read_csv("../data/resume_jd_dataset.csv")

# A missing value in either text column would make the concatenation NaN,
# leaving a float in `texts` that the tokenizer cannot process. Treat missing
# text as empty and force everything to `str` before joining.
resume_col = df["resume_text"].fillna("").astype(str)
jd_col = df["jd_text"].fillna("").astype(str)

# One input string per row: resume first, then the job description.
texts = (resume_col + " " + jd_col).tolist()
labels = df["label"].values

In [4]:
# Vocabulary cap and fixed sequence length; the model cell reuses both.
MAX_LEN = 300
MAX_WORDS = 10_000

# One-hot encode the labels for the 3-way softmax / categorical cross-entropy.
y = to_categorical(labels, num_classes=3)

# NOTE(review): the tokenizer is fit on every text, i.e. before the
# train/test split in the next cell, so held-out rows contribute to the
# vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_WORDS)
tokenizer.fit_on_texts(texts)

# Integer-encode each text, then pad/truncate to exactly MAX_LEN tokens
# (pad_sequences is left on its default padding/truncating settings).
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=MAX_LEN)

In [5]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [6]:
# Embedding -> LSTM -> dense head that outputs a 3-class softmax.
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# Keras 3 and the sequence length is supplied via `model.build` below instead.
model = Sequential([
    Embedding(MAX_WORDS, 128),
    LSTM(128),
    Dense(64, activation="relu"),
    Dense(3, activation="softmax"),
])

# Categorical cross-entropy matches the one-hot targets from to_categorical.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Build with the padded input shape (batch, MAX_LEN) so that summary() can
# report real output shapes and parameter counts instead of an unbuilt model.
model.build(input_shape=(None, MAX_LEN))
model.summary()



In [7]:
# Training hyper-parameters, named so they are easy to find and tune.
EPOCHS = 5
BATCH_SIZE = 16

# Keep the History object so the loss/accuracy curves can be inspected later;
# assigning it also stops the bare History repr from becoming the cell output.
# NOTE(review): the held-out split doubles as the validation set here, so the
# val_* metrics are not an independent test estimate if they guide tuning.
history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test, y_test),
)

Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.2500 - loss: 1.1022 - val_accuracy: 0.0000e+00 - val_loss: 1.0949
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - accuracy: 1.0000 - loss: 1.0823 - val_accuracy: 0.0000e+00 - val_loss: 1.0939
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - accuracy: 1.0000 - loss: 1.0669 - val_accuracy: 0.0000e+00 - val_loss: 1.0948
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 1.0000 - loss: 1.0495 - val_accuracy: 0.0000e+00 - val_loss: 1.0962
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step - accuracy: 1.0000 - loss: 1.0318 - val_accuracy: 0.0000e+00 - val_loss: 1.0974


<keras.src.callbacks.history.History at 0x1c240c6e0d0>

In [8]:
# Make sure the output directory exists; both writes below fail on a fresh
# checkout that has no ../models folder yet.
os.makedirs("../models", exist_ok=True)

# Persist the trained network and the fitted tokenizer side by side, since
# inference needs the same word index that was used for training.
# NOTE(review): ".h5" is kept for compatibility with existing consumers of
# this path; newer Keras releases favour the native ".keras" format.
model.save("../models/lstm_match_model.h5")
# The tokenizer artifact is a pickle: only load it from a trusted location.
joblib.dump(tokenizer, "../models/tokenizer.pkl")



['../models/tokenizer.pkl']