In [1]:
import pandas as pd

# Load your CSV
df = pd.read_csv("resume_screening_job_dataset.csv")

with open("resume_train.txt", "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        label = "__label__" + row["JobRole"].replace(" ","_")
        skills = row["Skills"].replace(";", " ")  # replace semicolons with spaces
        text = f"{label} {skills}\n"
        f.write(text)

In [2]:
import fasttext

model = fasttext.train_supervised(
    input="resume_train.txt",     # file with "__label__category text"
    epoch=200,                      # 500 is too high â€“ causes overfitting
    lr=0.5,                        # 1.0 is fine but 0.5 is more stable
    wordNgrams=1,                  # better for skill combinations
    dim=100,                       # embedding dimension for better accuracy
    bucket=200000,                 # supports unseen n-grams (important for skills)
    verbose=2
)

model.save_model("resume_fasttext_model.bin")

In [3]:
# Predict job role for a new resume
resume_text = "Python, SQL, NLP, Keras, Tensorflow, pytorch, spacy, nltk, scikit learn"
print(model.predict(resume_text,k=3))

(('__label__NLP_Engineering', '__label__Machine_Learning_Engineering', '__label__DevOps_/_Site_Reliability_Engineering'), array([0.47705701, 0.24004334, 0.10426509]))


In [4]:
# Predict job role for a new resume
resume_text = "CI/CD for ML; ML pipelines; kubeflow; MLflow; model versioning; feature stores; data versioning; docker; kubernetes; monitoring; concept drift detection; model performance tracking; A/B testing; canary releases; automation; orchestration; airflow; terraform basics; cloud ML services"
print(model.predict(resume_text,k=3))

(('__label__MLOps_Engineering', '__label__Machine_Learning_Engineering', '__label__DevOps_/_Site_Reliability_Engineering'), array([0.90727121, 0.0751943 , 0.00522642]))


In [5]:
# Predict job role for a new resume
resume_text = 'agentic ai, git, flask, deep learning, r, transformers, numpy, word2vec, teamwork, keras, html, data analysis, nltk, matplotlib, vs code, prompt engineering, generative ai, scikit-learn, langchain, agno, nlp, rag, gensim, fine tuning, tableau, css, tensorflow, python, genai, pandas, jupyter notebook, streamlit, embeddings, vector databases, chromadb, sql, seaborn, spacy, llms, machine learning, communication, cursor, fasttext'
print(model.predict(resume_text,k=3))

(('__label__Data_Engineering', '__label__Data_Science', '__label__Machine_Learning_Engineering'), array([0.47142759, 0.17958473, 0.12672079]))


In [6]:
model.get_nearest_neighbors('python')

[(0.848450243473053, 'logging'),
 (0.7648621797561646, 'pandas'),
 (0.7633575201034546, 'numpy'),
 (0.7613005638122559, 'scikit'),
 (0.7603680491447449, 'learn'),
 (0.7539741396903992, 'feature'),
 (0.7423085570335388, 'pipelines'),
 (0.7371772527694702, 'cloud'),
 (0.7291809916496277, 'java'),
 (0.7021704316139221, 'microservices')]

In [7]:
model.get_nearest_neighbors('pandas')

[(0.9977405667304993, 'learn'),
 (0.9975672364234924, 'numpy'),
 (0.9972259998321533, 'scikit'),
 (0.8745607733726501, 'feature'),
 (0.8473239541053772, 'validation'),
 (0.8461511731147766, 'MLOps'),
 (0.7918605208396912, 'machine'),
 (0.7682293057441711, 'model'),
 (0.7648622393608093, 'python'),
 (0.7607007622718811, 'batch')]