In [1]:
import sys
import os

# Treat the parent of the current working directory as the project root and
# put it at the front of sys.path (presumably so the project-local `scripts`
# package imported in later cells resolves).
project_root = os.path.normpath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [2]:
from scripts.mongo_ingestion import MongoDBClient
from scripts.config import MONGO_URI, DB_NAME
import pandas as pd


# Fields requested from each job posting document.
JOB_FIELDS = {'_id': 1, 'jobPosition': 1, 'jobDescList': 1, 'jobQualificationsList': 1}
# Number of postings to fetch; kept small for this exploratory notebook.
SAMPLE_LIMIT = 10

# Hold the MongoDB connection open only for the duration of the fetch.
with MongoDBClient(mongo_uri=MONGO_URI, dbname=DB_NAME) as client:
    client.ping()  # connectivity check; the result is logged (see cell output)
    docs = client.find('jobs', projection=JOB_FIELDS, limit=SAMPLE_LIMIT)
    # Build the frame while the client is still open, so this keeps working
    # even if `find` returns a lazy cursor rather than a list.
    # NOTE(review): confirm the return type of MongoDBClient.find.
    df = pd.DataFrame(docs)

  from .autonotebook import tqdm as notebook_tqdm
2025-05-26 11:27:09,482 - INFO - MongoDB Atlas ping successfully.


In [3]:
# Preview the fetched postings (the fetch above caps the result at 10 documents).
df

Unnamed: 0,_id,jobPosition,jobDescList,jobQualificationsList
0,6831c4b35ff150584a2a66d6,Back End Developer,"[Utilize languages like Go, PHP/Laravel, and P...",[Minimum Bachelor of Science in computer scien...
1,6831c4b35ff150584a2a66d7,(Freelance) Sales Retail Mobile/Canvasser,[Retail Sales Mobile akan bertanggung jawab un...,"[Pendidikan minimal SMA/SMK/sederajat., Memili..."
2,6831c4b35ff150584a2a66d8,Creative Lead,"[Membuat laporan weekly & monthly, Memberikan ...",[Pendidikan minimal S1 Ilmu Komunikasi / Marke...
3,6831c4b35ff150584a2a66d9,Social Media Specialist,[We are looking for a Social Media Specialist ...,"[""Bachelors degree in Marketing, Communication..."
4,6831c4b35ff150584a2a66da,HRGA Staff area Padalarang Kabupaten Bandung B...,[Membantu perekrutan dan pengelolaan operasion...,"[Pengalaman HRGA minimal 1 Tahun, Memiliki kem..."
5,6831c4b35ff150584a2a66db,Backend Engineer,"[To build, maintain, and enhance our validator...",[years of experience in coding in at least one...
6,6831c4b35ff150584a2a66dc,Software Engineer in Test,"[Review requirements, specifications and techn...",[Must have experience web automation with Java...
7,6831c4b35ff150584a2a66dd,Product Manager,[Facilitate effective communication and collab...,[A bachelor’s degree or higher in a relevant f...
8,6831c4b35ff150584a2a66de,Digital Strategist,[Develop and refine digital marketing strategi...,[Minimum 3 years experience as a Digital Strat...
9,6831c4b35ff150584a2a66df,Art Director,"[Sebagai, Anda akan bertanggung jawab untuk me...","[Gelar sarjana di bidang desain grafis, komuni..."


In [4]:
from scripts.text_preprocessor import TextPreprocessor

In [5]:
# Helper object whose methods turn the postings frame into the lists used below.
text_preprocessor = TextPreprocessor()
# Posting ids taken from df; the output shows them as plain strings. They are
# reused later as the document ids when adding texts to ChromaDB.
ids = text_preprocessor.create_id_list(df)
ids

['6831c4b35ff150584a2a66d6',
 '6831c4b35ff150584a2a66d7',
 '6831c4b35ff150584a2a66d8',
 '6831c4b35ff150584a2a66d9',
 '6831c4b35ff150584a2a66da',
 '6831c4b35ff150584a2a66db',
 '6831c4b35ff150584a2a66dc',
 '6831c4b35ff150584a2a66dd',
 '6831c4b35ff150584a2a66de',
 '6831c4b35ff150584a2a66df']

In [6]:
# Job titles taken from df; the output lists them in the same order as `ids`.
job_titles = text_preprocessor.create_job_title_list(df)
job_titles

['Back End Developer',
 '(Freelance) Sales Retail Mobile/Canvasser',
 'Creative Lead',
 'Social Media Specialist',
 'HRGA Staff area Padalarang Kabupaten Bandung Barat',
 'Backend Engineer',
 'Software Engineer in Test',
 'Product Manager',
 'Digital Strategist',
 'Art Director']

In [7]:
# One text per posting; per the output each has the form
# "Jobdesk: ... Requirements: ...".
job_texts = text_preprocessor.create_job_texts(df)
job_texts

['Jobdesk: Utilize languages like Go, PHP/Laravel, and Python for backend development and maintenance., Manage SQL (MySQL, PostgreSQL) and NoSQL (MongoDB) databases for data storage and retrieval., Implement and manage messaging systems such as Kafka, RabbitMQ, and Redis for enhanced performance., Design and implement APIs for effective frontend-backend communication., Regularly use Git for code versioning and collaborative development., Apply clean code architecture practices to ensure code quality and maintainability., Use object-oriented programming techniques for modular and scalable code development., Understand and implement Entity Relationship Diagrams and database normalization for efficient database design.. Requirements: Minimum Bachelor of Science in computer science, software engineering, programming, or equivalent, Proficiency with languages such as\\xa0Go, PHP/Laravel, and Python, Understanding\\xa0\\xa0Entity Relationship Diagram and normalization, Experience in SQL (my 

In [8]:
from scripts.chroma_connection import ChromaDBClient

# Collection for the job titles
job_titles_client = ChromaDBClient(collection_name="job_titles")

# Add the titles to the collection, using the posting ids as document ids
print("Adding job titles to ChromaDB...")
job_titles_client.add_texts(ids=ids, texts=job_titles)

print("Data added to ChromaDB collection.")
# Sanity check: query "backend" and request as many results as df has rows.
job_titles_client.query("backend", n_results=len(df))


2025-05-26 11:28:18,585 - INFO - Use pytorch device_name: cpu
2025-05-26 11:28:18,590 - INFO - Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


Adding job titles to ChromaDB...


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.48it/s]


Data added to ChromaDB collection.


Batches: 100%|██████████| 1/1 [00:00<00:00, 15.65it/s]


Unnamed: 0,id,document,distance
0,6831c4b35ff150584a2a66db,Backend Engineer,12.984494
1,6831c4b35ff150584a2a66d6,Back End Developer,14.726101
2,6831c4b35ff150584a2a66da,HRGA Staff area Padalarang Kabupaten Bandung B...,36.943668
3,6831c4b35ff150584a2a66d8,Creative Lead,39.072746
4,6831c4b35ff150584a2a66d7,(Freelance) Sales Retail Mobile/Canvasser,39.772221
5,6831c4b35ff150584a2a66de,Digital Strategist,44.12611
6,6831c4b35ff150584a2a66dd,Product Manager,48.24807
7,6831c4b35ff150584a2a66dc,Software Engineer in Test,49.729469
8,6831c4b35ff150584a2a66d9,Social Media Specialist,50.21035
9,6831c4b35ff150584a2a66df,Art Director,51.373032


In [9]:
# Keep the title-based query result (columns id, document, distance, as shown
# in the previous cell's output) for the weighted ranking further down.
df1 = job_titles_client.query("backend", n_results=len(df))

Batches: 100%|██████████| 1/1 [00:00<00:00, 17.10it/s]


In [10]:
# Second collection: the combined job description + requirements text of each
# posting, stored under the same ids as the titles.
job_desc_requirements_client = ChromaDBClient(collection_name="jobdesk_requirements")

# The progress message names what is actually being added in this cell
# (description/requirements texts, not titles).
print("Adding job descriptions and requirements to ChromaDB...")
job_desc_requirements_client.add_texts(ids=ids, texts=job_texts)

print("Data added to ChromaDB collection.")
# Sanity check: query "backend" and request as many results as df has rows.
job_desc_requirements_client.query("backend", n_results=len(df))

2025-05-26 11:28:35,097 - INFO - Use pytorch device_name: cpu
2025-05-26 11:28:35,101 - INFO - Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


Adding job titles to ChromaDB...


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]


Data added to ChromaDB collection.


Batches: 100%|██████████| 1/1 [00:00<00:00, 12.71it/s]


Unnamed: 0,id,document,distance
0,6831c4b35ff150584a2a66d6,"Jobdesk: Utilize languages like Go, PHP/Larave...",24.098253
1,6831c4b35ff150584a2a66db,"Jobdesk: To build, maintain, and enhance our v...",27.833963
2,6831c4b35ff150584a2a66dd,Jobdesk: Facilitate effective communication an...,28.666151
3,6831c4b35ff150584a2a66da,Jobdesk: Membantu perekrutan dan pengelolaan o...,29.108858
4,6831c4b35ff150584a2a66df,"Jobdesk: Sebagai, Anda akan bertanggung jawab ...",30.61776
5,6831c4b35ff150584a2a66d8,"Jobdesk: Membuat laporan weekly & monthly, Mem...",30.765408
6,6831c4b35ff150584a2a66dc,"Jobdesk: Review requirements, specifications a...",31.251518
7,6831c4b35ff150584a2a66de,Jobdesk: Develop and refine digital marketing ...,31.336758
8,6831c4b35ff150584a2a66d7,Jobdesk: Retail Sales Mobile akan bertanggung ...,32.796844
9,6831c4b35ff150584a2a66d9,Jobdesk: We are looking for a Social Media Spe...,36.057549


In [11]:
# Keep the description/requirements-based query result (columns id, document,
# distance, as shown in the previous cell's output) for the weighted ranking.
df2 = job_desc_requirements_client.query("backend", n_results=len(df))

Batches: 100%|██████████| 1/1 [00:00<00:00,  4.74it/s]


In [None]:
from scripts.top_similarity import TopSimilarity

top_similarity = TopSimilarity()  # project helper that combines the two rankings
# Merge the title-based (df1) and description-based (df2) query results into a
# single list of ids. NOTE(review): the weights and the number of ids returned
# are decided inside TopSimilarity and are not visible in this notebook.
similarity_ids = top_similarity.weighted_similarity(df1, df2)

In [None]:
# Ids from the weighted ranking, ordered by similarity; the output shows them
# as ObjectId values (presumably most similar first; confirm in TopSimilarity).
similarity_ids

[ObjectId('6831c4b35ff150584a2a66db'),
 ObjectId('6831c4b35ff150584a2a66d6'),
 ObjectId('6831c4b35ff150584a2a66dd'),
 ObjectId('6831c4b35ff150584a2a66da'),
 ObjectId('6831c4b35ff150584a2a66df')]