## 📦 Importing Libraries  
We begin by importing essential libraries for text preprocessing, stemming, and creating an inverted index.


In [None]:
import re  # for text cleaning
from nltk.stem.porter import PorterStemmer  # for stemming (getting root words)
from collections import defaultdict  # better version of dictionary
import pandas as pd

## 📋 Sample Job Descriptions  
Here’s a sample dataset of job descriptions we’ll use to test our search engine.


In [None]:
# Load the job postings. Later cells read the "id", "title", "description"
# and "category" columns of this DataFrame.
df = pd.read_csv("sample_job_dataset.csv")
# Preview the first rows to check that the file loaded as expected.
df.head()


## 🧹 Preprocessing Function  
This function cleans and stems the job description text to normalize it for matching.


In [None]:
# One Porter stemmer instance, shared by every call to preprocess()
stemmer = PorterStemmer()


def preprocess(text):
    """Turn *text* into a list of lowercase, stemmed word tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not an ASCII letter or whitespace;
      3. split on whitespace;
      4. reduce each word to its Porter stem.

    Characters are deleted, not replaced by a space, so "front-end" is
    tokenized as the single word "frontend", and digits vanish entirely.

    Returns the stems in their original order (duplicates kept); text
    containing no letters yields an empty list.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    return [stemmer.stem(token) for token in letters_only.split()]


## 📚 Building the Inverted Index  
We create an inverted index to map each word to the job descriptions that contain it.


In [None]:
# Inverted index: stemmed word -> set of IDs of the jobs whose description
# contains that word (a set, so repeated words in one description count once)
inverted_index = defaultdict(set)

# Walk the "id" and "description" columns in lockstep, one job per step
for job_id, description in zip(df["id"], df["description"]):
    for stem in preprocess(description):
        inverted_index[stem].add(job_id)

## 🔍 Keyword-Based Job Search  
This function searches for relevant jobs using keyword-based matching from the inverted index.



In [None]:
def search(query, top_k=3):
    """Print the jobs that best match *query* by keyword overlap.

    The query is normalized with ``preprocess`` and each resulting stem is
    looked up in ``inverted_index``. A job earns one point for every query
    stem found in its description; a stem repeated in the query is counted
    each time it appears. Jobs are listed from highest to lowest score, and
    jobs with equal scores keep the order in which they were first scored.

    Args:
        query: Free-text search string entered by the user.
        top_k: Maximum number of jobs to print. Defaults to 3.

    Returns:
        None. Results (or "No matching jobs found.") are printed.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    job_scores = defaultdict(int)  # job ID -> number of matching query stems
    for word in preprocess(query):
        # .get() (rather than indexing) avoids inserting an empty entry into
        # the defaultdict-based index for words that no job contains.
        for job_id in inverted_index.get(word, ()):
            job_scores[job_id] += 1

    if not job_scores:
        print("No matching jobs found.")
        return

    # Most relevant first; sorted() is stable, so ties keep insertion order
    ranked_jobs = sorted(job_scores.items(), key=lambda item: item[1], reverse=True)

    print("Top matching jobs:\n")
    for job_id, score in ranked_jobs[:top_k]:
        job = df[df["id"] == job_id].iloc[0]  # first row carrying this job ID
        print(f"🔹 {job['title']} (ID: {job_id}) — Match Score: {score}")
        print(f"📝 Description: {job['description']}\n")

## 🧑‍💻 Run a Keyword-Based Search  
The user enters a query, and the system prints the best-matching jobs, ranked by how many query keywords appear in each description.


In [None]:
# Prompt for search keywords, then let search() print the best keyword
# matches found through the inverted index.
user_query = input("Enter job keywords (e.g., 'SQL Developer'): ")
search(user_query)


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [None]:
# Turn every job description into a TF-IDF feature vector (one row per job).
# NOTE(review): the vectorizer is fitted on all rows of df, including the rows
# a later cell holds out for testing — confirm this is intended.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["description"])


In [None]:
# Target labels for the classifier: the "category" column, one label per job.
y = df["category"]


In [None]:
# Split the raw descriptions rather than the pre-computed matrix X, then fit
# the TF-IDF vectorizer on the training rows only. Fitting it on the whole
# dataset lets vocabulary and IDF statistics from the held-out rows leak into
# the features, which makes the reported test accuracy optimistic.
docs_train, docs_test, y_train, y_test = train_test_split(
    df["description"], y, test_size=0.2, random_state=42
)
X_train = vectorizer.fit_transform(docs_train)  # learn vocabulary/IDF from training text
X_test = vectorizer.transform(docs_test)  # reuse the fitted vocabulary; no refit


In [None]:
# Train a logistic-regression classifier (library default settings) that
# predicts a job's category from its training feature vector.
model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
# Predict categories for the held-out test rows and report how the
# predictions compare with the true labels.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


In [None]:
# Read a free-text query, convert it to features with the already-fitted
# `vectorizer` (transform only, no refit), and predict its job category.
# NOTE(review): this relies on `vectorizer` still being the vectorizer the
# model was trained with. A later cell rebinds that name to a CountVectorizer,
# so re-running this cell after that one would feed the model features from a
# different vectorizer.
user_query = input("Enter your job-related query: ")
query_vec = vectorizer.transform([user_query])  # single-document batch
predicted_category = model.predict(query_vec)[0]  # first (only) prediction
print(f"\nPredicted Category: {predicted_category}")


In [None]:
print(f"\nTop jobs in '{predicted_category}' category:\n")

# Every job whose category equals the predicted one (no ranking or limit)
matches = df[df["category"] == predicted_category]

# Read the three displayed columns in lockstep, one job per step
for title, job_id, description in zip(matches["title"], matches["id"], matches["description"]):
    print(f"🔹 {title} (ID: {job_id})")
    print(f"📝 Description: {description}\n")


In [None]:
# Normalize every job description with preprocess() and re-join its stems
# into one space-separated string per job
processed_descriptions = [" ".join(stems) for stems in map(preprocess, df['description'])]


In [None]:
# Read the search keywords and normalize them with preprocess(), then re-join
# the stems with spaces so the query has the same form as the processed job
# descriptions.
user_query = input("Enter job keywords (e.g., 'SQL Developer'): ")
processed_query = " ".join(preprocess(user_query))


In [None]:
# Corpus = every processed job description, followed by the processed query
# as the final item
corpus = [*processed_descriptions, processed_query]


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build word-count vectors for the whole corpus; because the query was
# appended last, it is the final row of `vectors`.
# NOTE(review): this rebinds `vectorizer`, replacing the TfidfVectorizer that
# the category-prediction cell uses — a distinct name would keep the two
# workflows independent when cells are re-run out of order.
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(corpus)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# The query is the last row of `vectors`; every row before it is a job
query_vector = vectors[-1]
job_vectors = vectors[:-1]

# Cosine similarity of the query against each job, flattened to one score per job
similarity_scores = cosine_similarity(query_vector, job_vectors).flatten()


In [None]:
# Positions of up to three highest similarity scores, best first
# (argsort orders ascending, so reverse it before taking the first three).
top_indices = similarity_scores.argsort()[::-1][:3]


In [None]:
# Keep only the jobs that actually share vocabulary with the query. When the
# query has no word in common with any description, every similarity is 0 and
# the "top" indices are arbitrary, so listing them as matches would mislead.
matching_indices = [idx for idx in top_indices if similarity_scores[idx] > 0]

if not matching_indices:
    # Same message the keyword-based search() prints when nothing matches
    print("No matching jobs found.")
else:
    print("\nTop matching jobs:\n")
    for idx in matching_indices:
        job = df.iloc[idx]  # similarity row idx corresponds to row idx of df
        score = similarity_scores[idx]
        print(f"🔹 {job['title']} (ID: {job['id']}) — Similarity Score: {round(score, 2)}")
        print(f"📝 Description: {job['description']}\n")