## Environment Initialization

In [None]:
import pandas as pd

# Read the job-description dataset from a CSV located next to the notebook
DATASET_PATH = "./job_descriptions.csv"
df = pd.read_csv(DATASET_PATH)

## Data Preparation

In [None]:
import data_cleaning as dc

from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer

# Clean the text data
# NOTE(review): this overwrites the raw column in place, so re-running the cell feeds
# already-cleaned text back through dc.preprocess_document — reload df before re-running.
lemmatizer = WordNetLemmatizer()
df['Job Description'] = df['Job Description'].apply(lambda x: dc.preprocess_document(x, lemmatizer))

# Prepare the targets: MultiLabelBinarizer turns each row's skills into a binary indicator row
# (assumes dc.preprocess_skills returns an iterable of skill labels per row — TODO confirm)
df['skills'] = df['skills'].apply(dc.preprocess_skills)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['skills'])

# Split the cleaned text BEFORE vectorizing, so the TF-IDF vocabulary and IDF weights
# are learned from the training rows only. Fitting the vectorizer on the full corpus
# leaks test-set statistics into the features and inflates the evaluation.
# The sample count and random_state are unchanged, so the row assignment to each split
# is the same as when the split was applied to the vectorized matrix.
docs_train, docs_test, y_train, y_test = train_test_split(
    df['Job Description'], y, test_size=0.2, random_state=42
)

# Prepare the features: fit on the training documents, then reuse that fitted
# vocabulary for the held-out documents
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=5,
                                   max_features=1000,
                                   ngram_range=(1, 3))
X_train = tfidf_vectorizer.fit_transform(docs_train)
X_test = tfidf_vectorizer.transform(docs_test)

# Full-corpus matrix expressed in the train-fitted feature space. Kept only so that
# `X` remains defined for any later cell that expects it; safe to drop if unused.
X = tfidf_vectorizer.transform(df['Job Description'])

## Model Tuning

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Base KNN estimator: brute-force neighbor search with the cosine metric
model = KNeighborsClassifier(metric='cosine', algorithm='brute')

# Candidate values for k (odd numbers from 3 through 11) and the number of CV folds
param_grid = {'n_neighbors': list(range(3, 12, 2))}
folds = 5

# Cross-validated search over k, parallelized across all available cores.
# y_train is a multilabel indicator matrix (built by MultiLabelBinarizer), for which
# scikit-learn's 'accuracy' is exact-match (subset) accuracy: a sample only counts
# as correct when every label is predicted correctly.
# error_score='raise' makes a failing fit abort the search instead of being scored.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=folds,
    n_jobs=-1,
    verbose=2,
    error_score='raise',
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Pull out the winning k; it is reused when the final model is built
best_k = grid_search.best_params_['n_neighbors']

# Report the selected k and its mean cross-validation score
print("Best number of neighbors:", best_k)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

In [None]:
# Build the final classifier with the number of neighbors (k) selected by the grid search,
# keeping the same brute-force / cosine configuration that was tuned
final_knn_params = {'n_neighbors': best_k, 'algorithm': 'brute', 'metric': 'cosine'}
knn_model = KNeighborsClassifier(**final_knn_params)

# Train on the full training split (left as the cell's last expression so the
# fitted estimator is displayed)
knn_model.fit(X_train, y_train)

## Model Creation & Evaluation

In [None]:
from sklearn.metrics import classification_report

# Predict the labels for the test set
y_pred = knn_model.predict(X_test)

# Label each report row with the skill it represents instead of an opaque column index.
# mlb.classes_ is in the same order as the columns of y, so names line up with columns.
skill_names = [str(label) for label in mlb.classes_]

# Generate the scoring metric report.
# zero_division=0 keeps the default numeric result (0) for skills that are never
# predicted or never present, but without emitting an UndefinedMetricWarning per label.
report = classification_report(
    y_test,
    y_pred,
    target_names=skill_names,
    zero_division=0,
)

print(report)

## Custom Examples

In [None]:
import model_util as mu

# Hand-written job descriptions used as a quick sanity check of the recommender
job_descriptions = [
    "Seeking a financial accountant to join our auditing department.",
    "Urgently hiring a educational specialist in early-childhood development and special education curriculums.",
    "Looking for a skilled backend software developer with experience in networks and database design, and object-oriented programming in Java, Python, etc.",
]

# Run every sample through the fitted vectorizer, label binarizer and KNN model,
# printing the input followed by the skills returned for it
for sample_text in job_descriptions:
    print("Job description:", sample_text)
    recommended_skills = mu.model_get_skills(sample_text, tfidf_vectorizer, mlb, knn_model)
    # A blank line after each result separates consecutive examples
    print("Recommended skills:", recommended_skills, end="\n\n")