In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import joblib
import numpy as np

In [5]:
path = 'drive/MyDrive/capstone/raw_data/jobs3.csv'
df = pd.read_csv(path)


In [7]:
# Function to process data in batches
def process_batch(data_batch, tfidf, svd):
    tfidf_matrix = tfidf.fit_transform(data_batch['job_skills'])  # Sparse matrix
    tfidf_reduced = svd.fit_transform(tfidf_matrix)
    return tfidf, svd, tfidf_reduced

# Reading data in batches and processing it
def build_model_and_reduce(file_path, batch_size=10000):
    # Initialize TfidfVectorizer and TruncatedSVD
    tfidf = TfidfVectorizer(stop_words='english')
    svd = TruncatedSVD(n_components=2000)

    chunk_iter = pd.read_csv(file_path, chunksize=batch_size)
    tfidf_matrices = []

    for i, data_batch in enumerate(chunk_iter):
        print(f"Processing batch {i + 1}")
        data_batch['job_skills'] = data_batch['job_skills'].apply(lambda x: x.lower())

        tfidf, svd, tfidf_reduced = process_batch(data_batch, tfidf, svd)
        tfidf_matrices.append(tfidf_reduced)

    # Concatenate all reduced matrices
    tfidf_matrix_reduced = np.vstack(tfidf_matrices)

    return tfidf, svd, tfidf_matrix_reduced

# Path to your CSV file
file_path = path

# Build the models and reduce the TF-IDF matrix
tfidf, svd, tfidf_matrix_reduced = build_model_and_reduce(file_path)

# Save the models and the reduced TF-IDF matrix
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
joblib.dump(svd, 'svd_model.pkl')
joblib.dump(tfidf_matrix_reduced, 'tfidf_matrix_reduced.pkl')

print("TF-IDF vectorizer, SVD model, and reduced TF-IDF matrix saved.")

Processing batch 1
Processing batch 2
Processing batch 3
Processing batch 4
Processing batch 5
Processing batch 6
Processing batch 7
Processing batch 8
Processing batch 9
Processing batch 10
TF-IDF vectorizer, SVD model, and reduced TF-IDF matrix saved.


In [8]:
var_explained = svd.explained_variance_ratio_.sum()
print(var_explained)

0.8165697613143268


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import average_precision_score

In [17]:
# Memuat model yang telah disimpan
tfidf = joblib.load('tfidf_vectorizer.pkl')
svd = joblib.load('svd_model.pkl')
tfidf_matrix_reduced = joblib.load('tfidf_matrix_reduced.pkl')

job_data = pd.read_csv(path)

def preprocess_text(text):
    # Implement your preprocessing steps here
    return text.lower()

def recommend_jobs(input_skills, top_n=10):
    input_skills = preprocess_text(input_skills)
    input_tfidf = tfidf.transform([input_skills])
    input_svd = svd.transform(input_tfidf)
    cosine_similarities = cosine_similarity(input_svd, tfidf_matrix_reduced).flatten()
    related_jobs_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return job_data.iloc[related_jobs_indices]

In [33]:
input_skills = "python"
recommended_jobs = recommend_jobs(input_skills)
print(recommended_jobs)

                                                job_link  \
93262  https://www.linkedin.com/jobs/view/refactor-ex...   
96407  https://www.linkedin.com/jobs/view/senior-soft...   
96141  https://www.linkedin.com/jobs/view/senior-back...   
91002  https://ca.linkedin.com/jobs/view/firmware-des...   
94588  https://au.linkedin.com/jobs/view/staff-data-e...   
94783  https://www.linkedin.com/jobs/view/physical-sc...   
98460  https://www.linkedin.com/jobs/view/cybersecuri...   
96647  https://www.linkedin.com/jobs/view/senior-soft...   
93352  https://www.linkedin.com/jobs/view/senior-soft...   
97170  https://www.linkedin.com/jobs/view/engineering...   

                 last_processed_time got_summary got_ner is_being_worked  \
93262  2024-01-19 09:45:09.215838+00           t       t               f   
96407  2024-01-19 09:45:09.215838+00           t       t               f   
96141  2024-01-20 00:54:30.104212+00           t       t               f   
91002  2024-01-19 09:45:09.215838+0

In [26]:
def mean_average_precision_at_k(y_true, y_scores, k=10):
    """
    Calculate mean average precision at k.
    """
    average_precision = []
    for true, scores in zip(y_true, y_scores):
        # Get indices of top k scores
        top_k_indices = np.argsort(scores)[::-1][:k]
        relevant = np.isin(top_k_indices, np.where(true == 1)[0]).astype(int)
        precision_at_k = np.cumsum(relevant) / (np.arange(k) + 1)
        average_precision.append(np.sum(precision_at_k * relevant) / min(k, np.sum(true)))
    return np.mean(average_precision)

def evaluate_model(test_data, top_n=10):
    """
    Evaluate the model using test data and calculate the MAP score.
    """
    y_true = []
    y_scores = []

    for _, row in test_data.iterrows():
        input_skills = preprocess_text(row['job_skills'])
        input_tfidf = tfidf.transform([input_skills])
        input_svd = svd.transform(input_tfidf)
        cosine_similarities = cosine_similarity(input_svd, tfidf_matrix_reduced).flatten()

        # Generate ground truth and scores
        true_labels = np.zeros(tfidf_matrix_reduced.shape[0])
        job_indices = job_data.index[job_data['job_title'] == row['job_title']].tolist()
        true_labels[job_indices] = 1
        y_true.append(true_labels)
        y_scores.append(cosine_similarities)

    map_score = mean_average_precision_at_k(y_true, y_scores, k=top_n)
    return map_score

In [31]:
test_data = df.sample(n= 100, random_state= 42)

In [32]:
map_score = evaluate_model(test_data, top_n=10)
print(f'Mean Average Precision (MAP) Score: {map_score}')

Mean Average Precision (MAP) Score: 0.09493796296296296
