In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import faiss
import joblib
import numpy as np

In [None]:
path = 'drive/MyDrive/capstone/raw_data/jobs3.csv'
df = pd.read_csv(path)
path2 = 'drive/MyDrive/capstone/raw_data/jobs2.csv'
df2 = pd.read_csv(path2)
df

Unnamed: 0,job_link,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills
0,https://www.linkedin.com/jobs/view/events-mana...,2024-01-20 10:12:25.028788+00,t,t,f,Events Manager,State of Wisconsin,"Madison, WI",2024-01-14,New Glarus,United States,Vendor,Mid senior,Onsite,"Event Management, Hospitality Management, Publ..."
1,https://www.linkedin.com/jobs/view/general-sup...,2024-01-19 09:45:09.215838+00,t,t,f,General Superintendent - Lead Field Ops at a G...,Michael Page,"Milwaukee, WI",2024-01-13,Milwaukee,United States,Field Contractor,Mid senior,Onsite,"Construction Management, Construction Superint..."
2,https://www.linkedin.com/jobs/view/grill-cook-...,2024-01-19 09:45:09.215838+00,t,t,f,Grill Cook,"Bob Evans Restaurants, LLC","Princeton, IN",2024-01-13,Indiana,United States,Cook Apprentice,Associate,Onsite,"Restaurant experience, Knowledge of cooking pr..."
3,https://www.linkedin.com/jobs/view/billing-tea...,2024-01-21 06:22:55.474052+00,t,t,f,Billing Team Lead,Leddy Group,"Manchester, NH",2024-01-14,Derry,United States,Supervisor,Associate,Onsite,"Medical billing, Collections, Account Receivab..."
4,https://www.linkedin.com/jobs/view/rn-ccu-at-c...,2024-01-21 03:34:01.590108+00,t,t,f,RN - CCU at Community Health Systems,Health eCareers,"Hattiesburg, MS",2024-01-14,Magnolia,United States,Nurse Staff Community Health,Mid senior,Onsite,"Nursing, Healthcare, Patient Care, EMR, Commun..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,https://www.linkedin.com/jobs/view/solar-sales...,2024-01-19 13:01:07.618251+00,t,t,f,Solar Sales Consultant,"V3 Electric, Inc","Fairfield, CA",2024-01-14,Antioch,United States,Sales Agent Insurance,Mid senior,Onsite,"Sales, Customer service, Communication, Presen..."
99996,https://ca.linkedin.com/jobs/view/sp%C3%A9cial...,2024-01-20 02:42:18.802659+00,t,t,f,Spécialiste en procédés administratifs - Respo...,Centre universitaire de santé McGill | McGill ...,"Montreal, Quebec, Canada",2024-01-16,Cap-de-la-Madeleine,Canada,Intelligence Specialist,Mid senior,Onsite,"Data protection, Data management, HIPAA, Data ..."
99997,https://uk.linkedin.com/jobs/view/b2b-customer...,2024-01-19 09:45:09.215838+00,t,t,f,B2B Customer Service / Account Support. Global...,Recruitment Revolution,"Caerphilly, Wales, United Kingdom",2024-01-13,Cardiff,United Kingdom,Customer Services Coordinator,Mid senior,Onsite,"Customer Service, Account Management, Sales, C..."
99998,https://www.linkedin.com/jobs/view/operations-...,2024-01-20 00:44:02.716032+00,t,t,f,OPERATIONS ASSISTANT MANAGER,Dollar Tree Stores,"Marion, IA",2024-01-14,Amana Colonies,United States,Set-Key Driver,Mid senior,Onsite,"Retail, Management, Communication, Interperson..."


In [None]:
# Menginisialisasi TfidfVectorizer dan TruncatedSVD
tfidf = TfidfVectorizer(stop_words='english')
svd = TruncatedSVD(n_components=2000)  # Misalnya, kita memilih 100 komponen, max_df=0.8, min_df=2, ngram_range=(1, 2)

# Fungsi untuk memproses data dalam batch
def process_batch(data_batch):
    tfidf_matrix = tfidf.fit_transform(data_batch['job_skills'])  # Sparse matrix
    tfidf_reduced = svd.fit_transform(tfidf_matrix)
    return tfidf_reduced

# Membaca data dalam batch dan mengisi FAISS index
def build_faiss_index(file_path, batch_size=10000):
    index = None
    chunk_iter = pd.read_csv(file_path, chunksize=batch_size)

    for i, data_batch in enumerate(chunk_iter):
        print(f"Processing batch {i + 1}")
        data_batch['job_skills'] = data_batch['job_skills'].fillna('')

        # Proses batch
        tfidf_reduced = process_batch(data_batch)

        if index is None:
            d = tfidf_reduced.shape[1]
            index = faiss.IndexFlatL2(d)  # Membuat indeks FAISS jika belum ada

        index.add(tfidf_reduced)

    return index

# Memuat dan memproses data
file_path = path
faiss_index = build_faiss_index(path)

# Menyimpan model
joblib.dump(tfidf, 'tfidf_model.pkl')
joblib.dump(svd, 'svd_model.pkl')
faiss.write_index(faiss_index, 'faiss_index.idx')

Processing batch 1
Processing batch 2
Processing batch 3
Processing batch 4
Processing batch 5
Processing batch 6
Processing batch 7
Processing batch 8
Processing batch 9
Processing batch 10


In [None]:
var_explained = svd.explained_variance_ratio_.sum()
print(var_explained)

0.8165644714173347


In [None]:
# Memuat model yang telah disimpan
tfidf = joblib.load('tfidf_model.pkl')
svd = joblib.load('svd_model.pkl')
faiss_index = faiss.read_index('faiss_index.idx')

# Fungsi untuk mendapatkan rekomendasi
def get_recommendations(index, tfidf, svd, skills_input, n_recommendations=10):
    skills_tfidf = tfidf.transform([skills_input])
    skills_reduced = svd.transform(skills_tfidf)

    distances, indices = index.search(skills_reduced, n_recommendations)

    # Memuat kembali data untuk mendapatkan nama pekerjaan

    return df['job_title'].iloc[indices[0]]

In [None]:
input_skills = "physics"
recommended_jobs = get_recommendations(faiss_index, tfidf, svd, input_skills)
print(recommended_jobs)

90198                           Oncology Physicist
93136    Tenure-Track Assistant Professor, Physics
96195             Oncology Physicist - hybrid role
89870              Medical Technologist Generalist
97799                            Senior VFX Artist
74087                      Production Area Manager
48096          Retrofit Coordinator in Bristol, UK
24625      Head Coach Junior High Girls Volleyball
94027          Senior Product Development Engineer
93083      Principal Process Engineer - Wet Cleans
Name: job_title, dtype: object


In [None]:
# Fungsi untuk menghitung Average Precision
def average_precision_at_k(actual, predicted, k):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

# Fungsi untuk menghitung MAP
def mean_average_precision(test_data, n_recommendations=10):
    scores = []

    for _, row in test_data.iterrows():
        input_skills = row["job_skills"]
        actual_jobs = [row["job_title"]]

        recommended_jobs = get_recommendations(faiss_index, tfidf, svd, input_skills, n_recommendations)
        score = average_precision_at_k(actual_jobs, recommended_jobs, n_recommendations)
        scores.append(score)

    return np.mean(scores)

In [None]:
test_data = df.sample(n= 100, random_state= 2)

In [3]:
map_score = mean_average_precision(test_data, n_recommendations=10)
print(f"Mean Average Precision: {map_score:.2f}")

NameError: name 'mean_average_precision' is not defined