In [1]:
import os
import ast
import re
import json
from IPython.display import display, HTML
import numpy as np
import pandas as pd
import time
import gc
import torch
import torch.nn.functional as F

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
X_train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge/data/x_train_Meacfjr.csv", delimiter=",", quotechar='"')
y_train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge/data/y_train_SwJNMSu.csv", delimiter=",", quotechar='"')

# Convert 'job_ids' and 'actions' from strings to actual lists
X_train["job_ids"] = X_train["job_ids"].apply(ast.literal_eval)
X_train["actions"] = X_train["actions"].apply(ast.literal_eval)

df_jobs = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge/data/job_features.csv")

df_job_location = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Challenge/data/job_features_location.csv")

In [4]:
for _, row in df_job_location.iterrows():
    df_jobs.loc[df_jobs['job_id'] == row['job_id'], 'location'] = row['location']

print(df_jobs.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21917 entries, 0 to 21916
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_id           21917 non-null  int64 
 1   title_section    21913 non-null  object
 2   job_description  21917 non-null  object
 3   seniority        21897 non-null  object
 4   company          21917 non-null  object
 5   industry         21905 non-null  object
 6   location         21916 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.2+ MB
None


## GPU Requirements

In [5]:
!nvcc --version # find the CUDA driver build above

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [6]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Mar 11 14:14:34 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [7]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


## 1.3 Embedding of the Job Ads

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
# Define features and weights
features = ['title_section', 'location', 'seniority', 'company', 'industry']  # Example with multiple features

In [10]:
# from sentence_transformers import SentenceTransformer

# Load pre-trained sentence transformer model and move it to the GPU if available
# model = SentenceTransformer('all-MiniLM-L6-v2').to(device)  # intfloat/multilingual-e5-large

def compute_job_embeddings(df, features, embedding_dim=384):  # Adjust `embedding_dim` to match your model
    job_embeddings = {feature: [] for feature in features}  # Store embeddings per feature
    mask_vectors = {feature: [] for feature in features}  # Store masks (1 for valid, 0 for missing)

    # Compute all embeddings
    for idx, row in df.iterrows():
        if idx % 1000 == 0:
            print(idx)
        for feature in features:
            if pd.notna(row[feature]) and row[feature] is not None and len(row[feature]) > 1:  # If feature is not missing
                feature_vector = model.encode(str(row[feature]), convert_to_tensor=True, show_progress_bar=False).to(device)
                mask_vectors[feature].append(1)  # Mark as valid
            else:  # If feature is missing
                feature_vector = torch.zeros(embedding_dim, device=device)  # Assign zero vector
                mask_vectors[feature].append(0)  # Mark as missing

            job_embeddings[feature].append(feature_vector)

    # Convert to PyTorch tensors for each feature
    for feature in features:
        job_embeddings[feature] = torch.stack(job_embeddings[feature])
        mask_vectors[feature] = torch.tensor(mask_vectors[feature], dtype=torch.float32, device=device).view(-1, 1)

    # Zero-mean normalization only for valid embeddings
    for feature in features:
        valid_embeddings = job_embeddings[feature] * mask_vectors[feature]  # Apply mask
        mean = valid_embeddings.sum(dim=0) / mask_vectors[feature].sum()  # Compute mean ignoring missing rows
        job_embeddings[feature] = (job_embeddings[feature] - mean) * mask_vectors[feature]  # Normalize only valid rows

    return job_embeddings


In [11]:
# Load or Compute Embeddings
embedding_path = "/content/drive/MyDrive/Colab Notebooks/Challenge/data/job_embeddings.pth"
if os.path.exists(embedding_path):
    print("Loading embeddings from {}".format(embedding_path))
    job_embeddings = torch.load(embedding_path, map_location=device)
else:
    print("Computing embeddings...")
    job_embeddings = compute_job_embeddings(df_jobs, features)
    torch.save(job_embeddings, embedding_path)

# Add embeddings to DataFrame
for feature in features:
    df_jobs[f'embedding_{feature}'] = list(job_embeddings[feature].cpu())

# Create a mapping from job_id to its embeddings (per feature)
job_embedding_dict = {row['job_id']: {feature: row[f'embedding_{feature}'] for feature in features}
                      for _, row in df_jobs.iterrows()}

Loading embeddings from /content/drive/MyDrive/Colab Notebooks/Challenge/data/job_embeddings.pth


  job_embeddings = torch.load(embedding_path, map_location=device)


In [13]:
def preprocess_embeddings(df_jobs, job_embedding_dict):
    """ Convert job embeddings into a single tensor for fast access. """
    job_id_to_idx = {job_id: idx for idx, job_id in enumerate(df_jobs['job_id'].values)}

    job_embeddings_tensor = torch.stack([
        torch.stack([job_embedding_dict[job_id][feature] * feature_weights[feature] for feature in features]).sum(dim=0)
        for job_id in df_jobs['job_id']
    ]).to(torch.float32).to('cuda')  # Shape: (num_jobs, embedding_dim)

    job_ids_tensor = torch.tensor(df_jobs['job_id'].values, dtype=torch.long, device='cuda')  # (num_jobs,)

    return job_embeddings_tensor, job_ids_tensor, job_id_to_idx

def preprocess_sessions(x, job_id_to_idx, action_weights):
    """ Convert session interactions into tensor format for batch processing. """
    max_jobs_per_session = max(len(job_ids) for job_ids in x['job_ids'])

    session_ids = torch.tensor(x['session_id'].values, dtype=torch.long, device='cuda')

    job_indices = torch.full((len(x), max_jobs_per_session), -1, dtype=torch.long, device='cuda')
    action_weights_tensor = torch.zeros((len(x), max_jobs_per_session), dtype=torch.float32, device='cuda')

    for i, (job_ids, actions) in enumerate(zip(x['job_ids'], x['actions'])):
        valid_indices = [job_id_to_idx[jid] for jid in job_ids if jid in job_id_to_idx]
        job_indices[i, :len(valid_indices)] = torch.tensor(valid_indices, dtype=torch.long, device='cuda')
        action_weights_tensor[i, :len(valid_indices)] = torch.tensor(
            [action_weights[a] for a in actions[:len(valid_indices)]], dtype=torch.float32, device='cuda'
        )

    return session_ids, job_indices, action_weights_tensor


In [23]:
def loss_mrr(y_true, y_pred, verbose=True):
    """
    Computes Mean Reciprocal Rank (MRR).

    Args:
        y_true: dataframe of true job_id values per session.
        y_pred: Predicted action and 10 most relevant job_ids per session.

    Returns:
        MRR final score (float).
    """
    y_true_jobs = y_true["job_id"]
    reciprocal_ranks = []

    for true_job, (action, predicted_jobs) in zip(y_true_jobs, y_pred):
        try:
            rank = predicted_jobs.index(true_job) + 1  # Get 1-based rank
            reciprocal_ranks.append(1 / rank)
        except ValueError:
            reciprocal_ranks.append(0)  # y_true not found in top-10

    reciprocal_ranks_tensor = torch.tensor(reciprocal_ranks, dtype=torch.float32)

    # Count occurrences using torch (no Counter)
    unique_values, counts = torch.unique(reciprocal_ranks_tensor, return_counts=True)

    if verbose:
        # Display value counts
        print("\nReciprocal Rank Value Counts:")
        for val, count in zip(unique_values.tolist(), counts.tolist()):
            print(f"  {val:.4f}: {count} occurrences")
        print("\nMRR:", reciprocal_ranks_tensor.mean().item())

    return reciprocal_ranks_tensor.mean().item()

## Recommend list of the 10 most relevant jobs

In [30]:
feature_weights = {
    'title_section': 1.0,
    'location': 0.8,
    'seniority': 0.8,
    'company': 0.5,
    'industry': 0.5
    }

action_weights = {'apply': 1, 'view': 1}

In [26]:
# Function to generate recommendations for sessions dataframe using GPU
def recommend_jobs_for_sessions(x, df_jobs, job_embedding_dict, feature_weights, action_weights, top_n=10, batch_size=64):
    """ Fully parallelized job recommendation system using PyTorch tensor operations. """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Precompute job embeddings and indices for fast access
    job_embeddings_tensor, job_ids_tensor, job_id_to_idx = preprocess_embeddings(df_jobs, job_embedding_dict)

    # Preprocess session interactions
    session_ids, job_indices, action_weights_tensor = preprocess_sessions(x, job_id_to_idx, action_weights)

    all_recommendations = []

    for batch_start in range(0, len(x), batch_size):
        if (batch_start // batch_size + 1) % 20 == 0:
            print("Processing batch", batch_start // batch_size + 1, "of", len(x) // batch_size + 1)
        start_time = time.time()
        batch_end = min(batch_start + batch_size, len(x))
        batch_job_indices = job_indices[batch_start:batch_end]
        batch_action_weights = action_weights_tensor[batch_start:batch_end]

        # Mask out invalid job indices (-1)
        valid_mask = batch_job_indices != -1
        batch_job_embeddings = job_embeddings_tensor[batch_job_indices.clamp(min=0)] * valid_mask.unsqueeze(-1)

        # Compute user profiles in parallel
        batch_weighted_sums = (batch_job_embeddings * batch_action_weights.unsqueeze(-1)).sum(dim=1)
        batch_action_sums = batch_action_weights.sum(dim=1, keepdim=True).clamp(min=1e-6)
        batch_user_profiles = batch_weighted_sums / batch_action_sums  # (batch_size, embedding_dim)

        # Compute cosine similarity in parallel
        similarities = F.cosine_similarity(batch_user_profiles.unsqueeze(1), job_embeddings_tensor.unsqueeze(0), dim=-1)

        # Exclude jobs already interacted with
        batch_seen_jobs = (batch_job_indices.unsqueeze(-1) == job_ids_tensor).any(dim=1)
        similarities[batch_seen_jobs] = -float('inf')  # Mask seen jobs

        # Get top-N recommendations
        top_recommendations = similarities.topk(top_n, dim=-1).indices  # (batch_size, top_n)

        # Convert to list format
        for i in range(batch_start, batch_end):
            session_id = session_ids[i].item()
            recommended_job_ids = job_ids_tensor[top_recommendations[i - batch_start]].tolist()
            all_recommendations.append((session_id, recommended_job_ids))

        del batch_user_profiles, similarities, batch_seen_jobs
        torch.cuda.empty_cache()
        gc.collect()

    return all_recommendations

In [29]:
x, y = X_train, y_train
y_pred = recommend_jobs_for_sessions(x, df_jobs, job_embedding_dict, feature_weights, action_weights)

Processing batch 20 of 249
Processing batch 40 of 249
Processing batch 60 of 249
Processing batch 80 of 249
Processing batch 100 of 249
Processing batch 120 of 249
Processing batch 140 of 249
Processing batch 160 of 249
Processing batch 180 of 249
Processing batch 200 of 249
Processing batch 220 of 249
Processing batch 240 of 249


In [28]:
loss_mrr(y, y_pred)


Reciprocal Rank Value Counts:
  0.0000: 15319 occurrences
  0.1000: 55 occurrences
  0.1111: 56 occurrences
  0.1250: 70 occurrences
  0.1429: 40 occurrences
  0.1667: 45 occurrences
  0.2000: 58 occurrences
  0.2500: 50 occurrences
  0.3333: 79 occurrences
  0.5000: 63 occurrences
  1.0000: 47 occurrences

MRR: 0.01023925468325615


0.01023925468325615