### Mounting Google Drive

In [10]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under /content/drive/MyDrive, so this cell must run first.
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

### Selecting the compute device (CUDA GPU, Apple MPS, Intel XPU, or CPU)

In [2]:
# Pick the best available accelerator (note: TPUs are NOT detected here)
def get_device():
    """Return the preferred ``torch.device`` for this runtime.

    Preference order: CUDA GPU, Apple-Silicon MPS, Intel XPU, then CPU.

    ``torch.backends.mps`` and ``torch.xpu`` are not present in every
    PyTorch build, so both are looked up defensively; a missing backend is
    treated as "not available" instead of raising ``AttributeError``.

    Returns:
        torch.device: one of ``cuda``, ``mps``, ``xpu`` or ``cpu``.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")

    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")  # Apple Silicon

    xpu_backend = getattr(torch, "xpu", None)
    if xpu_backend is not None and xpu_backend.is_available():
        return torch.device("xpu")  # Intel GPU

    return torch.device("cpu")

device = get_device()

### Loading data

In [21]:
# Load Dataset
def load_data(csv_path):
    """Load the vendor CSV and build the text column used by the model.

    Args:
        csv_path: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV containing at least the columns ``product_name``,
            ``main_category``, ``Features`` and ``rating``.

    Returns:
        pandas.DataFrame: the four columns above plus ``combined_features``
        (``"Category: <main_category> Features: <Features>"``). Missing text
        values become empty strings. ``rating`` is coerced to a numeric
        dtype, and rows without a usable rating are dropped because the
        rating is both the training target and a ranking input.

    Raises:
        KeyError: if one of the required columns is absent from the CSV.
    """
    text_columns = ['product_name', 'main_category', 'Features']

    df = pd.read_csv(csv_path)
    df = df[text_columns + ['rating']].copy()

    # Only the text columns get '' as a filler. Filling `rating` with ''
    # would turn it into a mixed str/float column that cannot be converted
    # to a float tensor or divided by 5.0 later on.
    df[text_columns] = df[text_columns].astype(object).fillna('')

    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    df = df.dropna(subset=['rating']).reset_index(drop=True)

    # Vectorised concatenation: same text as the per-row f-string, but it
    # also works on an empty frame and avoids a Python-level loop.
    df['combined_features'] = (
        "Category: " + df['main_category'].astype(str)
        + " Features: " + df['Features'].astype(str)
    )
    return df

### Getting the DistilBERT tokenizer

In [25]:
# Module-level tokenizer for the 'distilbert-base-uncased' checkpoint; it is
# shared by prepare_data() (training) and predict() (inference).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

### Pre-processing data for DistilBERT with attention masks

In [24]:
# Prepare Data for Training
def prepare_data(df):
    """Tokenize ``combined_features`` and bundle it with the ratings.

    Args:
        df: DataFrame providing the ``combined_features`` and ``rating``
            columns.

    Returns:
        TensorDataset: ``(input_ids, attention_mask, rating)`` tensors, all
        already moved to the module-level ``device``. Every text is
        truncated or padded to exactly 128 tokens.
    """
    sentences = df['combined_features'].tolist()
    targets = torch.tensor(df['rating'].tolist(), dtype=torch.float32).to(device)

    encoded = tokenizer(
        sentences,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt',
    )
    token_ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    return TensorDataset(token_ids, mask, targets)

### Loading model

In [26]:
# Load Model
def load_model():
    """Create a DistilBERT sequence-classification model with one output.

    The model is loaded from the 'distilbert-base-uncased' checkpoint with
    ``num_labels=1`` and moved to the module-level ``device``.

    Returns:
        The model instance, already placed on ``device``.
    """
    regressor = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=1,
    )
    # Module.to() moves the parameters in place and returns the module itself.
    return regressor.to(device)

### Training model with 3 epochs and batch processing

In [27]:
# Train Model
def train_model(df, epochs=3, batch_size=16):
    """Fine-tune DistilBERT to predict ``rating`` from ``combined_features``.

    Args:
        df: DataFrame with ``combined_features`` and ``rating`` columns
            (as consumed by ``prepare_data``).
        epochs: Number of full passes over the data.
        batch_size: Mini-batch size for the shuffled ``DataLoader``.

    Returns:
        The fine-tuned model, switched to evaluation mode so that dropout is
        disabled for any inference that follows. Call ``model.train()``
        before resuming training.

    Raises:
        ValueError: if ``df`` has no rows (nothing to train on; the average
            loss would otherwise divide by zero).
    """
    if len(df) == 0:
        raise ValueError("train_model requires a non-empty DataFrame")

    dataset = prepare_data(df)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = load_model()
    optimizer = optim.Adam(model.parameters(), lr=2e-5)
    # Smooth L1 (Huber-style) regression loss on the predicted rating.
    loss_fn = nn.SmoothL1Loss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for input_ids, attention_mask, labels in dataloader:
            targets = labels.unsqueeze(1)  # match the logits' trailing dimension
            optimizer.zero_grad()
            # `labels` is deliberately not passed to the model: per the
            # Transformers docs that makes the model compute its own loss,
            # which was discarded here in favour of `loss_fn`.
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(output.logits, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")

    # Leave the model ready for inference (dropout off).
    model.eval()
    return model

### Function to predict the similarity score (the model's predicted rating, scaled to the 0–1 range)

In [28]:
# Predict Function
def predict(model, query):
    """Score one text with the fine-tuned model.

    Args:
        model: Model returned by ``train_model`` / ``load_model``.
        query: Text to score; it is truncated or padded to 128 tokens.

    Returns:
        The model's single output logit divided by 5.0 and clipped to
        ``[0, 1]``.

    The forward pass runs in evaluation mode so dropout cannot make the
    score vary between calls; the model's previous train/eval mode is
    restored afterwards.
    """
    encoding = tokenizer(query, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    encoding = {key: value.to(device) for key, value in encoding.items()}

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(encoding['input_ids'], encoding['attention_mask'])
    finally:
        if was_training:
            model.train()

    score = output.logits.item()
    return np.clip(score / 5.0, 0, 1)  # Normalize to 0-1 range

### Training model

In [29]:
# Example Usage
# Load the vendor CSV from the mounted Google Drive and fine-tune the model on
# it. The path only resolves after the Drive-mount cell has been run; `df` and
# `model` are reused by the search cells further down.
csv_path = '/content/drive/MyDrive/Pyramyd OA/G2 software product overview.csv'
df = load_data(csv_path)
model = train_model(df)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 1.0038079870125605
Epoch 2, Loss: 0.07223355487757732
Epoch 3, Loss: 0.07472688893950175


### Searching vendors based on similarity

In [30]:
# Search Vendors
def search_vendors(model, df, category, feature_query, similarity_threshold=0.5):
    """Filter vendors by category, score them with the model and rank them.

    Args:
        model: Fine-tuned model passed through to ``predict``.
        df: DataFrame with ``product_name``, ``main_category``, ``rating``
            and ``combined_features`` columns.
        category: Case-insensitive literal substring to look for in
            ``main_category``.
        feature_query: Free-text feature request.
            NOTE(review): this value does not currently influence the
            scores. Each vendor is scored from its own
            ``combined_features`` text only (the query string that used to
            be built from it was never used). Confirm the intended
            query/vendor matching before relying on ``similarity_score``.
        similarity_threshold: Minimum ``similarity_score`` (0-1) to keep.

    Returns:
        pandas.DataFrame: columns ``product_name``, ``main_category``,
        ``rating``, ``similarity_score`` and ``final_score``, sorted by
        ``final_score`` descending. It has the same columns but no rows when
        nothing matches.
    """
    result_columns = ['product_name', 'main_category', 'rating', 'similarity_score', 'final_score']

    # Filter category first. regex=False makes this a literal substring
    # match, so names such as "C++" or "Sales (CRM)" neither raise re.error
    # nor match unrelated categories.
    category_mask = df['main_category'].str.lower().str.contains(
        category.lower(), na=False, regex=False
    )
    filtered_df = df[category_mask].copy()

    if filtered_df.empty:
        print(f"No vendors found for category: {category}")
        return pd.DataFrame(columns=result_columns)

    # Predict a score for each vendor from its own feature text
    filtered_df['similarity_score'] = filtered_df['combined_features'].apply(lambda text: predict(model, text))

    # Filter by similarity threshold
    matched_vendors = filtered_df[filtered_df['similarity_score'] >= similarity_threshold].copy()

    if matched_vendors.empty:
        print("No relevant vendors found based on the feature query.")
        return pd.DataFrame(columns=result_columns)

    # Rank vendors by a 70/30 blend of model score and normalized rating
    matched_vendors['rating_normalized'] = matched_vendors['rating'] / 5.0
    matched_vendors['final_score'] = (0.7 * matched_vendors['similarity_score'] +
                                      0.3 * matched_vendors['rating_normalized'])

    # Sort by ranking score
    ranked_vendors = matched_vendors.sort_values('final_score', ascending=False).reset_index(drop=True)
    return ranked_vendors[result_columns]

### Ranking vendors and getting top 10 vendors

In [None]:
# Example search; `model` and `df` must already exist from the training cell.
category = "Accounting & Finance Software"
feature_query = "Budgeting"
matched_vendors_df = search_vendors(model, df, category, feature_query)

# Display top results (search_vendors returns rows sorted by final_score, best first)
print(matched_vendors_df.head(10))