In [0]:
# Calculate the date range for the last 7 days
from datetime import datetime, timedelta


date_range = 7
end_date = datetime.now() - timedelta(days=1)
# Add in correlationid to join search and click, tracking from 3/18/2025
start_date = max(datetime(2025, 3, 18), end_date - timedelta(days=date_range))
date_range = (end_date - start_date).days
print(f"Start date: {start_date}")
print(f"End date: {end_date}")
print(f"Date range: {date_range}")

Start date: 2025-03-18 00:00:00
End date: 2025-03-22 03:10:50.414925
Date range: 4


In [0]:
df = spark.sql(f"""select _token_associate_id as user_id, click_object_id as item_id, sum(click) as rating 
               from onedata_us_east_1_shared_dit.nas_raw_lyric_search_dit.ml_search_with_click
               group by 1, 2
               """)

In [0]:
df.toPandas().to_csv("data/search_click.csv", index=False)

In [0]:
df = spark.sql(f"""select click_object_id as item_id, click_details_caption as title  
               from onedata_us_east_1_shared_dit.nas_raw_lyric_search_dit.ml_search_with_click
               group by 1, 2
               """)

In [0]:
df.toPandas().to_csv("data/item_desc.csv", index=False)

In [0]:
df = spark.sql(f"""
select res1.*, res1.clicks / res2.clicks as rating
from
(select _token_associate_id, details_caption, count(*) as clicks
from onedata_us_east_1_shared_dit.nas_raw_lyric_search_dit.ml_search_click
where length(trim(details_caption)) > 0
group by _token_associate_id, details_caption) res1
inner join
(select _token_associate_id, count(*) as clicks
from onedata_us_east_1_shared_dit.nas_raw_lyric_search_dit.ml_search_click
where length(trim(details_caption)) > 0
group by _token_associate_id) res2
on res1._token_associate_id = res2._token_associate_id
""")

pdf = df.toPandas()
display(pdf)

_token_associate_id,details_caption,clicks,rating
27af8f44-9a62-41ba-ab66-a5e0ee9afba1,Request Time Off,1,0.5
27af8f44-9a62-41ba-ab66-a5e0ee9afba1,View My Time Off,1,0.5
3829c698-e618-47a4-a560-264450c55e9b,Search,2,0.0266666666666666
3829c698-e618-47a4-a560-264450c55e9b,View Profile,1,0.0133333333333333
3829c698-e618-47a4-a560-264450c55e9b,View Details,1,0.0133333333333333
3829c698-e618-47a4-a560-264450c55e9b,Go,2,0.0266666666666666
3829c698-e618-47a4-a560-264450c55e9b,Work Details,1,0.0133333333333333
3829c698-e618-47a4-a560-264450c55e9b,Documents,1,0.0133333333333333
3829c698-e618-47a4-a560-264450c55e9b,Legacy System,2,0.0266666666666666
3829c698-e618-47a4-a560-264450c55e9b,Time Data Import,2,0.0266666666666666


In [0]:
user_dic = list(set(pdf._token_associate_id))
user_to_id = { v: i for i, v in enumerate(user_dic) }
pdf['user_id'] = pdf['_token_associate_id'].map(user_to_id)

item_dic = list(set(pdf.details_caption))
item_to_id = { v: i for i, v in enumerate(item_dic) }
pdf['item_id'] = pdf['details_caption'].map(item_to_id)
pdf = pdf[['user_id', 'item_id', 'rating']]

In [0]:
display(pdf)

user_id,item_id,rating
918,9671,0.5
918,23,0.5
760,8380,0.0266666666666666
760,5348,0.0133333333333333
760,1505,0.0133333333333333
760,5952,0.0266666666666666
760,5783,0.0133333333333333
760,9329,0.0133333333333333
760,4133,0.0266666666666666
760,10008,0.0266666666666666


In [0]:
import numpy as np

np.random.seed(0)
arr = np.arange(pdf.shape[0])
np.random.shuffle(arr)
train_test_ratio = 0.9
train_index = arr[:int(len(arr)*train_test_ratio)]
test_index = arr[int(len(arr)*train_test_ratio):]
train_set = pdf.iloc[train_index,:]
test_set = pdf.iloc[test_index, :]
train_set = [tuple(row) for row in train_set.itertuples(index=False, name=None)]
test_set = [tuple(row) for row in test_set.itertuples(index=False, name=None)]

In [0]:
class ALS(nn.Module):
  def __init__(self, num_users, num_items, hidden_dim):
    super(ALS, self).__init__()
    self.user_Embedding = nn.Embedding(num_users, hidden_dim, max_norm = 1)
    self.item_Embedding = nn.Embedding(num_items, hidden_dim, max_norm = 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, user, item):
    user_embedding = self.user_Embedding(user)
    item_embedding = self.item_Embedding(item)
    out = torch.diag(user_embedding @ item_embedding.T)
    sigmoid_out = self.sigmoid(out)
    return sigmoid_out

In [0]:
num_users = pdf.user_id.nunique() + 1
print(num_users)
num_items = pdf.item_id.nunique() + 1
print(num_items)
num_factors = 100

942
10773


In [0]:
from sklearn.metrics import precision_score,recall_score,accuracy_score

def evaluation(y_pred, y_true):
  p = precision_score(y_true, y_pred)
  r = recall_score(y_true, y_pred)
  acc = accuracy_score(y_true,y_pred)
  return p,r,acc

In [0]:
from torch.utils.data import DataLoader
from sklearn.metrics import precision_score,recall_score,accuracy_score
import torch.optim as optim

model = ALS(num_users, num_items, num_factors)
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.BCELoss()

num_epochs = 10
for epoch in range(num_epochs):
  for user, item, ratings in DataLoader(train_set, batch_size=256, shuffle=True):
    optimizer.zero_grad()
    user_tensor = torch.tensor(user)
    item_tensor = torch.tensor(item)
    predictions = model(user_tensor, item_tensor)
    # predictions = model(user, item)
    loss = loss_fn(predictions, ratings.float())
    loss.backward()
    optimizer.step()
  print(f"Epoch {epoch}, Loss: {loss.item()}")

  y_pred = np.array([1 if i >= 0.5 else 0 for i in predictions])
  # y_pred = predictions.detach().numpy()
  true_scores = np.array([1 if i >= 0.5 else 0 for i in ratings])
  y_true = np.array([1 if i >= 0.5 else 0 for i in true_scores])
  # y_true = ratings.detach().numpy()
  precision, recall, acc = evaluation(y_pred, y_true)
  print('train: Precision {:.4f} | Recall {:.4f} | accuracy {:.4f}'.format(precision, recall, acc))

  user_test = torch.tensor(test_set)[:,0].detach().long()
  item_test = torch.tensor(test_set)[:,1].detach().long()
  predictions = model(user_test, item_test)
  y_pred = np.array([1 if i >= 0.5 else 0 for i in predictions])
  true_scores = torch.tensor(test_set)[:,2].detach().float()
  y_true = np.array([1 if i >= 0.5 else 0 for i in true_scores])
  precision, recall, acc = evaluation(y_pred, y_true)
  print('test: Precision {:.4f} | Recall {:.4f} | accuracy {:.4f}'.format(precision, recall, acc))
  print('----------------------------------------------------------------------------------------')

  user_tensor = torch.tensor(user)
  item_tensor = torch.tensor(item)


Epoch 0, Loss: 0.48553791642189026
train: Precision 0.0714 | Recall 1.0000 | accuracy 0.8768
test: Precision 0.0127 | Recall 0.3571 | accuracy 0.8999
----------------------------------------------------------------------------------------


  user_tensor = torch.tensor(user)
  item_tensor = torch.tensor(item)


Epoch 1, Loss: 0.40716251730918884
train: Precision 1.0000 | Recall 1.0000 | accuracy 1.0000
test: Precision 0.0165 | Recall 0.4286 | accuracy 0.9080
----------------------------------------------------------------------------------------


  user_tensor = torch.tensor(user)
  item_tensor = torch.tensor(item)


Epoch 2, Loss: 0.3640108108520508
train: Precision 0.0000 | Recall 0.0000 | accuracy 1.0000
test: Precision 0.0133 | Recall 0.3571 | accuracy 0.9047
----------------------------------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  user_tensor = torch.tensor(user)
  item_tensor = torch.tensor(item)


Epoch 3, Loss: 0.3446067273616791
train: Precision 0.0000 | Recall 0.0000 | accuracy 1.0000
test: Precision 0.0185 | Recall 0.5000 | accuracy 0.9050
----------------------------------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  user_tensor = torch.tensor(user)
  item_tensor = torch.tensor(item)


Epoch 4, Loss: 0.3452398180961609
train: Precision 0.0000 | Recall 0.0000 | accuracy 1.0000
test: Precision 0.0164 | Recall 0.4286 | accuracy 0.9075
----------------------------------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  user_tensor = torch.tensor(user)
  item_tensor = torch.tensor(item)


Epoch 5, Loss: 0.33869990706443787
train: Precision 0.0000 | Recall 0.0000 | accuracy 0.9953
test: Precision 0.0188 | Recall 0.5000 | accuracy 0.9062
----------------------------------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  user_tensor = torch.tensor(user)
  item_tensor = torch.tensor(item)


Epoch 6, Loss: 0.33723652362823486
train: Precision 0.0000 | Recall 0.0000 | accuracy 1.0000
test: Precision 0.0163 | Recall 0.4286 | accuracy 0.9072
----------------------------------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  user_tensor = torch.tensor(user)
  item_tensor = torch.tensor(item)


Epoch 7, Loss: 0.339383602142334
train: Precision 0.0000 | Recall 0.0000 | accuracy 0.9953
test: Precision 0.0193 | Recall 0.5000 | accuracy 0.9087
----------------------------------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  user_tensor = torch.tensor(user)
  item_tensor = torch.tensor(item)


Epoch 8, Loss: 0.335794597864151
train: Precision 0.0000 | Recall 0.0000 | accuracy 0.9953
test: Precision 0.0167 | Recall 0.4286 | accuracy 0.9093
----------------------------------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  user_tensor = torch.tensor(user)
  item_tensor = torch.tensor(item)


Epoch 9, Loss: 0.333507776260376
train: Precision 0.0000 | Recall 0.0000 | accuracy 0.9905
test: Precision 0.0196 | Recall 0.5000 | accuracy 0.9100
----------------------------------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
