In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.optim import Adam
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn.feature_extraction.text import TfidfTransformer

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print("CUDA Device Name:", torch.cuda.get_device_name(0))

Using device: cuda
CUDA Device Name: NVIDIA GeForce RTX 4060 Laptop GPU


In [9]:
transactions = pd.read_csv('./dataset/cleaned_dataset/cleaned_transactions.csv')

In [10]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,year,month,day_of_week,week_of_year,quarter,day,is_weekend,log_price,repeat_purchase
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,2018,9,3,38,3,20,False,0.049581,True
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,2018,9,3,38,3,20,False,0.030036,False
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,2018,9,3,38,3,20,False,0.015122,False
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2,2018,9,3,38,3,20,False,0.01679,False
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2,2018,9,3,38,3,20,False,0.01679,False


In [11]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28813419 entries, 0 to 28813418
Data columns (total 14 columns):
 #   Column            Dtype  
---  ------            -----  
 0   t_dat             object 
 1   customer_id       object 
 2   article_id        int64  
 3   price             float64
 4   sales_channel_id  int64  
 5   year              int64  
 6   month             int64  
 7   day_of_week       int64  
 8   week_of_year      int64  
 9   quarter           int64  
 10  day               int64  
 11  is_weekend        bool   
 12  log_price         float64
 13  repeat_purchase   bool   
dtypes: bool(2), float64(2), int64(8), object(2)
memory usage: 2.6+ GB


In [12]:
customer_map = {customer: idx for idx, customer in enumerate(transactions['customer_id'].unique())}
transactions['article_id'] = transactions['article_id'].astype(str).str.zfill(10)
article_map = {article: idx for idx, article in enumerate(transactions['article_id'].unique())}

transactions['customer_idx'] = transactions['customer_id'].map(customer_map)
transactions['article_idx'] = transactions['article_id'].map(article_map)

In [13]:
user_item_sparse = csr_matrix(
    (np.ones(len(transactions)), (transactions['customer_idx'], transactions['article_idx'])),
    shape=(len(customer_map), len(article_map))
)

print(user_item_sparse.shape)
save_npz("user_item_sparse.npz", user_item_sparse)
del user_item_sparse

(1362281, 104547)


In [2]:
user_item_sparse = load_npz("user_item_sparse.npz")
tfidf = TfidfTransformer()
user_item_weighted = tfidf.fit_transform(user_item_sparse)

In [None]:
svd = TruncatedSVD(n_components=200, random_state=42, n_iter=10)
user_factors = svd.fit_transform(user_item_weighted).astype(np.float32)
item_factors = svd.components_.astype(np.float32)

np.save('user_factors.npy', user_factors)
np.save('item_factors.npy', item_factors)

user_factors = np.load('user_factors.npy')
item_factors = np.load('item_factors.npy')

In [None]:
def process_user(user_id, top_n=10):
    user_vector = user_factors[user_id, :]
    user_ratings = np.dot(user_vector, item_factors)
    top_articles = np.argsort(user_ratings)[-top_n:]
    top_scores = user_ratings[top_articles]
    return user_id, [(int(article_idx), float(score)) for article_idx, score in zip(top_articles, top_scores)]

predicted_ratings_dict = {user_id: recommendations for user_id, recommendations in 
                           (process_user(user_id) for user_id in range(len(customer_map)))}

In [None]:
reverse_article_map = {idx: str(article).zfill(10) for article, idx in article_map.items()}
predicted_articles_dict = {
    int(user_id): [(int(reverse_article_map[article_idx]), float(score)) 
                   for article_idx, score in recommendations]
    for user_id, recommendations in predicted_ratings_dict.items()
}

svd_df = pd.DataFrame.from_dict(predicted_articles_dict, orient='index')
svd_df.to_csv("svd_recommendations.csv", index_label="customer_idx")
print("SVD recommendations saved.")
svd_df.head()

SVD recommendations saved.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,"(448509014, 0.07383991777896881)","(741356002, 0.0875755101442337)","(609719001, 0.08960200101137161)","(458543001, 0.09383165836334229)","(699081001, 0.10933011025190353)","(673396002, 0.13577984273433685)","(608776002, 0.15751834213733673)","(699080001, 0.22136826813220978)","(723529001, 0.2644086182117462)","(351484002, 0.7037115097045898)"
1,"(575347003, 0.06499998271465302)","(562245046, 0.06550751626491547)","(572797001, 0.06637385487556458)","(720125001, 0.09793731570243835)","(372860002, 0.19594576954841614)","(160442043, 0.22707223892211914)","(608776002, 0.2493567317724228)","(160442010, 0.7563101053237915)","(372860001, 0.8183073997497559)","(160442007, 0.9933298230171204)"
2,"(692930001, 0.015450611710548401)","(448509014, 0.015733662992715836)","(689109001, 0.017918860539793968)","(351484002, 0.022607602179050446)","(599580017, 0.02452857978641987)","(684209013, 0.026517827063798904)","(688537011, 0.029406027868390083)","(684209004, 0.030774079263210297)","(590928001, 0.03607794642448425)","(688537004, 0.04274575412273407)"
3,"(573716012, 0.041612908244132996)","(572797001, 0.041665516793727875)","(608776002, 0.04422762989997864)","(673396002, 0.04531044512987137)","(179123001, 0.04760456830263138)","(678942001, 0.0514645017683506)","(562245018, 0.051895879209041595)","(562245001, 0.06646782159805298)","(673677002, 0.0664808601140976)","(579541001, 0.07416845858097076)"
4,"(706016004, 0.06759428232908249)","(539723001, 0.0753435343503952)","(706016019, 0.08018746972084045)","(539723005, 0.09405452013015747)","(448509014, 0.12291540205478668)","(706016006, 0.2612321972846985)","(706016015, 0.2846921980381012)","(706016003, 0.3867911100387573)","(706016002, 0.9735127091407776)","(706016001, 0.9795500040054321)"


In [6]:
coo_mat = user_item_sparse.tocoo()
indices = torch.tensor(np.vstack((coo_mat.row, coo_mat.col)), dtype=torch.long, device=device)
values = torch.tensor(coo_mat.data, dtype=torch.float32, device=device)

In [7]:
latent_dim = 20
num_epochs = 50
lr = 0.01
reg = 0.1
batch_size = 5000

In [8]:
class ALSModel(nn.Module):
    def __init__(self, num_users, num_items, latent_dim):
        super(ALSModel, self).__init__()
        self.user_factors = nn.Parameter(torch.randn(num_users, latent_dim, device=device) * 0.01)
        self.item_factors = nn.Parameter(torch.randn(num_items, latent_dim, device=device) * 0.01)

    def forward(self, user_batch):
        return torch.matmul(self.user_factors[user_batch], self.item_factors.T)

num_users, num_items = user_item_sparse.shape
model = ALSModel(num_users, num_items, latent_dim).to(device)
optimizer = Adam(model.parameters(), lr=lr)

In [9]:
def als_loss(pred, actual, user_factors, item_factors, reg):
    loss = torch.sum((pred - actual) ** 2)
    loss += reg * (torch.sum(user_factors ** 2) + torch.sum(item_factors ** 2))
    return loss

In [None]:
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    predicted = model(torch.arange(num_users, device=device))
    actual_sparse = user_item_sparse.tocoo()
    actual_indices = torch.tensor(
        np.vstack((actual_sparse.row, actual_sparse.col)), dtype=torch.long, device=device
    )
    actual_values = torch.tensor(actual_sparse.data, dtype=torch.float32, device=device)
    actual_tensor = torch.sparse_coo_tensor(
        actual_indices, actual_values, torch.Size(actual_sparse.shape), device=device
    ).to_dense()
    
    loss = als_loss(predicted, actual_tensor, model.user_factors, model.item_factors, reg)
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

print("ALS Model Training Completed!")
torch.save(model.user_factors, "als_user_factors.pth")
torch.save(model.item_factors, "als_item_factors.pth")

Epoch [10/50], Loss: 50006764.4512
Epoch [20/50], Loss: 362531989.0078
Epoch [30/50], Loss: 131636286.0742
Epoch [40/50], Loss: 260102832.9062
Epoch [50/50], Loss: 105438293.4688
ALS Model Training Completed!


In [None]:
als_user_factors = torch.load("als_user_factors.pth")
als_item_factors = torch.load("als_item_factors.pth")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
als_user_factors = als_user_factors.to(device)
als_item_factors = als_item_factors.to(device)

print("ALS User Factors Shape:", als_user_factors.shape)
print("ALS Item Factors Shape:", als_item_factors.shape)


ALS User Factors Shape: torch.Size([1362281, 20])
ALS Item Factors Shape: torch.Size([104547, 20])


In [None]:
def generate_recommendations(top_n=10):
    all_recommendations = {}
    
    for user_id in range(als_user_factors.shape[0]):
        scores = torch.matmul(als_user_factors[user_id], als_item_factors.T)
        top_items = torch.argsort(scores, descending=True)[:top_n]
        all_recommendations[int(user_id)] = [int(article_id) for article_id in top_items.cpu().numpy()]
    
    return all_recommendations

all_recommendations = generate_recommendations(top_n=10)

In [None]:
reverse_customer_map = {idx: customer for customer, idx in customer_map.items()}
reverse_article_map = {idx: article for article, idx in article_map.items()}

predicted_articles_dict = {
    reverse_customer_map[int(user_id)]: [reverse_article_map[int(article_id)] for article_id in all_recommendations[user_id]]
    for user_id in range(als_user_factors.shape[0])
}

als_df = pd.DataFrame.from_dict(predicted_articles_dict, orient='index')
als_df.to_csv("als_recommendations.csv", index_label="customer_id")

In [None]:
als_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,912472001,867729009,705816002,611258001,893214001,822613005,581113008,608071001,796975001,923755001
00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,583786004,701784032,814312010,730454053,621522008,608071001,819413001,799478002,823922001,723428002
00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4c73235dccbbc132280,660758001,737994021,613064001,565797006,706411004,322017015,743932004,707092001,594387001,608069055
0008968c0d451dbc5a9968da03196fe20051965edde7413775c4eb3be9abe9c2,912472001,705816002,867729009,581113008,923755001,822613005,796975001,581408004,893214001,900658001
000aa7f0dc06cd7174389e76c9e132a67860c5f65f970699daccc14425ac31a8,660758001,706411004,816248004,893948002,742366001,743932004,585955004,378447034,753728001,549917002
