### 5. Model Training - Content-Based CNN with GloVe Embeddings

We'll implement a content-based news recommendation system using:
- **CNN architecture** to capture local patterns in news text
- **Pre-trained GloVe embeddings** for word representations
- **Multi-channel approach** processing titles and abstracts separately
- **User interaction data** for training labels

In [34]:
import torch
import sys
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
import pickle
import numpy as np
import pandas as pd
import ast
from tqdm import tqdm

In [2]:
# Load prepared dictionaries and embeddings
with open('embeddings/word_dict.pkl', 'rb') as f:
    word_dict = pickle.load(f)
    
with open('embeddings/category_dict.pkl', 'rb') as f:
    category_dict = pickle.load(f)

with open('embeddings/subcategory_dict.pkl', 'rb') as f:
    subcategory_dict = pickle.load(f)

with open('embeddings/uid2index.pkl', 'rb') as f:
    uid2index = pickle.load(f)

# Load GloVe embedding matrix
embedding_matrix = np.load('embeddings/embedding.npy')

print(f"\n=== DATA LOADED ===")
print(f"Vocabulary size: {len(word_dict)}")
print(f"Embedding matrix shape: {embedding_matrix.shape}")
print(f"Number of users: {len(uid2index)}")
print(f"Number of categories: {len(category_dict)}")
print(f"Number of subcategories: {len(subcategory_dict)}")

# Set device with proper priority
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"\nUsing device: {device} (CUDA)")
elif hasattr(torch, 'xpu') and torch.xpu.is_available():
    device = torch.device('xpu')
    print(f"\nUsing device: {device} (Intel XPU)")
else:
    device = torch.device('cpu')
    print(f"\nUsing device: {device} (CPU)")

print(f"Device type: {device.type}")


=== DATA LOADED ===
Vocabulary size: 23761
Embedding matrix shape: (23762, 100)
Number of users: 1001
Number of categories: 16
Number of subcategories: 190

Using device: xpu (Intel XPU)
Device type: xpu


In [3]:
import pandas as pd
import ast
news_data=pd.read_csv('news_data.tsv', sep='\t')
behaviours_data=pd.read_csv('behaviours_data.tsv', sep='\t')
behaviours_data.info()

# Convert string representations of lists to actual lists
for col in ['Clicked_News', 'Non_Clicked_News']:
    behaviours_data[col] = behaviours_data[col].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16606 entries, 0 to 16605
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Impression_ID                   16606 non-null  int64 
 1   User_ID                         16606 non-null  object
 2   Time                            16606 non-null  object
 3   History                         16332 non-null  object
 4   Impressions                     16606 non-null  object
 5   Hour                            16606 non-null  int64 
 6   DayOfWeek                       16606 non-null  object
 7   Clicked_News                    16606 non-null  object
 8   Non_Clicked_News                16606 non-null  object
 9   First_Clicked_News_Category     16606 non-null  object
 10  First_Clicked_News_SubCategory  16606 non-null  object
dtypes: int64(2), object(9)
memory usage: 1.4+ MB


In [27]:
# Preprocessing functions
def text_to_sequence(text_tokens, word_dict, max_len=30):
    """Convert tokenized text to sequence of word indices"""
    sequence = []
    for word in text_tokens[:max_len]:  # Truncate to max_len
        if word in word_dict:
            sequence.append(word_dict[word])
        else:
            sequence.append(0)  # Unknown word
    
    # Pad sequence to max_len
    while len(sequence) < max_len:
        sequence.append(0)
    
    return sequence

def prepare_training_data(news_data, behaviour_data):
    """Prepare training data from news and behaviour data"""
    training_samples = []
    
    # Get user-news interactions
    for _, row in behaviour_data.iterrows():
        user_id = row['User_ID']
        if user_id not in uid2index:
            continue
            
        user_idx = uid2index[user_id]
        
        # Positive samples (clicked news)
        if row['Clicked_News'] and len(row['Clicked_News']) > 0:
            for news_id in row['Clicked_News']:
                news_info = news_data[news_data['News_ID'] == news_id]
                if len(news_info) > 0:
                    training_samples.append({
                        'user_idx': user_idx,
                        'news_id': news_id,
                        'label': 1
                    })
        
        # Negative samples (non-clicked news from impressions)
        if row['Non_Clicked_News'] and len(row['Non_Clicked_News']) > 0:
            # Sample some negative examples (to balance dataset)
            neg_samples = row['Non_Clicked_News'][:len(row['Clicked_News'])] if row['Clicked_News'] else row['Non_Clicked_News'][:5]
            for news_id in neg_samples:
                news_info = news_data[news_data['News_ID'] == news_id]
                if len(news_info) > 0:
                    training_samples.append({
                        'user_idx': user_idx,
                        'news_id': news_id,
                        'label': 0
                    })
    
    return training_samples

print("Preparing training data...")
training_samples = prepare_training_data(news_data, behaviours_data)
print(f"Total training samples: {len(training_samples)}")

# Count positive and negative samples
pos_samples = sum(1 for sample in training_samples if sample['label'] == 1)
neg_samples = len(training_samples) - pos_samples
print(f"Positive samples: {pos_samples}, Negative samples: {neg_samples}")

Preparing training data...
Total training samples: 53979
Positive samples: 26993, Negative samples: 26986


In [28]:
# Dataset class for news recommendation
class NewsRecommendationDataset(Dataset):
    def __init__(self, samples, news_data, word_dict, category_dict, subcategory_dict, max_title_len=30, max_abstract_len=50):
        self.samples = samples
        self.news_data = news_data.set_index('News_ID')
        self.word_dict = word_dict
        self.category_dict = category_dict
        self.subcategory_dict = subcategory_dict
        self.max_title_len = max_title_len
        self.max_abstract_len = max_abstract_len
        
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        sample = self.samples[idx]
        news_id = sample['news_id']
        user_idx = sample['user_idx']
        label = sample['label']
        
        # Get news information
        try:
            news_row = self.news_data.loc[news_id]
            
            # Convert title and abstract to sequences
            title_seq = text_to_sequence(news_row['Title'], self.word_dict, self.max_title_len)
            abstract_seq = text_to_sequence(news_row['Abstract'], self.word_dict, self.max_abstract_len)
            
            # Get category and subcategory indices
            category_idx = self.category_dict.get(news_row['Category'], 0)
            subcategory_idx = self.subcategory_dict.get(news_row['SubCategory'], 0)
            
            return {
                'user_idx': torch.tensor(user_idx, dtype=torch.long),
                'title_seq': torch.tensor(title_seq, dtype=torch.long),
                'abstract_seq': torch.tensor(abstract_seq, dtype=torch.long),
                'category_idx': torch.tensor(category_idx, dtype=torch.long),
                'subcategory_idx': torch.tensor(subcategory_idx, dtype=torch.long),
                'label': torch.tensor(label, dtype=torch.float)
            }
        except KeyError:
            # Handle missing news (return zeros)
            return {
                'user_idx': torch.tensor(user_idx, dtype=torch.long),
                'title_seq': torch.zeros(self.max_title_len, dtype=torch.long),
                'abstract_seq': torch.zeros(self.max_abstract_len, dtype=torch.long),
                'category_idx': torch.tensor(0, dtype=torch.long),
                'subcategory_idx': torch.tensor(0, dtype=torch.long),
                'label': torch.tensor(label, dtype=torch.float)
            }

# Create train/validation split
train_samples, val_samples = train_test_split(training_samples, test_size=0.2, random_state=42, stratify=[s['label'] for s in training_samples])

print(f"Training samples: {len(train_samples)}")
print(f"Validation samples: {len(val_samples)}")

# Create datasets and dataloaders
train_dataset = NewsRecommendationDataset(train_samples, news_data, word_dict, category_dict, subcategory_dict)
val_dataset = NewsRecommendationDataset(val_samples, news_data, word_dict, category_dict, subcategory_dict)

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")

Training samples: 43183
Validation samples: 10796
Training batches: 338
Validation batches: 85


In [29]:
# Content-Based CNN Model
class NewsRecommendationCNN(nn.Module):
    def __init__(self, embedding_matrix, num_users, num_categories, num_subcategories, 
                 embed_dim=100, num_filters=128, filter_sizes=[3, 4, 5], 
                 user_embed_dim=50, category_embed_dim=20, dropout=0.3):
        super(NewsRecommendationCNN, self).__init__()
        
        vocab_size = embedding_matrix.shape[0]
        self.embed_dim = embed_dim
        self.num_filters = num_filters
        
        # Word embeddings (pre-trained GloVe)
        self.word_embedding = nn.Embedding(vocab_size, embed_dim)
        self.word_embedding.weight = nn.Parameter(torch.FloatTensor(embedding_matrix))
        self.word_embedding.weight.requires_grad = True  # Fine-tune embeddings
        
        # User embeddings
        self.user_embedding = nn.Embedding(num_users + 1, user_embed_dim)
        
        # Category embeddings
        self.category_embedding = nn.Embedding(num_categories + 1, category_embed_dim)
        self.subcategory_embedding = nn.Embedding(num_subcategories + 1, category_embed_dim)
        
        # CNN layers for title
        self.title_convs = nn.ModuleList([
            nn.Conv1d(embed_dim, num_filters, kernel_size=k) for k in filter_sizes
        ])
        
        # CNN layers for abstract
        self.abstract_convs = nn.ModuleList([
            nn.Conv1d(embed_dim, num_filters, kernel_size=k) for k in filter_sizes
        ])
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        
        # Calculate final feature dimension
        cnn_output_dim = len(filter_sizes) * num_filters * 2  # title + abstract
        total_dim = cnn_output_dim + user_embed_dim + category_embed_dim * 2
        
        # Final prediction layers
        self.fc = nn.Sequential(
            nn.Linear(total_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    
    def forward(self, user_idx, title_seq, abstract_seq, category_idx, subcategory_idx):
        batch_size = title_seq.size(0)
        
        # User embeddings
        user_emb = self.user_embedding(user_idx)
        
        # Category embeddings
        cat_emb = self.category_embedding(category_idx)
        subcat_emb = self.subcategory_embedding(subcategory_idx)
        
        # Word embeddings for title and abstract
        title_emb = self.word_embedding(title_seq)  # (batch, seq_len, embed_dim)
        abstract_emb = self.word_embedding(abstract_seq)
        
        # Transpose for CNN (batch, embed_dim, seq_len)
        title_emb = title_emb.transpose(1, 2)
        abstract_emb = abstract_emb.transpose(1, 2)
        
        # Apply CNN to title
        title_conv_outputs = []
        for conv in self.title_convs:
            conv_out = F.relu(conv(title_emb))  # (batch, num_filters, new_seq_len)
            pooled = F.max_pool1d(conv_out, conv_out.size(2)).squeeze(2)  # (batch, num_filters)
            title_conv_outputs.append(pooled)
        title_features = torch.cat(title_conv_outputs, dim=1)
        
        # Apply CNN to abstract
        abstract_conv_outputs = []
        for conv in self.abstract_convs:
            conv_out = F.relu(conv(abstract_emb))
            pooled = F.max_pool1d(conv_out, conv_out.size(2)).squeeze(2)
            abstract_conv_outputs.append(pooled)
        abstract_features = torch.cat(abstract_conv_outputs, dim=1)
        
        # Combine all features
        combined_features = torch.cat([
            user_emb, title_features, abstract_features, cat_emb, subcat_emb
        ], dim=1)
        
        # Apply dropout
        combined_features = self.dropout(combined_features)
        
        # Final prediction
        output = self.fc(combined_features)
        
        return output.squeeze(1)

# Initialize model
model = NewsRecommendationCNN(
    embedding_matrix=embedding_matrix,
    num_users=len(uid2index),
    num_categories=len(category_dict),
    num_subcategories=len(subcategory_dict),
    embed_dim=100,
    num_filters=128,
    filter_sizes=[3, 4, 5],
    user_embed_dim=50,
    category_embed_dim=20,
    dropout=0.3
).to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Model initialized successfully!")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model device: {next(model.parameters()).device}")

Model initialized successfully!
Total parameters: 2,974,845
Trainable parameters: 2,974,845
Model device: xpu:0


In [35]:
# Training and evaluation functions
def train_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    all_predictions = []
    all_labels = []
    
    for batch in tqdm(train_loader, desc="Training"):
        # Move batch to device
        user_idx = batch['user_idx'].to(device)
        title_seq = batch['title_seq'].to(device)
        abstract_seq = batch['abstract_seq'].to(device)
        category_idx = batch['category_idx'].to(device)
        subcategory_idx = batch['subcategory_idx'].to(device)
        labels = batch['label'].to(device)
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(user_idx, title_seq, abstract_seq, category_idx, subcategory_idx)
        loss = criterion(outputs, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        # Store predictions and labels for metrics
        all_predictions.extend(outputs.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy())
    
    avg_loss = total_loss / len(train_loader)
    auc = roc_auc_score(all_labels, all_predictions)
    acc = accuracy_score(all_labels, [1 if p > 0.5 else 0 for p in all_predictions])
    
    return avg_loss, auc, acc

def evaluate(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            # Move batch to device
            user_idx = batch['user_idx'].to(device)
            title_seq = batch['title_seq'].to(device)
            abstract_seq = batch['abstract_seq'].to(device)
            category_idx = batch['category_idx'].to(device)
            subcategory_idx = batch['subcategory_idx'].to(device)
            labels = batch['label'].to(device)
            
            # Forward pass
            outputs = model(user_idx, title_seq, abstract_seq, category_idx, subcategory_idx)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            
            # Store predictions and labels for metrics
            all_predictions.extend(outputs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(val_loader)
    auc = roc_auc_score(all_labels, all_predictions)
    acc = accuracy_score(all_labels, [1 if p > 0.5 else 0 for p in all_predictions])
    
    return avg_loss, auc, acc

# Initialize training components
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

print("Training components initialized!")

Training components initialized!


In [None]:
# Training loop
num_epochs = 20
best_val_auc = 0
best_model_state = None
patience = 5
patience_counter = 0

print("Starting training...")
print(f"Training for {num_epochs} epochs")
print("-" * 60)

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    
    # Training
    train_loss, train_auc, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    
    # Validation
    val_loss, val_auc, val_acc = evaluate(model, val_loader, criterion, device)
    
    # Learning rate scheduling
    scheduler.step(val_loss)
    
    # Print metrics
    print(f"Train Loss: {train_loss:.4f}, Train AUC: {train_auc:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val AUC: {val_auc:.4f}, Val Acc: {val_acc:.4f}")
    print(f"Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")
    
    # Save best model
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        best_model_state = model.state_dict().copy()
        patience_counter = 0
        print(f"New best model! AUC: {best_val_auc:.4f}")
    else:
        patience_counter += 1
    
    # Early stopping
    if patience_counter >= patience:
        print(f"Early stopping triggered after {patience} epochs without improvement")
        break
    
    print("-" * 60)

# Load best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"Loaded best model with validation AUC: {best_val_auc:.4f}")

# Save the trained model
torch.save({
    'model_state_dict': model.state_dict(),
    'word_dict': word_dict,
    'category_dict': category_dict,
    'subcategory_dict': subcategory_dict,
    'uid2index': uid2index,
    'best_val_auc': best_val_auc,
    'model_config': {
        'embed_dim': 100,
        'num_filters': 128,
        'filter_sizes': [3, 4, 5],
        'user_embed_dim': 50,
        'category_embed_dim': 20,
        'dropout': 0.3
    }
}, 'cnn_model.pth')

print("Training completed!")
print(f"Model saved as 'cnn_model.pth'")
print(f"Best validation AUC: {best_val_auc:.4f}")

Starting training...
Training for 20 epochs
------------------------------------------------------------
Epoch 1/20


Training: 100%|██████████| 338/338 [00:41<00:00,  8.19it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:06<00:00, 12.59it/s]



Train Loss: 0.6901, Train AUC: 0.5421, Train Acc: 0.5300
Val Loss: 0.6799, Val AUC: 0.5959, Val Acc: 0.5713
Learning Rate: 0.001000
New best model! AUC: 0.5959
------------------------------------------------------------
Epoch 2/20


Training: 100%|██████████| 338/338 [00:26<00:00, 12.83it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 24.72it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 24.72it/s]


Train Loss: 0.6781, Train AUC: 0.5975, Train Acc: 0.5709
Val Loss: 0.6783, Val AUC: 0.6143, Val Acc: 0.5774
Learning Rate: 0.001000
New best model! AUC: 0.6143
------------------------------------------------------------
Epoch 3/20


Training: 100%|██████████| 338/338 [00:29<00:00, 11.51it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 25.76it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 25.76it/s]


Train Loss: 0.6725, Train AUC: 0.6140, Train Acc: 0.5824
Val Loss: 0.6677, Val AUC: 0.6244, Val Acc: 0.5894
Learning Rate: 0.001000
New best model! AUC: 0.6244
------------------------------------------------------------
Epoch 4/20


Training: 100%|██████████| 338/338 [00:28<00:00, 11.95it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 27.10it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 27.10it/s]


Train Loss: 0.6663, Train AUC: 0.6306, Train Acc: 0.5934
Val Loss: 0.6686, Val AUC: 0.6276, Val Acc: 0.5823
Learning Rate: 0.001000
New best model! AUC: 0.6276
------------------------------------------------------------
Epoch 5/20


Training: 100%|██████████| 338/338 [00:39<00:00,  8.60it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:08<00:00, 10.55it/s]



Train Loss: 0.6637, Train AUC: 0.6358, Train Acc: 0.5965
Val Loss: 0.6678, Val AUC: 0.6302, Val Acc: 0.5883
Learning Rate: 0.001000
New best model! AUC: 0.6302
------------------------------------------------------------
Epoch 6/20


Training: 100%|██████████| 338/338 [00:38<00:00,  8.81it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:04<00:00, 20.09it/s]
Evaluating: 100%|██████████| 85/85 [00:04<00:00, 20.09it/s]


Train Loss: 0.6592, Train AUC: 0.6450, Train Acc: 0.6028
Val Loss: 0.6638, Val AUC: 0.6337, Val Acc: 0.5898
Learning Rate: 0.001000
New best model! AUC: 0.6337
------------------------------------------------------------
Epoch 7/20


Training: 100%|██████████| 338/338 [00:31<00:00, 10.56it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 26.36it/s]



Train Loss: 0.6571, Train AUC: 0.6498, Train Acc: 0.6054
Val Loss: 0.6642, Val AUC: 0.6323, Val Acc: 0.5919
Learning Rate: 0.001000
------------------------------------------------------------
Epoch 8/20


Training: 100%|██████████| 338/338 [00:39<00:00,  8.55it/s]

Evaluating: 100%|██████████| 85/85 [00:04<00:00, 18.15it/s]



Train Loss: 0.6543, Train AUC: 0.6551, Train Acc: 0.6114
Val Loss: 0.6633, Val AUC: 0.6360, Val Acc: 0.5974
Learning Rate: 0.001000
New best model! AUC: 0.6360
------------------------------------------------------------
Epoch 9/20


Training: 100%|██████████| 338/338 [00:40<00:00,  8.43it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:04<00:00, 18.49it/s]



Train Loss: 0.6504, Train AUC: 0.6598, Train Acc: 0.6128
Val Loss: 0.6649, Val AUC: 0.6356, Val Acc: 0.6002
Learning Rate: 0.001000
------------------------------------------------------------
Epoch 10/20


Training: 100%|██████████| 338/338 [00:44<00:00,  7.60it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:04<00:00, 18.67it/s]
Evaluating: 100%|██████████| 85/85 [00:04<00:00, 18.67it/s]


Train Loss: 0.6467, Train AUC: 0.6658, Train Acc: 0.6151
Val Loss: 0.6668, Val AUC: 0.6348, Val Acc: 0.5977
Learning Rate: 0.001000
------------------------------------------------------------
Epoch 11/20


Training: 100%|██████████| 338/338 [00:38<00:00,  8.72it/s]

Evaluating: 100%|██████████| 85/85 [00:04<00:00, 18.35it/s]
Evaluating: 100%|██████████| 85/85 [00:04<00:00, 18.35it/s]


Train Loss: 0.6454, Train AUC: 0.6691, Train Acc: 0.6182
Val Loss: 0.6655, Val AUC: 0.6352, Val Acc: 0.5963
Learning Rate: 0.001000
------------------------------------------------------------
Epoch 12/20


Training: 100%|██████████| 338/338 [00:40<00:00,  8.34it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:04<00:00, 18.41it/s]
Evaluating: 100%|██████████| 85/85 [00:04<00:00, 18.41it/s]


Train Loss: 0.6404, Train AUC: 0.6753, Train Acc: 0.6205
Val Loss: 0.6780, Val AUC: 0.6370, Val Acc: 0.6003
Learning Rate: 0.000500
New best model! AUC: 0.6370
------------------------------------------------------------
Epoch 13/20


Training: 100%|██████████| 338/338 [00:48<00:00,  7.00it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:04<00:00, 19.00it/s]
Evaluating: 100%|██████████| 85/85 [00:04<00:00, 19.00it/s]


Train Loss: 0.6328, Train AUC: 0.6861, Train Acc: 0.6261
Val Loss: 0.6680, Val AUC: 0.6374, Val Acc: 0.6002
Learning Rate: 0.000500
New best model! AUC: 0.6374
------------------------------------------------------------
Epoch 14/20


Training: 100%|██████████| 338/338 [00:39<00:00,  8.49it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:02<00:00, 28.98it/s]
Evaluating: 100%|██████████| 85/85 [00:02<00:00, 28.98it/s]


Train Loss: 0.6299, Train AUC: 0.6928, Train Acc: 0.6337
Val Loss: 0.6722, Val AUC: 0.6341, Val Acc: 0.5959
Learning Rate: 0.000500
------------------------------------------------------------
Epoch 15/20


Training: 100%|██████████| 338/338 [00:28<00:00, 11.84it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 23.11it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 23.11it/s]


Train Loss: 0.6284, Train AUC: 0.6930, Train Acc: 0.6345
Val Loss: 0.6696, Val AUC: 0.6374, Val Acc: 0.5998
Learning Rate: 0.000500
------------------------------------------------------------
Epoch 16/20


Training: 100%|██████████| 338/338 [00:29<00:00, 11.46it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 23.42it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 23.42it/s]


Train Loss: 0.6257, Train AUC: 0.6983, Train Acc: 0.6357
Val Loss: 0.6697, Val AUC: 0.6369, Val Acc: 0.6013
Learning Rate: 0.000250
------------------------------------------------------------
Epoch 17/20


Training: 100%|██████████| 338/338 [00:41<00:00,  8.06it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 27.21it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 27.21it/s]


Train Loss: 0.6198, Train AUC: 0.7059, Train Acc: 0.6405
Val Loss: 0.6744, Val AUC: 0.6363, Val Acc: 0.6005
Learning Rate: 0.000250
------------------------------------------------------------
Epoch 18/20


Training: 100%|██████████| 338/338 [00:28<00:00, 11.78it/s]
Evaluating:   0%|          | 0/85 [00:00<?, ?it/s]
Evaluating: 100%|██████████| 85/85 [00:03<00:00, 24.70it/s]



Train Loss: 0.6174, Train AUC: 0.7110, Train Acc: 0.6480
Val Loss: 0.6776, Val AUC: 0.6331, Val Acc: 0.5961
Learning Rate: 0.000250
Early stopping triggered after 5 epochs without improvement
Loaded best model with validation AUC: 0.6374
Training completed!
Model saved as 'news_recommendation_cnn_model.pth'
Best validation AUC: 0.6374


In [44]:
# Model Evaluation: Personalized News Recommendations
def recommend_for_user(user_id, model_path, news_data, word_dict, category_dict, subcategory_dict, uid2index, device, top_k=10):
    """Recommend top news articles for a given user_id using the trained CNN model."""
    
    # Check if user exists
    if user_id not in uid2index:
        print(f"User_ID {user_id} not found in the dataset.")
        return []
    
    # Load the trained model
    print(f"Loading model from {model_path}...")
    checkpoint = torch.load(model_path, map_location=device, weights_only=False)
    model_config = checkpoint['model_config']
    
    # Initialize model with saved configuration
    model = NewsRecommendationCNN(
        embedding_matrix=embedding_matrix,
        num_users=len(uid2index),
        num_categories=len(category_dict),
        num_subcategories=len(subcategory_dict),
        **model_config
    ).to(device)
    
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    print(f"Model loaded successfully! Best validation AUC: {checkpoint['best_val_auc']:.4f}")
    
    user_idx = uid2index[user_id]
    
    # Prepare all news articles as candidates
    print("Preparing candidate news articles...")
    candidates = []
    for _, news_row in news_data.iterrows():
        news_id = news_row['News_ID']
        title_seq = text_to_sequence(news_row['Title'], word_dict, max_len=30)
        abstract_seq = text_to_sequence(news_row['Abstract'], word_dict, max_len=50)
        category_idx = category_dict.get(news_row['Category'], 0)
        subcategory_idx = subcategory_dict.get(news_row['SubCategory'], 0)
        
        candidates.append({
            'news_id': news_id,
            'title': news_row['Title'],
            'category': news_row['Category'],
            'subcategory': news_row['SubCategory'],
            'title_seq': title_seq,
            'abstract_seq': abstract_seq,
            'category_idx': category_idx,
            'subcategory_idx': subcategory_idx
        })
    
    # Batch inference for efficiency
    print("Computing recommendation scores...")
    batch_size = 128
    scores = []
    
    with torch.no_grad():
        for i in range(0, len(candidates), batch_size):
            batch = candidates[i:i+batch_size]
            batch_size_actual = len(batch)
            
            # Prepare batch tensors
            user_idxs = torch.tensor([user_idx] * batch_size_actual, dtype=torch.long, device=device)
            title_seqs = torch.tensor([c['title_seq'] for c in batch], dtype=torch.long, device=device)
            abstract_seqs = torch.tensor([c['abstract_seq'] for c in batch], dtype=torch.long, device=device)
            category_idxs = torch.tensor([c['category_idx'] for c in batch], dtype=torch.long, device=device)
            subcategory_idxs = torch.tensor([c['subcategory_idx'] for c in batch], dtype=torch.long, device=device)
            
            # Get predictions
            outputs = model(user_idxs, title_seqs, abstract_seqs, category_idxs, subcategory_idxs)
            scores.extend(outputs.cpu().numpy())
    
    # Combine news with their scores and sort by score (descending)
    news_with_scores = []
    for i, candidate in enumerate(candidates):
        news_with_scores.append({
            'news_id': candidate['news_id'],
            'title': candidate['title'],
            'category': candidate['category'],
            'subcategory': candidate['subcategory'],
            'score': scores[i]
        })
    
    # Sort by score in descending order
    news_with_scores.sort(key=lambda x: x['score'], reverse=True)
    
    # Display top recommendations
    print(f"\n" + "="*80)
    print(f"TOP {top_k} NEWS RECOMMENDATIONS FOR USER: {user_id}")
    print("="*80)
    
    for i, news in enumerate(news_with_scores[:top_k], 1):
        print(f"{i:2d}. Score: {news['score']:.4f} | {news['category']} > {news['subcategory']}")
        print(f"    News ID: {news['news_id']}")
        print(f"    Title: {news['title']}")
        print("-" * 80)
    
    return news_with_scores[:top_k]

# Example: Get recommendations for a sample user
print("Available users (first 10):")
sample_users = [u for u in uid2index.keys() if isinstance(u, str) and u.strip() and u.lower() != 'user_id'][:10]
for i, user in enumerate(sample_users, 1):
    print(f"{i:2d}. {user}")

# Use the first user as an example
sample_user_id = sample_users[0]
print(f"\nGenerating recommendations for User ID: {sample_user_id}")

# Get recommendations
recommendations = recommend_for_user(
    user_id=sample_user_id, 
    model_path='cnn_model.pth', 
    news_data=news_data, 
    word_dict=word_dict, 
    category_dict=category_dict, 
    subcategory_dict=subcategory_dict, 
    uid2index=uid2index, 
    device=device, 
    top_k=10
)

Available users (first 10):
 1. U91836
 2. U19739
 3. U89744
 4. U29155
 5. U70879
 6. U9306
 7. U49572
 8. U40466
 9. U39029
10. U22930

Generating recommendations for User ID: U91836
Loading model from cnn_model.pth...
Model loaded successfully! Best validation AUC: 0.6374
Preparing candidate news articles...
Model loaded successfully! Best validation AUC: 0.6374
Preparing candidate news articles...
Computing recommendation scores...
Computing recommendation scores...

TOP 10 NEWS RECOMMENDATIONS FOR USER: U91836
 1. Score: 0.8961 | music > musicnews
    News ID: N49279
    Title: Broadway Actress Laurel Griggs Dies at Age 13
--------------------------------------------------------------------------------
 2. Score: 0.8842 | travel > travelnews
    News ID: N8373
    Title: Video shows a Boeing 737 plane carrying 196 people burst into flames just after landing at an Egyptian airport
--------------------------------------------------------------------------------
 3. Score: 0.8522 | m