In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
from collections import deque
import random

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Load and process data
def load_data(data_dir='MINDsmall_train', sample_frac=0.01, random_state=42):
    """Load the MIND news and behaviors tables.

    Both files are read as headerless, tab-separated text, so the column
    names are assigned here.

    Args:
        data_dir: Directory that contains ``news.tsv`` and ``behaviors.tsv``.
        sample_frac: Fraction of behaviors rows to keep (forwarded to
            ``DataFrame.sample``). Pass ``None`` to keep every row.
        random_state: Seed for the row sample so reruns select the same rows.

    Returns:
        Tuple ``(news_df, behaviors_df)``.
    """
    col_news = ['NewsId', 'Category', 'SubCat', 'Title', 'Abstract', 'url', 'TitleEnt', 'AbstractEnt']
    col_behaviors = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']

    news_df = pd.read_csv(f'{data_dir}/news.tsv', sep='\t', header=None, names=col_news)
    behaviors_df = pd.read_csv(f'{data_dir}/behaviors.tsv', sep='\t', header=None, names=col_behaviors)

    # Only the behaviors table is subsampled; the news table is returned whole.
    if sample_frac is not None:
        behaviors_df = behaviors_df.sample(frac=sample_frac, random_state=random_state)

    return news_df, behaviors_df

In [3]:
# Load the full (unsampled) news and behaviors tables.
# Both files are tab-separated with no header row, so column names are supplied here.
col_news = ['NewsId', 'Category', 'SubCat', 'Title', 'Abstract', 'url', 'TitleEnt', 'AbstractEnt']
col_behaviors = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']

news_df = pd.read_table('MINDsmall_train/news.tsv', header=None, names=col_news)
behaviors_df = pd.read_table('MINDsmall_train/behaviors.tsv', header=None, names=col_behaviors)

In [None]:
# Preview the first rows of the news table
news_df.head()

In [4]:
# (rows, columns) of the news table
news_df.shape

(51282, 8)

In [6]:
# Number of distinct values in the Category column
print(news_df['Category'].nunique())

17


In [7]:
# Number of distinct values in the SubCat column
print(news_df['SubCat'].nunique())

264


In [8]:
# Number of distinct users in the behaviors table
print(behaviors_df['UserID'].nunique())

50000


In [10]:
# Article counts per category and per subcategory.
# Only the category counts are printed here; subcat_counts is iterated in a later cell.
cat_counts = news_df['Category'].value_counts()
subcat_counts = news_df['SubCat'].value_counts()
print(cat_counts)


Category
news             15774
sports           14510
finance           3107
foodanddrink      2551
lifestyle         2479
travel            2350
video             2068
weather           2048
health            1885
autos             1639
tv                 889
music              769
movies             606
entertainment      587
kids                17
middleeast           2
northamerica         1
Name: count, dtype: int64


In [12]:
# One line per subcategory, with thousands separators in the count.
# Relies on subcat_counts from the category-count cell above.
for subcat, count in subcat_counts.items():
    print(f"{subcat}: {count:,} articles")

newsus: 6,564 articles
football_nfl: 5,420 articles
newspolitics: 2,826 articles
newscrime: 2,254 articles
weathertopstories: 2,047 articles
newsworld: 1,720 articles
football_ncaa: 1,665 articles
baseball_mlb: 1,661 articles
basketball_nba: 1,555 articles
newsscienceandtechnology: 1,210 articles
news: 1,185 articles
newstrends: 1,176 articles
more_sports: 1,065 articles
travelarticle: 1,042 articles
travelnews: 902 articles
lifestylebuzz: 894 articles
autosnews: 837 articles
basketball_ncaa: 774 articles
financenews: 697 articles
finance-real-estate: 584 articles
finance-companies: 567 articles
icehockey_nhl: 531 articles
medical: 479 articles
recipes: 463 articles
health-news: 459 articles
golf: 446 articles
mma: 437 articles
musicnews: 414 articles
markets: 410 articles
newsoffbeat: 405 articles
tvnews: 377 articles
tv-celebrity: 356 articles
wellness: 340 articles
newsopinion: 315 articles
racing: 299 articles
foodnews: 271 articles
animals: 259 articles
tipsandtricks: 251 articles

In [None]:
# Preview the first rows of the behaviors table
behaviors_df.head()

In [5]:
# (rows, columns) of the behaviors table
behaviors_df.shape

(156965, 5)

In [7]:
def process_impressions(behaviors_df, news_df):
    """Convert behaviors data into a class-balanced table of user-news pairs.

    Every ``<news_id>-<click>`` token in the ``Impressions`` column becomes one
    row, provided the news id appears in ``news_df['NewsId']``. The two click
    classes are then downsampled (seed 42) to the size of the smaller one.

    Args:
        behaviors_df: Frame with ``UserID``, ``Time``, ``History`` and
            ``Impressions`` columns. ``History`` may be missing (NaN/None).
        news_df: Frame with a ``NewsId`` column.

    Returns:
        DataFrame with columns ``user_id``, ``news_id``, ``click``,
        ``history_len`` and ``time``: clicked rows first, then the sampled
        non-clicked rows. Empty when either class has no rows.
    """
    columns = ['user_id', 'news_id', 'click', 'history_len', 'time']

    # Build the lookup once: an `in` test against the NumPy array inside the
    # loop rescans every article for every single impression.
    known_news = set(news_df['NewsId'])

    interactions = []
    for user_id, time, history, impressions in zip(
        behaviors_df['UserID'],
        behaviors_df['Time'],
        behaviors_df['History'],
        behaviors_df['Impressions'],
    ):
        history_len = len(history.split()) if pd.notna(history) else 0

        for impression in impressions.split():
            # Split from the right so only the trailing click label is
            # separated from the news id.
            news_id, click = impression.rsplit('-', 1)

            # Only include if news exists in news_df
            if news_id in known_news:
                interactions.append({
                    'user_id': user_id,
                    'news_id': news_id,
                    'click': int(click),
                    'history_len': history_len,
                    'time': time
                })

    # Passing `columns` keeps the schema intact even when nothing matched.
    interactions_df = pd.DataFrame(interactions, columns=columns)

    clicks = interactions_df[interactions_df['click'] == 1]
    no_clicks = interactions_df[interactions_df['click'] == 0]

    # Balance on the smaller class. Sampling len(clicks) rows unconditionally
    # raises when the non-clicked class is the smaller one.
    n_per_class = min(len(clicks), len(no_clicks))
    if n_per_class == 0:
        balanced_df = interactions_df.iloc[0:0]
    else:
        if len(clicks) > n_per_class:
            clicks = clicks.sample(n=n_per_class, random_state=42)
        no_clicks_downsampled = no_clicks.sample(n=n_per_class, random_state=42)
        balanced_df = pd.concat([clicks, no_clicks_downsampled])

    print(f"Original size: {len(interactions_df)}, Balanced size: {len(balanced_df)}")
    print("Class distribution after balancing:")
    print(balanced_df['click'].value_counts())

    return balanced_df

def extract_time_features(time_str):
    """Derive hour and weekday features from a timestamp string.

    Args:
        time_str: Timestamp formatted as ``'%m/%d/%Y %I:%M:%S %p'``
            (12-hour clock with AM/PM).

    Returns:
        Dict with ``hour`` (0-23), ``day_of_week`` (Monday is 0) and
        ``is_weekend`` (1 for Saturday/Sunday, otherwise 0).
    """
    parsed = datetime.strptime(time_str, '%m/%d/%Y %I:%M:%S %p')
    weekday = parsed.weekday()

    features = {'hour': parsed.hour, 'day_of_week': weekday}
    # weekday() numbers Saturday and Sunday as 5 and 6
    features['is_weekend'] = int(weekday >= 5)
    return features

def prepare_features(interactions_df, news_df):
    """Join interactions with one-hot news features and add time features.

    Args:
        interactions_df: Frame with at least ``news_id`` and ``time`` columns;
            ``time`` uses the ``'%m/%d/%Y %I:%M:%S %p'`` format.
        news_df: Frame with ``NewsId``, ``Category`` and ``SubCat`` columns.
            It is not modified.

    Returns:
        ``interactions_df`` inner-joined with the news features (``NewsId``
        plus ``cat_*`` and ``subcat_*`` indicator columns), followed by
        ``hour``, ``day_of_week`` (Monday is 0) and ``is_weekend`` columns.
        Interactions whose ``news_id`` has no match are dropped by the join.
    """
    # One-hot encode category and subcategory
    category_ohe = pd.get_dummies(news_df['Category'], prefix='cat')
    subcategory_ohe = pd.get_dummies(news_df['SubCat'], prefix='subcat')

    # Carry only the join key and the indicator columns into the merge
    news_features = pd.concat([news_df[['NewsId']], category_ohe, subcategory_ohe], axis=1)

    features_df = interactions_df.merge(
        news_features,
        left_on='news_id',
        right_on='NewsId'
    )

    # Parse the whole column at once. Building a Series per row with
    # .apply(pd.Series) is far slower and creates no columns on empty input.
    timestamps = pd.to_datetime(features_df['time'], format='%m/%d/%Y %I:%M:%S %p')
    day_of_week = timestamps.dt.dayofweek

    return features_df.assign(
        hour=timestamps.dt.hour.astype('int64'),
        day_of_week=day_of_week.astype('int64'),
        is_weekend=(day_of_week >= 5).astype('int64'),
    )

In [None]:
# Build the interaction table. process_impressions returns the class-balanced
# sample (not every impression) and prints the before/after sizes.
interactions_df = process_impressions(behaviors_df, news_df)
interactions_df.head()


In [None]:
# (rows, columns) of the balanced interaction table
interactions_df.shape

In [None]:
# Attach one-hot news features and time features to each interaction.
# NOTE(review): a later cell redefines prepare_features with a (context, news_feat)
# signature, so this cell only works before that cell has been run.
features_df = prepare_features(interactions_df, news_df)
features_df.head()



In [None]:
# (rows, columns) of the merged feature table
features_df.shape

In [12]:
# Feature groups, selected by column-name prefix.
# Core context features come first, then the one-hot news indicators.
base_context_cols = ['history_len', 'hour', 'day_of_week', 'is_weekend']
one_hot_cols = [col for col in features_df.columns if col.startswith(('cat_', 'subcat_'))]

# NOTE(review): no visible cell creates 'hist_emb_' columns, so this list is
# presumably empty unless an earlier notebook version added them.
history_emb_cols = [col for col in features_df.columns if col.startswith('hist_emb_')]

# Design matrix and binary click target
X = features_df[base_context_cols + one_hot_cols + history_emb_cols]
y = features_df['click']

In [13]:
# X = features_df[[
#         'history_len',
#         'category_encoded',
#         'subcategory_encoded',
#         'hour',
#         'day_of_week',
#         'is_weekend'
#     ]]
# y = features_df['click']

In [None]:
# (rows, feature columns) of the design matrix
X.shape

In [None]:
# Length of the click target; should match the row count of X
y.shape

In [16]:
# Row-level 80/20 split with a fixed seed. Rows are individual user-news pairs,
# so pairs from the same user can fall on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

In [17]:
# Fit the scaler on the training split only, then reuse it for the test split.
# NOTE: the name `scaler` is rebound by the bandit and DQN sections further down.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Baseline classifier with default hyperparameters.
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)
# Negated log loss on the *training* split; no visible cell reads this value.
lr_losses = -log_loss(y_train, lr_model.predict_proba(X_train_scaled))

In [None]:
# Hard class predictions for the held-out split
y_pred = lr_model.predict(X_test_scaled)

# Score the predictions with each metric, keeping the display order
results = {
    name: score_fn(y_test, y_pred)
    for name, score_fn in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
}

print("LR Model Performance:")
for metric, value in results.items():
    print(f"{metric}: {value:.4f}")

In [20]:
# Reload both tables for the bandit experiments, keeping a 1% sample of behaviors.
# load_data() performs these same two reads and the same
# sample(frac=0.01, random_state=42), so call it instead of repeating them inline.
news_df, behaviors_df = load_data()

In [21]:
def process_impressions_for_bandit(behaviors_df, news_df):
    """Turn behaviors rows into bandit rounds: one context plus a slate of articles.

    Args:
        behaviors_df: Frame with ``UserID``, ``Time``, ``History`` and
            ``Impressions`` columns.
        news_df: Frame with ``NewsId``, ``Category`` and ``SubCat`` columns.

    Returns:
        Tuple ``(bandit_data, news_features)``. ``bandit_data`` is a list of
        dicts with keys ``user_id``, ``context``, ``slate``, ``rewards`` and
        ``news_features`` (one record per slate article). ``news_features`` is
        the one-hot table indexed by ``NewsId``. Rows whose impressions contain
        no known article are skipped.
    """
    # One-hot table for every article, keyed by news id
    one_hot_blocks = [
        pd.get_dummies(news_df['Category'], prefix='cat'),
        pd.get_dummies(news_df['SubCat'], prefix='subcat'),
    ]
    news_features = pd.concat(
        [news_df[['NewsId']], *one_hot_blocks], axis=1
    ).set_index('NewsId')

    bandit_data = []

    for _, row in behaviors_df.iterrows():
        # Context: history length plus time-of-impression features
        when = datetime.strptime(row['Time'], '%m/%d/%Y %I:%M:%S %p')
        weekday = when.weekday()
        history = row['History']
        context = {
            'history_len': 0 if pd.isna(history) else len(history.split()),
            'hour': when.hour,
            'day_of_week': weekday,
            'is_weekend': int(weekday >= 5)
        }

        # Keep only impressions whose article is known, preserving slate order
        pairs = [imp.split('-') for imp in row['Impressions'].split()]
        known = [
            (news_id, int(click))
            for news_id, click in pairs
            if news_id in news_features.index
        ]
        if not known:
            continue

        slate = [news_id for news_id, _ in known]
        rewards = [click for _, click in known]

        bandit_data.append({
            'user_id': row['UserID'],
            'context': context,
            'slate': slate,
            'rewards': rewards,
            'news_features': news_features.loc[slate].to_dict('records')
        })

    return bandit_data, news_features

# Process data: one bandit round per behaviors row, plus the one-hot news table.
# Uses whichever news_df / behaviors_df are currently bound in the notebook.
bandit_data, news_features = process_impressions_for_bandit(behaviors_df, news_df)

In [None]:
def prepare_features_for_bandit(context, news_feat):
    """
    Build one LinUCB feature vector from a context and an article's features.

    Args:
        context: dictionary with context features (history_len, hour, day_of_week, is_weekend)
        news_feat: dictionary with pre-encoded news features from bandit data;
            a 'NewsId' entry, if present, is ignored

    Returns:
        1-D array: the four context values followed by the news feature
        values (cast to float) in the dictionary's iteration order.
    """
    context_keys = ('history_len', 'hour', 'day_of_week', 'is_weekend')
    context_part = np.array([context[key] for key in context_keys])

    # The news indicators are already one-hot encoded; just cast them to float
    news_values = [value for name, value in news_feat.items() if name != 'NewsId']
    news_part = np.array(news_values, dtype=float)

    return np.concatenate([context_part, news_part])

# Create and fit the scaler
# NOTE: this rebinds the global `scaler` used earlier for logistic regression.
scaler = StandardScaler()

# Build the feature vector of every (context, article) pair in bandit_data.
# The scaler therefore sees all rounds, including those that the later
# train/test split assigns to the test portion.
all_feature_vectors = []
for interaction in bandit_data:
    context = interaction['context']
    for news_feat in interaction['news_features']:
        features = prepare_features_for_bandit(context, news_feat)
        all_feature_vectors.append(features)

# Fit the scaler on all observed (context, article) combinations
all_feature_vectors = np.array(all_feature_vectors)
scaler.fit(all_feature_vectors)

In [None]:
# First, create a scaler for the combined feature space
# NOTE(review): this cell repeats the scaler-fitting code from the end of the
# previous cell line for line; running both refits the same scaler on the same data.
scaler = StandardScaler()

# Build the feature vector of every (context, article) pair in bandit_data
all_feature_vectors = []
for interaction in bandit_data:
    context = interaction['context']
    for news_feat in interaction['news_features']:
        # Combine context and news features
        features = prepare_features_for_bandit(context, news_feat)
        all_feature_vectors.append(features)

# Fit the scaler on all observed (context, article) combinations
all_feature_vectors = np.array(all_feature_vectors)
scaler.fit(all_feature_vectors)

In [None]:
# (number of context-article pairs, feature dimension)
all_feature_vectors.shape

In [None]:
# Inspect the first bandit round (context, slate, rewards, per-article features)
bandit_data[0]

In [None]:
# Preview the one-hot news feature table (indexed by NewsId)
news_features.head()

In [None]:
# (articles, one-hot columns)
news_features.shape

In [32]:
class LinUCB:
    """Linear UCB bandit that shares one linear model across all actions.

    State is the d x d matrix ``A`` (identity plus the sum of outer products
    of chosen feature vectors) and the vector ``b`` (reward-weighted sum of
    chosen feature vectors). Regret is tracked against the caller-supplied
    optimal reward.
    """

    def __init__(self, d, alpha=1.0):
        """
        d: dimension of feature vectors
        alpha: exploration parameter (multiplier on the confidence width)
        """
        self.d, self.alpha = d, alpha

        # Model state: A starts as the identity, b and theta as zeros
        self.A = np.identity(d)
        self.b = np.zeros(d)
        self.theta = np.zeros(d)

        # Regret bookkeeping
        self.total_regret = 0
        self.cumulative_regret = []

    def get_action(self, context_features, actions_features_scaled):
        """
        Pick the action with the highest upper confidence bound.

        context_features: accepted for interface compatibility; not used here
        actions_features_scaled: list of pre-scaled feature vectors

        Returns the index of the chosen vector. Also refreshes ``self.theta``.
        """
        A_inv = np.linalg.inv(self.A)
        self.theta = A_inv.dot(self.b)

        def upper_bound(x):
            # Point estimate plus alpha times the confidence width
            estimate = x.dot(self.theta)
            width = np.sqrt(x.dot(A_inv).dot(x))
            return float(estimate + self.alpha * width)

        return np.argmax([upper_bound(x) for x in actions_features_scaled])

    def update(self, features_scaled, reward, optimal_reward):
        """Fold one observed (features, reward) pair into the model and log regret."""
        self.A += np.outer(features_scaled, features_scaled)
        self.b += features_scaled * reward

        # Regret of this round relative to the best reward available in the slate
        self.total_regret += optimal_reward - reward
        self.cumulative_regret.append(self.total_regret)

In [29]:
def prepare_features(context, news_feat):
    """Combine context and news features into a single vector.

    NOTE(review): this definition shadows the earlier
    ``prepare_features(interactions_df, news_df)`` in the same notebook and
    mirrors ``prepare_features_for_bandit`` except that the news values are
    not cast to float here.

    Args:
        context: dict with ``history_len``, ``hour``, ``day_of_week`` and
            ``is_weekend``.
        news_feat: dict of news feature values; a ``NewsId`` entry is ignored.

    Returns:
        1-D array: the four context values followed by the news values.
    """
    context_keys = ('history_len', 'hour', 'day_of_week', 'is_weekend')
    context_vec = np.array([context[key] for key in context_keys])

    # Everything except the identifier is treated as a feature
    news_values = [value for key, value in news_feat.items() if key != 'NewsId']

    return np.concatenate([context_vec, np.array(news_values)])

def evaluate_linucb(model, eval_data):
    """Evaluate LinUCB on held-out rounds using unscaled features.

    For each round the model picks one article from the slate; the round
    counts as correct when that article's reward is 1.

    Args:
        model: Object exposing ``get_action(context_features, action_features)``
            that returns an index into the slate.
        eval_data: Iterable of round dicts with ``context``, ``rewards`` and
            ``news_features`` keys.

    Returns:
        Fraction of rounds in which the chosen article was clicked, or ``0.0``
        when ``eval_data`` is empty (previously a ZeroDivisionError).
    """
    correct = 0
    total = 0

    for interaction in eval_data:
        context = interaction['context']
        rewards = interaction['rewards']

        # One feature vector per article in the slate
        action_features = [
            prepare_features(context, news_feat)
            for news_feat in interaction['news_features']
        ]

        pred_idx = model.get_action(
            list(context.values()),
            action_features
        )

        # Check if prediction matches a clicked article
        correct += int(rewards[pred_idx] == 1)
        total += 1

    return correct / total if total else 0.0

In [30]:
def evaluate_linucb_scaled(model, eval_data, scaler):
    """Evaluate LinUCB on held-out rounds with scaled features.

    Args:
        model: Object exposing ``get_action(context_features, action_features)``
            that returns an index into the slate.
        eval_data: Iterable of round dicts with ``context``, ``rewards`` and
            ``news_features`` keys.
        scaler: Fitted transformer with a ``transform`` method that accepts a
            2-D array of feature vectors.

    Returns:
        Fraction of rounds in which the chosen article was clicked, or ``0.0``
        when ``eval_data`` is empty (previously a ZeroDivisionError).
    """
    correct = 0
    total = 0

    for interaction in eval_data:
        context = interaction['context']
        rewards = interaction['rewards']

        # Scale the whole slate in one transform call instead of one call per
        # article; scaling is per-feature, so the values are unchanged.
        raw_features = np.vstack([
            prepare_features(context, news_feat)
            for news_feat in interaction['news_features']
        ])
        action_features_scaled = list(scaler.transform(raw_features))

        pred_idx = model.get_action(
            list(context.values()),
            action_features_scaled
        )

        # Check if prediction matches a clicked article
        correct += int(rewards[pred_idx] == 1)
        total += 1

    return correct / total if total else 0.0

In [None]:
# Split data into train and test
# The split is a plain slice (first 80% / last 20%) with no shuffling here.
# NOTE(review): nothing visible in this cell draws from NumPy's RNG, so the seed
# below has no apparent effect on the split or on LinUCB.
np.random.seed(42)
train_size = int(0.8 * len(bandit_data))
train_data = bandit_data[:train_size]
test_data = bandit_data[train_size:]

# Initialize LinUCB
d = 4 + len(news_features.columns)  # 4 context features + one-hot features
bandit_model = LinUCB(d=d, alpha=1.0)

# Train the model with scaled features.
# Relies on the global `scaler` already fitted on the bandit feature vectors.
for interaction in train_data:
    context = interaction['context']
    rewards = interaction['rewards']
    
    # Prepare and scale features for each article
    action_features_scaled = []
    for news_feat in interaction['news_features']:
        # Combine context and news features
        features = prepare_features(context, news_feat)
        # Scale features
        features_scaled = scaler.transform(features.reshape(1, -1))[0]
        action_features_scaled.append(features_scaled)
    
    # Get model prediction
    chosen_idx = bandit_model.get_action(
        list(context.values()),
        action_features_scaled
    )
    
    # Best reward available in this slate; used only for regret tracking
    optimal_reward = max(rewards)
    # Update model with scaled features.
    # Only the chosen article's logged reward is used for the update.
    bandit_model.update(
        action_features_scaled[chosen_idx],
        rewards[chosen_idx],
        optimal_reward
    )

# Evaluate model: share of test rounds where the chosen article was clicked
test_accuracy = evaluate_linucb_scaled(bandit_model, test_data, scaler)
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Calculate and print random baseline
avg_slate_size = np.mean([len(interaction['slate']) for interaction in bandit_data])
random_baseline = 1/avg_slate_size
print(f"Random baseline (1/avg_slate_size): {random_baseline:.4f}")

# 1/avg_slate_size is only a rough guide to the reported accuracy metric:
# slates differ in size and can hold more than one click. The accuracy a
# uniform random pick would actually score is the mean, over rounds, of
# (clicked articles / slate size).
random_expected_accuracy = np.mean([
    sum(interaction['rewards']) / len(interaction['slate'])
    for interaction in bandit_data
])
print(f"Expected accuracy of a uniform random pick: {random_expected_accuracy:.4f}")

In [18]:
# Set random seeds for reproducibility
# All three generators are seeded: torch, NumPy, and Python's `random`
# (the DQN agent and replay buffer draw from `random`).
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

In [19]:
class DQNetwork(nn.Module):
    """Feed-forward network mapping one feature vector to a single Q-value.

    Architecture: input_dim -> 128 -> 64 -> 1 with ReLU between the linear
    layers.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),  # Single output for Q-value
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-value(s) for ``x``; the last dimension of the output is 1."""
        q_value = self.network(x)
        return q_value

In [20]:
class ReplayBuffer:
    """Fixed-capacity store of transitions; the oldest entries are dropped first."""

    def __init__(self, capacity):
        # A bounded deque discards from the left once `capacity` is reached
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state):
        """Append one (state, action, reward, next_state) transition."""
        transition = (state, action, reward, next_state)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [21]:
class DQNAgent:
    """Epsilon-greedy agent that scores each slate article with a Q-network.

    Holds a policy network, a periodically synchronised target network, an
    Adam optimizer, and a replay buffer of (state, action, reward, next_state)
    transitions.
    """

    def __init__(self, state_dim, device='cuda' if torch.cuda.is_available() else 'cpu'):
        """
        state_dim: length of one article feature vector
        device: torch device string; defaults to CUDA when available
        """
        self.device = device
        self.state_dim = state_dim

        # Networks: the target network starts as a copy of the policy network
        self.policy_net = DQNetwork(state_dim).to(device)
        self.target_net = DQNetwork(state_dim).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        # Training parameters
        self.optimizer = optim.Adam(self.policy_net.parameters())
        self.criterion = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(10000)

        # Hyperparameters
        self.batch_size = 32
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.target_update = 10  # sync the target network every N updates
        self.steps = 0

    def _to_tensor(self, values):
        """Convert a sequence of numbers/arrays to a float32 tensor on the agent's device."""
        # Go through one NumPy array first: building a tensor straight from a
        # sequence of ndarrays takes PyTorch's slow element-by-element path.
        return torch.as_tensor(np.asarray(values), dtype=torch.float32, device=self.device)

    def select_action(self, state_features_list):
        """
        Select action using epsilon-greedy policy
        state_features_list: list of feature vectors for each article in slate

        Returns the index of the chosen article.
        """
        if random.random() < self.epsilon:
            return random.randrange(len(state_features_list))

        # Score the whole slate in one forward pass
        with torch.no_grad():
            q_values = self.policy_net(self._to_tensor(state_features_list)).reshape(-1)
        return int(np.argmax(q_values.cpu().numpy()))

    def update(self, batch_size):
        """Run one optimisation step on a batch sampled from the replay buffer.

        Returns the batch loss as a float, or ``None`` when the buffer holds
        fewer than ``batch_size`` transitions. Each successful call also
        decays epsilon and, every ``target_update`` calls, syncs the target
        network.

        NOTE(review): train_dqn stores ``next_state`` identical to ``state``
        and labels the transition terminal, yet the target below still adds
        ``gamma * Q_target(next_state)`` — confirm the bootstrap is intended.
        """
        if len(self.replay_buffer) < batch_size:
            return None

        # Sample batch; the stored action index is not needed because the
        # network emits a single Q-value per state vector.
        transitions = self.replay_buffer.sample(batch_size)
        states, _actions, rewards, next_states = zip(*transitions)

        states = self._to_tensor(states)
        rewards = self._to_tensor(rewards).unsqueeze(1)
        next_states = self._to_tensor(next_states)

        # Compute Q(s_t, a)
        current_q_values = self.policy_net(states)

        # Compute V(s_{t+1}) with the target network
        with torch.no_grad():
            next_q_values = self.target_net(next_states)

        # Compute expected Q values
        expected_q_values = rewards + (self.gamma * next_q_values)

        # Compute loss
        loss = self.criterion(current_q_values, expected_q_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.steps += 1
        if self.steps % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

        # Decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return loss.item()


In [22]:
def process_data_for_dqn(behaviors_df, news_df):
    """Turn behaviors rows into DQN training rounds.

    Args:
        behaviors_df: Frame with ``Time``, ``History`` and ``Impressions``
            columns.
        news_df: Frame with ``NewsId``, ``Category`` and ``SubCat`` columns.

    Returns:
        Tuple ``(interactions, news_features)``. ``interactions`` is a list of
        dicts with keys ``context``, ``slate``, ``rewards`` and
        ``news_features`` (one record per slate article). ``news_features`` is
        the one-hot table indexed by ``NewsId``. Rows with no known article in
        their impressions are left out.
    """
    # One-hot encode categories and key the table by news id
    category_ohe = pd.get_dummies(news_df['Category'], prefix='cat')
    subcategory_ohe = pd.get_dummies(news_df['SubCat'], prefix='subcat')
    news_features = pd.concat(
        [news_df[['NewsId']], category_ohe, subcategory_ohe],
        axis=1,
    ).set_index('NewsId')

    interactions = []

    row_fields = zip(
        behaviors_df['Time'],
        behaviors_df['History'],
        behaviors_df['Impressions'],
    )
    for time_str, history, impressions in row_fields:
        # Context features: history length and time of the impression
        stamp = datetime.strptime(time_str, '%m/%d/%Y %I:%M:%S %p')
        history_len = len(history.split()) if pd.notna(history) else 0
        context = {
            'history_len': history_len,
            'hour': stamp.hour,
            'day_of_week': stamp.weekday(),
            'is_weekend': 1 if stamp.weekday() >= 5 else 0
        }

        # Walk the impressions, keeping articles that have features
        slate, rewards = [], []
        for token in impressions.split():
            news_id, click = token.split('-')
            if news_id not in news_features.index:
                continue
            slate.append(news_id)
            rewards.append(int(click))

        if not slate:
            continue

        interactions.append({
            'context': context,
            'slate': slate,
            'rewards': rewards,
            'news_features': news_features.loc[slate].to_dict('records')
        })

    return interactions, news_features


In [23]:

def prepare_state_features(context, news_feat):
    """Build the state vector for one (context, article) pair.

    Args:
        context: dict with ``history_len``, ``hour``, ``day_of_week`` and
            ``is_weekend``.
        news_feat: dict of news feature values; a ``NewsId`` entry is skipped.

    Returns:
        1-D array: the four context values followed by the news values.
    """
    context_keys = ('history_len', 'hour', 'day_of_week', 'is_weekend')
    context_part = np.array([context[key] for key in context_keys])

    news_part = np.array([news_feat[key] for key in news_feat if key != 'NewsId'])
    return np.concatenate((context_part, news_part))

def train_dqn(agent, train_data, scaler, num_epochs=5):
    """Train the DQN agent on logged rounds.

    For every round the agent picks one article, receives that article's
    logged reward, stores the transition, and runs one update once the replay
    buffer holds at least ``agent.batch_size`` transitions.

    Args:
        agent: Agent exposing ``select_action``, ``update``, ``replay_buffer``,
            ``batch_size`` and ``epsilon``.
        train_data: Sequence of round dicts with ``context``, ``rewards`` and
            ``news_features`` keys; it is iterated once per epoch.
        scaler: Fitted transformer with a ``transform`` method that accepts a
            2-D array of feature vectors.
        num_epochs: Number of passes over ``train_data``.

    Returns:
        Tuple ``(training_losses, accuracies)`` with one entry per epoch.
        Accuracy is the share of rounds whose chosen article was clicked (it
        includes exploratory picks). An epoch with no rounds reports accuracy
        0.0 instead of raising ZeroDivisionError.
    """
    training_losses = []
    accuracies = []

    for epoch in range(num_epochs):
        correct_predictions = 0
        total_predictions = 0
        epoch_losses = []

        for interaction in train_data:
            context = interaction['context']
            rewards = interaction['rewards']

            # Scale the whole slate in one transform call instead of one call
            # per article; scaling is per-feature, so the values are unchanged.
            raw_features = np.vstack([
                prepare_state_features(context, news_feat)
                for news_feat in interaction['news_features']
            ])
            state_features = scaler.transform(raw_features)

            # Select action
            action = agent.select_action(list(state_features))

            # Get reward and update metrics
            reward = rewards[action]
            correct_predictions += int(reward == 1)
            total_predictions += 1

            # Store transition in replay buffer. Copy the row so the buffer
            # does not keep the entire slate matrix alive through a view.
            state = state_features[action].copy()
            next_state = state  # Terminal state, so same as current
            agent.replay_buffer.push(state, action, reward, next_state)

            # Update network
            if len(agent.replay_buffer) >= agent.batch_size:
                loss = agent.update(agent.batch_size)
                if loss is not None:
                    epoch_losses.append(loss)

        # Calculate metrics
        epoch_accuracy = correct_predictions / total_predictions if total_predictions else 0.0
        epoch_loss = np.mean(epoch_losses) if epoch_losses else 0

        accuracies.append(epoch_accuracy)
        training_losses.append(epoch_loss)

        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Accuracy: {epoch_accuracy:.4f}")
        print(f"Average Loss: {epoch_loss:.4f}")
        print(f"Epsilon: {agent.epsilon:.4f}")
        print("---")

    return training_losses, accuracies

def evaluate_dqn(agent, eval_data, scaler):
    """Evaluate the DQN agent greedily (no epsilon exploration).

    Args:
        agent: Agent exposing ``policy_net`` and ``device``.
        eval_data: Iterable of round dicts with ``context``, ``rewards`` and
            ``news_features`` keys.
        scaler: Fitted transformer with a ``transform`` method that accepts a
            2-D array of feature vectors.

    Returns:
        Fraction of rounds in which the highest-Q article was clicked, or
        ``0.0`` when ``eval_data`` is empty (previously a ZeroDivisionError).
        The policy network is switched back to training mode on exit, even if
        evaluation raises.
    """
    agent.policy_net.eval()
    correct = 0
    total = 0

    try:
        with torch.no_grad():
            for interaction in eval_data:
                context = interaction['context']
                rewards = interaction['rewards']

                # Scale and score the whole slate in one batch
                raw_features = np.vstack([
                    prepare_state_features(context, news_feat)
                    for news_feat in interaction['news_features']
                ])
                states = torch.as_tensor(
                    scaler.transform(raw_features),
                    dtype=torch.float32,
                    device=agent.device,
                )
                q_values = agent.policy_net(states).reshape(-1).cpu().numpy()

                # Greedy pick (no epsilon-greedy during evaluation)
                pred_idx = np.argmax(q_values)

                correct += int(rewards[pred_idx] == 1)
                total += 1
    finally:
        agent.policy_net.train()

    return correct / total if total else 0.0



In [24]:
# Load and process data
# load_data() rebinds the global news_df / behaviors_df (behaviors is sampled inside it).
news_df, behaviors_df = load_data()
# `news_features` is rebound here as well, replacing the bandit section's table.
interactions, news_features = process_data_for_dqn(behaviors_df, news_df)

In [None]:
# Create feature scaler
# Collect the state vector of every (context, article) pair across all rounds.
# This runs before the train/test split, so test rounds also inform the scaling.
all_features = []
for interaction in interactions:
    context = interaction['context']
    for news_feat in interaction['news_features']:
        features = prepare_state_features(context, news_feat)
        all_features.append(features)

# Rebinds the global `scaler` that the LinUCB section used.
scaler = StandardScaler()
scaler.fit(all_features)

In [26]:
# Split data
# Plain 80/20 slice in list order (no shuffling here); this rebinds
# train_data / test_data from the LinUCB section.
train_size = int(0.8 * len(interactions))
train_data = interactions[:train_size]
test_data = interactions[train_size:]

In [27]:
# Initialize DQN agent
# The input width is read from the first collected state vector, so
# all_features must be non-empty.
state_dim = len(all_features[0])  # 4 context features + one-hot features
agent = DQNAgent(state_dim)

In [None]:
# Train the agent
# Uses train_dqn's default of 5 epochs; per-epoch metrics are printed as it runs.
training_losses, accuracies = train_dqn(agent, train_data, scaler)

In [None]:
# Evaluate
# Greedy evaluation on the held-out slice: share of rounds where the top-Q article was clicked.
test_accuracy = evaluate_dqn(agent, test_data, scaler)
print(f"\nTest Accuracy: {test_accuracy:.4f}")

In [None]:
# Calculate random baseline
avg_slate_size = np.mean([len(interaction['slate']) for interaction in interactions])
random_baseline = 1/avg_slate_size
print(f"Random baseline (1/avg_slate_size): {random_baseline:.4f}")

# 1/avg_slate_size is only a rough guide to the reported accuracy metric:
# slates differ in size and can hold more than one click. The accuracy a
# uniform random pick would actually score is the mean, over rounds, of
# (clicked articles / slate size).
random_expected_accuracy = np.mean([
    sum(interaction['rewards']) / len(interaction['slate'])
    for interaction in interactions
])
print(f"Expected accuracy of a uniform random pick: {random_expected_accuracy:.4f}")