In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns

In [73]:
# Load the MIND-small training split: article metadata and impression logs.
# Both files are header-less TSVs, so the column names are supplied explicitly.
col_news = [
    'NewsId', 'Category', 'SubCat', 'Title',
    'Abstract', 'url', 'TitleEnt', 'AbstractEnt',
]
col_behaviors = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']

news_df = pd.read_csv('MINDsmall_train/news.tsv', names=col_news, header=None, sep='\t')

# Keep a reproducible 1% sample of the impression log for the rest of the notebook.
behaviors_df = pd.read_csv(
    'MINDsmall_train/behaviors.tsv', names=col_behaviors, header=None, sep='\t'
).sample(frac=0.01, random_state=42)

In [74]:
news_df.head()

Unnamed: 0,NewsId,Category,SubCat,Title,Abstract,url,TitleEnt,AbstractEnt
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [75]:
news_df.shape

(51282, 8)

In [76]:
behaviors_df.head()

Unnamed: 0,ImpressionID,UserID,Time,History,Impressions
78833,78834,U46778,11/12/2019 6:57:31 PM,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N60456-0 N19592-0 N35738-1 N3123-0 N12029-0 N3...
119940,119941,U19033,11/14/2019 12:43:30 PM,N1150 N53234 N45794 N11005 N55556,N50872-0 N19536-1
105873,105874,U30,11/9/2019 8:40:24 AM,N58642 N65000 N871 N24778 N11101 N9798 N54827 ...,N5889-0 N63583-1 N56396-0 N37190-0 N5957-0 N73...
36249,36250,U76070,11/10/2019 9:16:45 PM,N61864 N42946 N18693 N62703 N41792 N57888 N236...,N33981-0 N17104-1 N35388-0 N47933-0 N53907-0 N...
81978,81979,U54617,11/11/2019 8:16:50 PM,N24002 N13427 N36517 N33713 N19152 N31127 N167...,N2350-0 N44422-0 N23174-0 N25722-0 N55204-0 N7...


In [77]:
behaviors_df.shape

(1570, 5)

In [78]:
def process_impressions(behaviors_df, news_df):
    """Convert behaviors data into a class-balanced table of user-news interaction pairs.

    Each row of ``behaviors_df`` holds a whitespace-separated ``Impressions`` string of
    ``<news_id>-<click>`` tokens. Every token whose ``news_id`` appears in
    ``news_df['NewsId']`` becomes one output row.

    Args:
        behaviors_df: DataFrame with columns ``UserID``, ``Time``, ``History``
            (whitespace-separated news ids, may be NaN) and ``Impressions``.
        news_df: DataFrame with a ``NewsId`` column.

    Returns:
        DataFrame with columns ``user_id``, ``news_id``, ``click``, ``history_len`` and
        ``time`` containing the same number of clicked and non-clicked rows. The larger
        class is randomly downsampled (``random_state=42``) to the size of the smaller
        one. The frame is empty (but still has these columns) when no impression
        matches a known article.
    """
    output_columns = ['user_id', 'news_id', 'click', 'history_len', 'time']

    # Hash-based membership test; checking against the raw array would rescan every
    # article id for each impression.
    known_news_ids = set(news_df['NewsId'])

    interactions = []
    for _, row in behaviors_df.iterrows():
        user_id = row['UserID']
        history_len = len(row['History'].split()) if pd.notna(row['History']) else 0

        # Process each impression
        for impression in row['Impressions'].split():
            news_id, click = impression.split('-')

            # Only include if news exists in news_df
            if news_id in known_news_ids:
                interactions.append({
                    'user_id': user_id,
                    'news_id': news_id,
                    'click': int(click),
                    'history_len': history_len,
                    'time': row['Time']
                })

    # Passing the column list keeps the schema stable even when nothing matched.
    interactions_df = pd.DataFrame(interactions, columns=output_columns)

    # Separate clicks and no-clicks
    clicks = interactions_df[interactions_df['click'] == 1]
    no_clicks = interactions_df[interactions_df['click'] == 0]

    # Downsample whichever class is larger. In the usual case (no-clicks dominate)
    # this is exactly the original sampling call, so results are unchanged.
    if len(clicks) <= len(no_clicks):
        if len(clicks) > 0:
            no_clicks = no_clicks.sample(n=len(clicks), random_state=42)
        else:
            no_clicks = no_clicks.iloc[:0]
    else:
        if len(no_clicks) > 0:
            clicks = clicks.sample(n=len(no_clicks), random_state=42)
        else:
            clicks = clicks.iloc[:0]

    # Combine back (clicked rows first, as before)
    balanced_df = pd.concat([clicks, no_clicks])

    print(f"Original size: {len(interactions_df)}, Balanced size: {len(balanced_df)}")
    print("Class distribution after balancing:")
    print(balanced_df['click'].value_counts())

    return balanced_df

def extract_time_features(time_str):
    """Derive hour-of-day, weekday index and a weekend flag from a timestamp string.

    Args:
        time_str: timestamp formatted as '%m/%d/%Y %I:%M:%S %p',
            e.g. '11/12/2019 6:57:31 PM'.

    Returns:
        dict with keys 'hour' (0-23), 'day_of_week' (Monday == 0 ... Sunday == 6)
        and 'is_weekend' (1 for Saturday/Sunday, otherwise 0).
    """
    parsed = datetime.strptime(time_str, '%m/%d/%Y %I:%M:%S %p')
    weekday = parsed.weekday()

    features = {'hour': parsed.hour, 'day_of_week': weekday}
    features['is_weekend'] = int(weekday >= 5)
    return features

def prepare_features(interactions_df, news_df):
    """Attach one-hot article categories and timestamp features to each interaction.

    Args:
        interactions_df: DataFrame with at least 'news_id' and 'time' columns.
        news_df: DataFrame with 'NewsId', 'Category' and 'SubCat' columns. The
            caller's frame is not modified.

    Returns:
        DataFrame holding the interaction columns, 'NewsId', one 'cat_*' column per
        category, one 'subcat_*' column per subcategory, and the 'hour',
        'day_of_week' and 'is_weekend' columns produced by extract_time_features.
    """
    # Indicator columns for the article's category and subcategory.
    cat_dummies = pd.get_dummies(news_df['Category'], prefix='cat')
    subcat_dummies = pd.get_dummies(news_df['SubCat'], prefix='subcat')
    news_with_dummies = pd.concat([news_df, cat_dummies, subcat_dummies], axis=1)

    # Only the join key and the indicator columns are carried into the merge.
    wanted_cols = ['NewsId', *cat_dummies.columns, *subcat_dummies.columns]
    features_df = interactions_df.merge(
        news_with_dummies[wanted_cols],
        left_on='news_id',
        right_on='NewsId',
    )

    # Turn each timestamp into a feature dict, then spread the dict keys into columns.
    time_dicts = features_df['time'].apply(extract_time_features)
    time_features = time_dicts.apply(pd.Series)

    return pd.concat([features_df, time_features], axis=1)

In [79]:
# Build the click-balanced (user, article) interaction table from the sampled impression log.
interactions_df = process_impressions(behaviors_df, news_df)
interactions_df.head()


Original size: 58161, Balanced size: 4830
Class distribution after balancing:
click
1    2415
0    2415
Name: count, dtype: int64


Unnamed: 0,user_id,news_id,click,history_len,time
2,U46778,N35738,1,43,11/12/2019 6:57:31 PM
11,U19033,N19536,1,5,11/14/2019 12:43:30 PM
13,U30,N63583,1,9,11/9/2019 8:40:24 AM
50,U76070,N17104,1,17,11/10/2019 9:16:45 PM
90,U54617,N57535,1,11,11/11/2019 8:16:50 PM


In [80]:
interactions_df.shape

(4830, 5)

In [81]:
# NOTE: `prepare_features` is defined a second time further down in this notebook with a
# different signature (context, news_feat). Re-running this cell after that later
# definition has executed will call the wrong function.
features_df = prepare_features(interactions_df, news_df)
features_df.head()



Unnamed: 0,user_id,news_id,click,history_len,time,NewsId,cat_autos,cat_entertainment,cat_finance,cat_foodanddrink,...,subcat_weathertopstories,subcat_weight-loss,subcat_weightloss,subcat_wellness,subcat_wines,subcat_wonder,subcat_yearinoffbeatgoodnews,hour,day_of_week,is_weekend
0,U46778,N35738,1,43,11/12/2019 6:57:31 PM,N35738,False,False,False,True,...,False,False,False,False,False,False,False,18,1,0
1,U50203,N35738,1,16,11/12/2019 4:00:37 AM,N35738,False,False,False,True,...,False,False,False,False,False,False,False,4,1,0
2,U92576,N35738,0,23,11/12/2019 10:52:25 PM,N35738,False,False,False,True,...,False,False,False,False,False,False,False,22,1,0
3,U42526,N35738,0,74,11/11/2019 3:14:49 PM,N35738,False,False,False,True,...,False,False,False,False,False,False,False,15,0,0
4,U832,N35738,0,88,11/10/2019 7:15:37 PM,N35738,False,False,False,True,...,False,False,False,False,False,False,False,19,6,1


In [82]:
features_df.shape

(4830, 290)

In [83]:
# Core context features describing the impression itself
base_context_cols = ['history_len', 'hour', 'day_of_week', 'is_weekend']

# One-hot category/subcategory indicator columns
one_hot_cols = [col for col in features_df.columns if col.startswith(('cat_', 'subcat_'))]

# History embedding columns; the list is empty when no 'hist_emb_*' column exists
history_emb_cols = [col for col in features_df.columns if col.startswith('hist_emb_')]

# Final design matrix and binary click target
feature_cols = base_context_cols + one_hot_cols + history_emb_cols
X = features_df[feature_cols]
y = features_df['click']

In [84]:
# X = features_df[[
#         'history_len',
#         'category_encoded',
#         'subcategory_encoded',
#         'hour',
#         'day_of_week',
#         'is_weekend'
#     ]]
# y = features_df['click']

In [85]:
X.shape

(4830, 285)

In [86]:
y.shape

(4830,)

In [87]:
# Hold out 20% of the interactions for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [88]:
# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [89]:
# Logistic-regression baseline fitted on the standardized training features.
# NOTE: the name `model` is rebound to a LinUCB instance in a later cell.
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

In [90]:
# Score the held-out split with the fitted classifier.
y_pred = model.predict(X_test_scaled)

# Metric name -> scoring function; insertion order is the print order.
metric_fns = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}
results = {name: score_fn(y_test, y_pred) for name, score_fn in metric_fns.items()}

print("Model Performance:")
for metric, value in results.items():
    print(f"{metric}: {value:.4f}")

Model Performance:
accuracy: 0.5631
precision: 0.5524
recall: 0.6172
f1: 0.5830


In [123]:
def process_impressions_for_bandit(behaviors_df, news_df):
    """Turn the impression log into per-impression bandit records.

    Args:
        behaviors_df: DataFrame with 'UserID', 'Time' ('%m/%d/%Y %I:%M:%S %p'),
            'History' (whitespace-separated ids, may be NaN) and 'Impressions'
            (whitespace-separated '<news_id>-<click>' tokens).
        news_df: DataFrame with 'NewsId', 'Category' and 'SubCat' columns.

    Returns:
        (bandit_data, news_features) where ``news_features`` is the one-hot
        category/subcategory table indexed by NewsId, and ``bandit_data`` is a list
        with one dict per impression row that has at least one known article:
        'user_id', 'context' (history_len, hour, day_of_week, is_weekend), 'slate'
        (article ids), 'rewards' (0/1 clicks aligned with the slate) and
        'news_features' (one feature dict per slate article).
    """
    # One-hot article features, looked up by NewsId.
    feature_blocks = [
        news_df[['NewsId']],
        pd.get_dummies(news_df['Category'], prefix='cat'),
        pd.get_dummies(news_df['SubCat'], prefix='subcat'),
    ]
    news_features = pd.concat(feature_blocks, axis=1).set_index('NewsId')

    bandit_data = []
    needed = ('UserID', 'Time', 'History', 'Impressions')
    rows = zip(*(behaviors_df[name] for name in needed))

    for user_id, timestamp, history, impressions in rows:
        shown_at = datetime.strptime(timestamp, '%m/%d/%Y %I:%M:%S %p')
        weekday = shown_at.weekday()

        # Key order matters: callers read the context via list(context.values()).
        context = {
            'history_len': len(history.split()) if pd.notna(history) else 0,
            'hour': shown_at.hour,
            'day_of_week': weekday,
            'is_weekend': int(weekday >= 5),
        }

        slate, rewards = [], []
        for token in impressions.split():
            news_id, click = token.split('-')
            if news_id not in news_features.index:
                continue  # article missing from news_df
            slate.append(news_id)
            rewards.append(int(click))

        if not slate:
            continue  # nothing usable in this impression

        bandit_data.append({
            'user_id': user_id,
            'context': context,
            'slate': slate,
            'rewards': rewards,
            'news_features': news_features.loc[slate].to_dict('records'),
        })

    return bandit_data, news_features

# Build the per-impression bandit records and the NewsId-indexed one-hot feature table
# from the same sampled impression log used for the logistic-regression baseline.
bandit_data, news_features = process_impressions_for_bandit(behaviors_df, news_df)

In [124]:
bandit_data[0]

{'user_id': 'U46778',
 'context': {'history_len': 43, 'hour': 18, 'day_of_week': 1, 'is_weekend': 0},
 'slate': ['N60456',
  'N19592',
  'N35738',
  'N3123',
  'N12029',
  'N34579',
  'N4754',
  'N50415',
  'N7121',
  'N26262'],
 'rewards': [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 'news_features': [{'cat_autos': False,
   'cat_entertainment': False,
   'cat_finance': False,
   'cat_foodanddrink': False,
   'cat_health': False,
   'cat_kids': False,
   'cat_lifestyle': False,
   'cat_middleeast': False,
   'cat_movies': False,
   'cat_music': False,
   'cat_news': True,
   'cat_northamerica': False,
   'cat_sports': False,
   'cat_travel': False,
   'cat_tv': False,
   'cat_video': False,
   'cat_weather': False,
   'subcat_ads-latingrammys': False,
   'subcat_ads-lung-health': False,
   'subcat_advice': False,
   'subcat_animals': False,
   'subcat_autosbuying': False,
   'subcat_autoscartech': False,
   'subcat_autosclassics': False,
   'subcat_autoscompact': False,
   'subcat_autosenthusiast

In [125]:
news_features.head()

Unnamed: 0_level_0,cat_autos,cat_entertainment,cat_finance,cat_foodanddrink,cat_health,cat_kids,cat_lifestyle,cat_middleeast,cat_movies,cat_music,...,subcat_voices,subcat_watch,subcat_weatherfullscreenmaps,subcat_weathertopstories,subcat_weight-loss,subcat_weightloss,subcat_wellness,subcat_wines,subcat_wonder,subcat_yearinoffbeatgoodnews
NewsId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N55528,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
N19639,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
N61837,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
N53526,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
N38324,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [126]:
news_features.shape

(51282, 281)

In [127]:
class LinUCB:
    """Shared-parameter LinUCB: one linear reward model used for every action.

    State:
        A     (d, d) matrix, starts as the identity and accumulates x x^T.
        b     (d, 1) vector, accumulates reward * x.
        theta (d, 1) coefficient estimate A^-1 b, refreshed on each get_action call.
    """

    def __init__(self, d, alpha=1.0):
        """
        d: dimension of feature vectors
        alpha: exploration parameter (weight of the uncertainty bonus)
        """
        self.d = d
        self.alpha = alpha

        # Shared parameter version of LinUCB
        self.A = np.identity(d)  # A = I + sum of x x^T
        self.b = np.zeros((d, 1))  # b = sum of reward * x
        self.theta = np.zeros((d, 1))  # theta = A^-1 * b

    def get_action(self, context_features, actions_features):
        """
        Select the action with the highest upper confidence bound.

        context_features: accepted for backward compatibility but not used; each
            action vector is expected to already contain whatever context it needs.
        actions_features: iterable of action feature vectors, each with d entries
            (1-D or column-shaped).

        Returns the integer index of the chosen action (first index on ties).
        Raises ValueError when no actions are supplied.
        """
        candidates = list(actions_features)
        if not candidates:
            raise ValueError("actions_features must contain at least one action")

        A_inv = np.linalg.inv(self.A)
        self.theta = A_inv.dot(self.b)
        theta_flat = self.theta.ravel()

        # Compute UCB for each action. Working with 1-D float vectors makes mean and
        # variance true scalars, so no 1-element array is ever passed to float()
        # (NumPy >= 1.25 emits a DeprecationWarning for that).
        ucb_scores = []
        for x in candidates:
            x = np.asarray(x, dtype=float).reshape(-1)
            mu = float(x @ theta_flat)
            variance = float(x @ A_inv @ x)
            # Clamp tiny negative round-off before taking the square root.
            sigma = np.sqrt(max(variance, 0.0))
            ucb_scores.append(mu + self.alpha * sigma)

        # Return index of action with highest UCB
        return int(np.argmax(ucb_scores))

    def update(self, features, reward):
        """Update A and b with the feature vector that was played and its reward.

        theta is not recomputed here; it is refreshed by the next get_action call.
        """
        features = np.asarray(features, dtype=float).reshape(-1, 1)
        self.A += features.dot(features.T)
        self.b += features * reward

In [128]:
def prepare_features(context, news_feat):
    """Build one action vector: four context values followed by the article features.

    NOTE: this reuses the name of the DataFrame-based ``prepare_features`` defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        context: dict with 'history_len', 'hour', 'day_of_week' and 'is_weekend'.
        news_feat: dict of article features; a 'NewsId' entry, if present, is skipped.
            The remaining values are appended in the dict's iteration order.

    Returns:
        1-D NumPy array: the four context values, then the article feature values.
    """
    context_keys = ('history_len', 'hour', 'day_of_week', 'is_weekend')
    context_vec = np.array([context[key] for key in context_keys])

    # Everything except the identifier is treated as a numeric feature.
    news_vec = np.array([news_feat[key] for key in news_feat if key != 'NewsId'])

    return np.concatenate([context_vec, news_vec])

def evaluate_linucb(model, eval_data):
    """Measure how often the model's chosen article was actually clicked.

    Args:
        model: object exposing ``get_action(context_features, actions_features)``
            that returns an index into the slate.
        eval_data: iterable of records with 'context', 'rewards' and
            'news_features' entries (one feature dict per slate article).

    Returns:
        Fraction of records whose selected article has reward 1, or 0.0 when
        ``eval_data`` is empty (previously a ZeroDivisionError).
    """
    correct = 0
    total = 0

    for interaction in eval_data:
        context = interaction['context']
        rewards = interaction['rewards']

        # Prepare features for each article in slate
        action_features = [
            prepare_features(context, news_feat)
            for news_feat in interaction['news_features']
        ]

        # Get model prediction
        pred_idx = model.get_action(
            list(context.values()),
            action_features
        )

        # Check if prediction matches clicked article
        if rewards[pred_idx] == 1:
            correct += 1
        total += 1

    # Guard the empty case instead of dividing by zero.
    return correct / total if total else 0.0

In [129]:
# Split data into train and test.
# The split is positional (first 80% / last 20% of bandit_data); no shuffling happens here.
np.random.seed(42)
train_size = int(0.8 * len(bandit_data))
train_data = bandit_data[:train_size]
test_data = bandit_data[train_size:]

# Initialize LinUCB
# Feature dimension = 4 context features + one-hot article features.
# `news_features` must still be the feature DataFrame here, which is why the training
# loop below deliberately avoids reusing that name for per-impression data.
d = 4 + len(news_features.columns)
# NOTE: this rebinds `model`, which previously held the logistic-regression baseline.
model = LinUCB(d=d, alpha=1.0)

# Train the model: for each impression the bandit picks one article from the slate
# and learns only from the logged reward of the article it picked.
for interaction in train_data:
    context = interaction['context']
    rewards = interaction['rewards']
    slate_features = interaction['news_features']

    # Prepare features for each article
    action_features = [
        prepare_features(context, news_feat)
        for news_feat in slate_features
    ]

    # Get model prediction
    chosen_idx = model.get_action(
        list(context.values()),
        action_features
    )

    # Update model with observed reward
    model.update(
        action_features[chosen_idx],
        rewards[chosen_idx]
    )

# Evaluate model
test_accuracy = evaluate_linucb(model, test_data)
print(f"Test Accuracy: {test_accuracy:.4f}")

  ucb_scores.append(float(ucb))


Test Accuracy: 0.1019
