In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
# Load the MIND-small training split. Neither TSV ships with a header row,
# so the column names are supplied explicitly.
col_news = [
    'NewsId', 'Category', 'SubCat', 'Title',
    'Abstract', 'url', 'TitleEnt', 'AbstractEnt',
]
news_df = pd.read_csv(
    'MINDsmall_train/news.tsv',
    sep='\t',
    header=None,
    names=col_news,
)

col_behaviors = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
# Keep a seeded 1% sample of the impression logs for the rest of the notebook.
behaviors_df = pd.read_csv(
    'MINDsmall_train/behaviors.tsv',
    sep='\t',
    header=None,
    names=col_behaviors,
).sample(frac=0.01, random_state=42)

In [29]:
# Preview the first rows of the news metadata table.
news_df.head()

Unnamed: 0,NewsId,Category,SubCat,Title,Abstract,url,TitleEnt,AbstractEnt
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [30]:
# (rows, columns) of the news metadata table.
news_df.shape

(51282, 8)

In [31]:
# Preview the sampled impression logs; the index keeps the row positions from the full file.
behaviors_df.head()

Unnamed: 0,ImpressionID,UserID,Time,History,Impressions
78833,78834,U46778,11/12/2019 6:57:31 PM,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N60456-0 N19592-0 N35738-1 N3123-0 N12029-0 N3...
119940,119941,U19033,11/14/2019 12:43:30 PM,N1150 N53234 N45794 N11005 N55556,N50872-0 N19536-1
105873,105874,U30,11/9/2019 8:40:24 AM,N58642 N65000 N871 N24778 N11101 N9798 N54827 ...,N5889-0 N63583-1 N56396-0 N37190-0 N5957-0 N73...
36249,36250,U76070,11/10/2019 9:16:45 PM,N61864 N42946 N18693 N62703 N41792 N57888 N236...,N33981-0 N17104-1 N35388-0 N47933-0 N53907-0 N...
81978,81979,U54617,11/11/2019 8:16:50 PM,N24002 N13427 N36517 N33713 N19152 N31127 N167...,N2350-0 N44422-0 N23174-0 N25722-0 N55204-0 N7...


In [32]:
# (rows, columns) of the sampled impression logs.
behaviors_df.shape

(1570, 5)

In [33]:
def process_impressions(behaviors_df, news_df, random_state=42):
    """Convert behaviors data into class-balanced user-news interaction pairs.

    Every whitespace-separated ``<news_id>-<click>`` token in a row's
    ``Impressions`` string becomes one record, provided ``news_id`` occurs in
    ``news_df['NewsId']``. The larger click class is then randomly
    downsampled so both classes have the same number of rows.

    Parameters
    ----------
    behaviors_df : pandas.DataFrame
        Must provide ``UserID``, ``History``, ``Impressions`` and ``Time``
        columns. Missing ``History`` or ``Impressions`` values are treated
        as empty.
    news_df : pandas.DataFrame
        Must provide a ``NewsId`` column.
    random_state : int, default 42
        Seed used for the downsampling step.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``news_id``, ``click``, ``history_len`` and
        ``time``; clicked rows first, then the sampled non-clicked rows.
        The frame is empty (but still has these columns) when no impression
        matches a known article.
    """
    columns = ['user_id', 'news_id', 'click', 'history_len', 'time']

    # Build the lookup once: testing membership against the raw column array
    # rescans every article for every single impression.
    known_news_ids = set(news_df['NewsId'])

    interactions = []
    for user_id, history, impressions, time in zip(
        behaviors_df['UserID'],
        behaviors_df['History'],
        behaviors_df['Impressions'],
        behaviors_df['Time'],
    ):
        history_len = len(history.split()) if pd.notna(history) else 0

        # A row without any impression string contributes no pairs.
        if pd.isna(impressions):
            continue

        for impression in impressions.split():
            # The click flag is the suffix after the last dash.
            news_id, click = impression.rsplit('-', 1)

            # Only include if news exists in news_df
            if news_id in known_news_ids:
                interactions.append({
                    'user_id': user_id,
                    'news_id': news_id,
                    'click': int(click),
                    'history_len': history_len,
                    'time': time
                })

    # Passing `columns` keeps the schema intact even when nothing matched.
    interactions_df = pd.DataFrame(interactions, columns=columns)

    # Separate clicks and no-clicks
    clicks = interactions_df[interactions_df['click'] == 1]
    no_clicks = interactions_df[interactions_df['click'] == 0]

    # Balance on the smaller class so sampling can never request more rows
    # than are available.
    n_per_class = min(len(clicks), len(no_clicks))

    # Non-clicks always go through sample() (when there are any) so row
    # order matches what a plain downsample-to-clicks produces.
    if len(no_clicks):
        no_clicks_downsampled = no_clicks.sample(n=n_per_class, random_state=random_state)
    else:
        no_clicks_downsampled = no_clicks

    # Clicks are only touched in the unusual case where they are the majority.
    if len(clicks) > n_per_class:
        clicks = clicks.sample(n=n_per_class, random_state=random_state)

    # Combine back
    balanced_df = pd.concat([clicks, no_clicks_downsampled])

    print(f"Original size: {len(interactions_df)}, Balanced size: {len(balanced_df)}")
    print("Class distribution after balancing:")
    print(balanced_df['click'].value_counts())

    return balanced_df

def extract_time_features(time_str):
    """Derive hour and weekday features from a timestamp string.

    ``time_str`` must match ``'%m/%d/%Y %I:%M:%S %p'``, for example
    ``'11/12/2019 6:57:31 PM'``.

    Returns a dict with:
      * ``hour``        - hour of day on a 24-hour clock
      * ``day_of_week`` - ``datetime.weekday()`` value (Monday is 0)
      * ``is_weekend``  - 1 when ``day_of_week`` is 5 or 6, otherwise 0
    """
    parsed = datetime.strptime(time_str, '%m/%d/%Y %I:%M:%S %p')
    weekday = parsed.weekday()

    features = {'hour': parsed.hour, 'day_of_week': weekday}
    features['is_weekend'] = int(weekday >= 5)
    return features

def prepare_features(interactions_df, news_df):
    """Attach encoded news-category and time-of-day features to interactions.

    Side effect: ``news_df`` is modified in place. It gains the columns
    ``category_encoded`` and ``subcategory_encoded``, holding the
    ``LabelEncoder`` codes of ``Category`` and ``SubCat``.

    The returned frame is ``interactions_df`` merged with those two encoded
    columns (matching ``news_id`` against ``NewsId``), plus the ``hour``,
    ``day_of_week`` and ``is_weekend`` columns that ``extract_time_features``
    derives from each row's ``time`` value.
    """
    # One independent encoder per categorical column, fitted on the full
    # news table and written back onto it.
    encoded_targets = (
        ('Category', 'category_encoded'),
        ('SubCat', 'subcategory_encoded'),
    )
    for source_col, encoded_col in encoded_targets:
        news_df[encoded_col] = LabelEncoder().fit_transform(news_df[source_col])

    # Bring the encoded article attributes onto each interaction row.
    news_features = news_df[['NewsId', 'category_encoded', 'subcategory_encoded']]
    merged = interactions_df.merge(news_features, left_on='news_id', right_on='NewsId')

    # Expand the per-row feature dicts into one column per key.
    time_dicts = merged['time'].apply(extract_time_features)
    time_features = time_dicts.apply(pd.Series)

    return pd.concat([merged, time_features], axis=1)

In [34]:
# Expand the sampled impression logs into one row per (user, news) pair,
# balanced between clicked and non-clicked impressions, then preview it.
interactions_df = process_impressions(behaviors_df, news_df)
interactions_df.head()


Original size: 58161, Balanced size: 4830
Class distribution after balancing:
click
1    2415
0    2415
Name: count, dtype: int64


Unnamed: 0,user_id,news_id,click,history_len,time
2,U46778,N35738,1,43,11/12/2019 6:57:31 PM
11,U19033,N19536,1,5,11/14/2019 12:43:30 PM
13,U30,N63583,1,9,11/9/2019 8:40:24 AM
50,U76070,N17104,1,17,11/10/2019 9:16:45 PM
90,U54617,N57535,1,11,11/11/2019 8:16:50 PM


In [35]:
# (rows, columns) of the balanced interaction table.
interactions_df.shape

(4830, 5)

In [36]:
# Add encoded category/subcategory and time-derived columns to each interaction.
# Note: prepare_features also writes the encoded columns onto news_df itself.
features_df = prepare_features(interactions_df, news_df)
features_df.head()



Unnamed: 0,user_id,news_id,click,history_len,time,NewsId,category_encoded,subcategory_encoded,hour,day_of_week,is_weekend
0,U46778,N35738,1,43,11/12/2019 6:57:31 PM,N35738,3,94,18,1,0
1,U50203,N35738,1,16,11/12/2019 4:00:37 AM,N35738,3,94,4,1,0
2,U92576,N35738,0,23,11/12/2019 10:52:25 PM,N35738,3,94,22,1,0
3,U42526,N35738,0,74,11/11/2019 3:14:49 PM,N35738,3,94,15,0,0
4,U832,N35738,0,88,11/10/2019 7:15:37 PM,N35738,3,94,19,6,1


In [37]:
# (rows, columns) of the feature table.
features_df.shape

(4830, 11)

In [38]:
# Model inputs: the numeric columns of the feature table; the target is the click flag.
feature_columns = [
    'history_len',
    'category_encoded',
    'subcategory_encoded',
    'hour',
    'day_of_week',
    'is_weekend',
]
X = features_df[feature_columns]
y = features_df['click']

In [39]:
# (rows, feature columns) of the model input matrix.
X.shape

(4830, 6)

In [40]:
# Length of the target vector.
y.shape

(4830,)

In [41]:
# Hold out 20% of the rows for evaluation, using a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Learn the standardization statistics from the training split only,
# then apply that same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [43]:
# Fit a logistic-regression classifier on the standardized training features.
# Only the seed is set explicitly; every other hyperparameter is the library default.
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

In [44]:
# Score the held-out split with the fitted model.
y_pred = model.predict(X_test_scaled)

# Metric name -> scoring function; each is called as scorer(y_true, y_pred).
scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}
results = {name: scorer(y_test, y_pred) for name, scorer in scorers.items()}

print("Model Performance:")
for metric, value in results.items():
    print(f"{metric}: {value:.4f}")

Model Performance:
accuracy: 0.5404
precision: 0.5311
recall: 0.6067
f1: 0.5664
