# Import necessary libraries

In [53]:
from pathlib import Path
import os
import pandas as pd
from collections import Counter, deque
import time
import json
import zipfile

# Read the data

In [54]:
# Column names for the header-less behaviors / news TSV files
col_behaviors = ['ImpressionId', 'User', 'Time', 'History', 'Impressions']
col_news = ['NewsId', 'Category', 'SubCat', 'Title', 'Abstract', 'url', 'TitleEnt', 'AbstractEnt']

# Root folder that holds the train / validation / test sub-folders
DATA_DIR = Path("data")
# Layout of the 'Time' strings, e.g. month/day/year hour:minute:second AM|PM
TIME_FORMAT = "%m/%d/%Y %I:%M:%S %p"


def _read_split(split):
    """Read behaviors.tsv and news.tsv of one split folder (e.g. "train") under DATA_DIR."""
    behaviors = pd.read_csv(DATA_DIR / split / "behaviors.tsv", sep="\t", header=None, names=col_behaviors)
    news = pd.read_csv(DATA_DIR / split / "news.tsv", sep="\t", header=None, names=col_news)
    return behaviors, news


def _with_timestamp(behaviors):
    """Return a copy of `behaviors` with a float 'Timestamp' column, sorted by that column.

    'Timestamp' is the number of seconds between 1970-01-01 00:00:00 and the
    wall-clock time written in 'Time'. The strings are parsed as naive
    datetimes, so the result does not depend on the timezone or DST rules of
    the machine running the notebook (time.mktime, used previously, does).
    NOTE(review): on a non-UTC machine the absolute values differ from the old
    ones by the local UTC offset; the cells shown here only use ordering and
    differences - confirm nothing else compares them with real epoch times.
    """
    parsed = pd.to_datetime(behaviors['Time'], format=TIME_FORMAT)
    seconds = (parsed - pd.Timestamp("1970-01-01")).dt.total_seconds()
    return behaviors.assign(Timestamp=seconds).sort_values(by='Timestamp')


# Read TSV files with Pandas
behaviors_train, news_train = _read_split("train")
behaviors_val, news_val = _read_split("validation")
behaviors_test, news_test = _read_split("test")

# Stack train and validation rows into one frame each (fresh 0..n-1 index,
# so the combined frames carry no duplicated index labels)
behaviors_train_val = pd.concat([behaviors_train, behaviors_val], ignore_index=True)
news_train_val = pd.concat([news_train, news_val], ignore_index=True)

# Add the numeric timestamp and order chronologically
behaviors_train_val = _with_timestamp(behaviors_train_val)
behaviors_val = _with_timestamp(behaviors_val)



In [55]:
# transform date column to datetime
# behaviors_train['Time'] = pd.to_datetime(behaviors_train['Time'])
# behaviors_val['Time'] = pd.to_datetime(behaviors_val['Time'])
# behaviors_test['Time'] = pd.to_datetime(behaviors_test['Time'])

# Implement baseline model

In [56]:
# Look-back horizon for click history: 24 hours, expressed in seconds
TIME_WINDOW = 24 * 60 * 60

# news id -> timestamps collected for that news item (filled in a later cell)
news_clicks = {}

In [57]:
# Earliest validation timestamp; min() does not depend on behaviors_val
# having been sorted beforehand
first_impression_time = behaviors_val['Timestamp'].min()

# Keep only the rows that fall in the TIME_WINDOW seconds right before that
# moment (window start inclusive, first validation impression exclusive)
behaviors_train_val = behaviors_train_val[
    (behaviors_train_val['Timestamp'] >= first_impression_time - TIME_WINDOW)
    & (behaviors_train_val['Timestamp'] < first_impression_time)
]

# Start from an empty table so that re-running this cell does not append the
# same timestamps a second time
news_clicks.clear()

# Record one timestamp per *clicked* candidate.
# Each token is "<news id>-<label>"; the truth-file cell of this notebook reads
# that suffix as the click label, so only tokens labelled 1 are clicks.
# Previously every displayed candidate was recorded, clicked or not.
for impressions, timestamp in zip(behaviors_train_val['Impressions'], behaviors_train_val['Timestamp']):
    # Skip the '-' placeholder and missing values (NaN is not a str)
    if not isinstance(impressions, str) or impressions == '-':
        continue
    for token in impressions.split():
        news_id, _, label = token.partition('-')
        if label != '1':
            continue
        if news_id not in news_clicks:
            news_clicks[news_id] = deque()
        news_clicks[news_id].append(timestamp)


In [58]:
def _popularity_order(candidates, clicks=None):
    """Return the indices of `candidates` ordered from most to least recorded clicks.

    The sort is stable, so candidates with equal counts keep their input order.
    """
    table = news_clicks if clicks is None else clicks
    counts = [len(table[news_id]) if news_id in table else 0 for news_id in candidates]
    return sorted(range(len(candidates)), key=lambda i: counts[i], reverse=True)


def rank_news(user_impressions, clicks=None):
    """Order the candidate news ids of one impression by popularity.

    Parameters
    ----------
    user_impressions : iterable of news ids (without the "-label" suffix).
    clicks : optional mapping news id -> sized collection of click records.
        Defaults to the notebook-level `news_clicks`.

    Returns
    -------
    list of the same news ids, most clicked first; ids absent from the mapping
    count as 0 clicks and ties keep their input order.
    """
    candidates = list(user_impressions)
    return [candidates[i] for i in _popularity_order(candidates, clicks)]


def rank_submission_format(user_impressions, clicks=None):
    """Return, for each candidate in input order, its 1-based popularity rank.

    The result is always a permutation of 1..n: a news id that appears more
    than once receives consecutive distinct ranks (the previous list.index
    lookup gave every occurrence the rank of the first one and cost O(n^2)).

    Parameters are the same as for `rank_news`.
    """
    candidates = list(user_impressions)
    ranks = [0] * len(candidates)
    for rank, position in enumerate(_popularity_order(candidates, clicks), start=1):
        ranks[position] = rank
    return ranks



In [59]:
# NOTE(review): this cell defines rank_news / rank_submission_format again;
# an earlier cell already defines both. Being the later one, these are the
# definitions in effect after a top-to-bottom run - one of the two cells can go.
def _popularity_order(candidates, clicks=None):
    """Return the indices of `candidates` ordered from most to least recorded clicks.

    The sort is stable, so candidates with equal counts keep their input order.
    """
    table = news_clicks if clicks is None else clicks
    counts = [len(table[news_id]) if news_id in table else 0 for news_id in candidates]
    return sorted(range(len(candidates)), key=lambda i: counts[i], reverse=True)


def rank_news(user_impressions, clicks=None):
    """Order the candidate news ids of one impression by popularity.

    Parameters
    ----------
    user_impressions : iterable of news ids (without the "-label" suffix).
    clicks : optional mapping news id -> sized collection of click records.
        Defaults to the notebook-level `news_clicks`.

    Returns
    -------
    list of the same news ids, most clicked first; ids absent from the mapping
    count as 0 clicks and ties keep their input order.
    """
    candidates = list(user_impressions)
    return [candidates[i] for i in _popularity_order(candidates, clicks)]


def rank_submission_format(user_impressions, clicks=None):
    """Return, for each candidate in input order, its 1-based popularity rank.

    The result is always a permutation of 1..n: a news id that appears more
    than once receives consecutive distinct ranks (the previous list.index
    lookup gave every occurrence the rank of the first one and cost O(n^2)).

    Parameters are the same as for `rank_news`.
    """
    candidates = list(user_impressions)
    ranks = [0] * len(candidates)
    for rank, position in enumerate(_popularity_order(candidates, clicks), start=1):
        ranks[position] = rank
    return ranks



In [None]:
def generate_prediction_file(user_impressions, output_file="prediction.txt"):
    """
    Write one ranking line per impression to `output_file`.

    Parameters
    ----------
    user_impressions : mapping-like object with `.items()` (a dict or a pandas
        Series) from impression id to a list of candidate tokens. Anything
        after the first '-' in a token (e.g. a click label) is dropped before
        ranking.
    output_file : path of the text file to create or overwrite.

    Each line is `<impression id> [r1,r2,...]`, where r_k is the rank returned
    by `rank_submission_format` for the k-th candidate. The list is written
    without spaces so the only space on a line is the one after the id.
    NOTE(review): the MIND reference evaluator appears to split each line on
    whitespace, which the default ", " separator of json.dumps would break -
    confirm against the scorer in use.
    """
    with open(output_file, "w") as f:
        for impression_id, news_list in user_impressions.items():
            news_ids = [news.split("-")[0] for news in news_list]
            ranked_positions = rank_submission_format(news_ids)
            compact = json.dumps(ranked_positions, separators=(",", ":"))
            f.write(f"{impression_id} {compact}\n")

    print(f"✅ Prediction file '{output_file}' successfully created.")

# Validation predictions: index the impression strings by ImpressionId, split
# each one into its candidate tokens, and write the rankings to disk
generate_prediction_file(
    behaviors_val
    .set_index('ImpressionId')['Impressions']
    .apply(lambda impression_str: impression_str.split()),
    output_file="prediction_val.txt",
)


✅ Prediction file 'prediction_val.txt' successfully created.


In [61]:
# Generate ground truth file for validation set
def generate_truth_file(validation_impressions, news_clicks, output_file="truth.txt"):
    """
    Write the click labels of each impression to `output_file`.

    Parameters
    ----------
    validation_impressions : mapping-like object with `.items()` (a dict or a
        pandas Series) from impression id to a list of "<news id>-<label>"
        tokens.
    news_clicks : not used by this function; kept so existing calls that pass
        it positionally keep working.
    output_file : path of the text file to create or overwrite.

    Each line is `<impression id> [l1,l2,...]`, where l_k is the integer after
    the first '-' of the k-th token. The list is written without spaces so the
    only space on a line is the one after the id, matching the layout used for
    the prediction file.

    Raises
    ------
    IndexError if a token has no '-', ValueError if the label is not an integer.
    """
    with open(output_file, "w") as f:
        for impression_id, news_list in validation_impressions.items():
            labels = [int(news.split("-")[1]) for news in news_list]  # Click labels
            compact = json.dumps(labels, separators=(",", ":"))
            f.write(f"{impression_id} {compact}\n")

    print(f"✅ Truth file '{output_file}' successfully created.")

# Validation ground truth: same per-impression token lists as for the
# predictions, written to their own file
generate_truth_file(
    behaviors_val
    .set_index('ImpressionId')['Impressions']
    .apply(lambda impression_str: impression_str.split()),
    news_clicks,
    output_file="truth_val.txt",
)


✅ Truth file 'truth_val.txt' successfully created.
