# Import necessary libraries

In [29]:
from pathlib import Path
import pandas as pd
from collections import Counter, deque
import time
import json
from collections import deque, defaultdict
from codecarbon import EmissionsTracker
from datetime import datetime
import os



# Read the data

In [30]:
# Column names for the header-less TSV files
col_behaviors = ['ImpressionId', 'User', 'Time', 'History', 'Impressions']
col_news = ['NewsId', 'Category', 'SubCat', 'Title', 'Abstract', 'url', 'TitleEnt', 'AbstractEnt']


def _to_epoch_seconds(time_strings):
    """
    Convert 'MM/DD/YYYY hh:mm:ss AM/PM' strings to float seconds since 1970-01-01.

    The strings are parsed as naive wall-clock times, so the result does not
    depend on the local time zone or DST rules of the machine running the
    notebook (which time.mktime would). The conversion is vectorised instead
    of calling strptime once per row.
    """
    parsed = pd.to_datetime(time_strings, format="%m/%d/%Y %I:%M:%S %p")
    return (parsed - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)


# Read TSV files with Pandas
behaviors_train = pd.read_csv("data/train/behaviors.tsv", sep="\t", header=None, names=col_behaviors)
news_train = pd.read_csv("data/train/news.tsv", sep="\t", header=None, names=col_news)

behaviors_val = pd.read_csv("data/validation/behaviors.tsv", sep="\t", header=None, names=col_behaviors)
news_val = pd.read_csv("data/validation/news.tsv", sep="\t", header=None, names=col_news)

behaviors_test = pd.read_csv("data/test/behaviors.tsv", sep="\t", header=None, names=col_behaviors)
news_test = pd.read_csv("data/test/news.tsv", sep="\t", header=None, names=col_news)

# Concatenate train and validation data
behaviors_train_val = pd.concat([behaviors_train, behaviors_val])
news_train_val = pd.concat([news_train, news_val])

# Add a numeric timestamp and order chronologically. A stable sort keeps the
# file order among events that share the same timestamp, so runs are reproducible.
behaviors_train_val['Timestamp'] = _to_epoch_seconds(behaviors_train_val['Time'])
behaviors_train_val = behaviors_train_val.sort_values(by='Timestamp', kind='mergesort')

# Same conversion and ordering for the validation set on its own
behaviors_val['Timestamp'] = _to_epoch_seconds(behaviors_val['Time'])
behaviors_val = behaviors_val.sort_values(by='Timestamp', kind='mergesort')

# Setup Carbon Emissions Tracking

In [31]:
# Configure the emissions tracker for this run; its output goes to the "emissions" directory
tracker = EmissionsTracker(
    project_name="news_recommendation_ctr_baseline",
    output_dir="emissions",
    log_level="critical",
)
# Measurement begins here
tracker.start()

# Implement baseline model

In [32]:
# Rolling window length in seconds.
# Other durations tried during experiments: 12h, 24h, 48h, 72h (N * 60 * 60).
TIME_WINDOW = 7 * 24 * 60 * 60  # 7 days in seconds

# Per-news statistics: for each news ID, the timestamps of its clicks and impressions
news_stats = defaultdict(lambda: {'clicks': deque(), 'impressions': deque()})

# Timestamp of the earliest impression in the validation set
# (taking the minimum does not rely on behaviors_val having been sorted beforehand)
first_impression_time = behaviors_val['Timestamp'].min()

# Keep only interactions inside the TIME_WINDOW that ends right before validation starts
behaviors_train_val = behaviors_train_val[
    (behaviors_train_val['Timestamp'] >= first_impression_time - TIME_WINDOW) &
    (behaviors_train_val['Timestamp'] < first_impression_time)
]

# Populate the initial news_stats dictionary with clicks and impressions.
# Rows are expected in ascending Timestamp order (behaviors_train_val is sorted
# when the column is created) because update_news_stats expires old entries
# from the left end of each deque.
for timestamp, impressions in zip(behaviors_train_val['Timestamp'], behaviors_train_val['Impressions']):
    # Skip rows without impression data (missing value or the '-' placeholder)
    if not isinstance(impressions, str) or impressions == '-':
        continue
    for entry in impressions.split():
        # Entries look like "<news_id>-<label>"; a missing label counts as not clicked
        news_id, _, label = entry.partition('-')
        stats = news_stats[news_id]
        stats['impressions'].append(timestamp)  # Store impression timestamp
        if label == '1':  # If clicked, also store in click stats
            stats['clicks'].append(timestamp)


def update_news_stats(current_time, past_clicked_articles, past_impressed_articles):
    """
    Bring the rolling statistics in ``news_stats`` up to ``current_time``.

    Steps:
    - Drop every click/impression timestamp older than ``current_time - TIME_WINDOW``
      and remove articles that end up with no data at all.
    - Record the previous impression event: ``past_impressed_articles`` is a
      ``(news_ids, timestamp)`` pair (or an empty value when there is none) and
      ``past_clicked_articles`` lists the news IDs clicked in that event.
    """
    cutoff = current_time - TIME_WINDOW

    # Iterate over a snapshot because entries may be deleted along the way
    for news_id, stats in list(news_stats.items()):
        for kind in ('clicks', 'impressions'):
            events = stats[kind]
            # Oldest timestamps sit at the left end of the deque
            while events and events[0] < cutoff:
                events.popleft()
        # Forget the article once nothing is left inside the window
        if not (stats['clicks'] or stats['impressions']):
            del news_stats[news_id]

    # Nothing to record before the first event has been processed
    if not past_impressed_articles:
        return

    shown_ids, event_time = past_impressed_articles

    for news_id in shown_ids:
        news_stats[news_id]['impressions'].append(event_time)

    for news_id in past_clicked_articles:
        news_stats[news_id]['clicks'].append(event_time)


def rank_news(user_impressions, current_time, past_clicked_articles, past_impressed_articles):
    """
    Order candidate articles by Click-Through Rate (CTR) within the rolling window.

    CTR = clicks / impressions, counted from ``news_stats`` after it has been
    refreshed for ``current_time``. Articles with no recorded impressions get
    a CTR of 0.0. Returns the news IDs from highest to lowest CTR; articles
    with equal CTR keep their order from ``user_impressions``.
    """
    # Refresh the rolling stats before scoring
    update_news_stats(current_time, past_clicked_articles, past_impressed_articles)

    def ctr_of(news_id):
        # .get() avoids creating an entry for articles that were never seen
        stats = news_stats.get(news_id)
        if stats is None:
            return 0.0
        shown = len(stats['impressions'])
        if shown > 0:
            return len(stats['clicks']) / shown
        return 0.0  # Avoid division by zero

    scored = [(news_id, ctr_of(news_id)) for news_id in user_impressions]

    # Highest CTR first; the sort is stable, so ties keep input order
    ordered = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return [news_id for news_id, _ in ordered]


def rank_submission_format(user_impressions, current_time, past_clicked_articles, past_impressed_articles):
    """
    Return the 1-based rank position of each article, in the order of the
    original impression list.

    The ranking itself comes from ``rank_news``. If a news ID occurs more than
    once in ``user_impressions``, every occurrence receives the rank of its
    first appearance in the ranked list (same result as ``list.index``).
    """
    ranked_news = rank_news(user_impressions, current_time, past_clicked_articles, past_impressed_articles)

    # Build the ID -> rank lookup once instead of scanning the ranked list for
    # every article (list.index inside the comprehension was quadratic).
    first_rank = {}
    for position, news_id in enumerate(ranked_news, start=1):
        first_rank.setdefault(news_id, position)

    return [first_rank[news_id] for news_id in user_impressions]


def generate_prediction_file(behaviors_df, output_file="prediction.txt"):
    """
    Generate a prediction file with CTR-based news rankings for each impression event.

    behaviors_df: DataFrame with 'ImpressionId', 'Timestamp' and 'Impressions'
        columns. Rows are processed in the order given, and each row's
        clicks/impressions are fed into the rolling statistics only when the
        next row is ranked.
    output_file: path of the text file to write; each line has the form
        "<impression_id> <JSON list of rank positions>".

    Impression entries are normally "<news_id>-<label>". Entries without a
    label are accepted and treated as not clicked, so unlabeled data can be
    ranked as well.
    """
    past_clicked_articles = []        # Clicked articles from the previous row
    past_impressed_articles = []      # Becomes a tuple: (list of all impressions, timestamp)

    # Iterating the needed columns directly is much cheaper than DataFrame.iterrows()
    rows = zip(behaviors_df['ImpressionId'], behaviors_df['Timestamp'], behaviors_df['Impressions'])

    with open(output_file, "w", encoding="utf-8") as f:
        for impression_id, current_time, impressions in rows:
            # Split each entry once into (news_id, label); label is '' when absent
            parsed = []
            for entry in impressions.split():
                news_id, _, label = entry.partition("-")
                parsed.append((news_id, label))
            user_impressions = [news_id for news_id, _ in parsed]

            # Generate rank positions based on CTR
            ranked_positions = rank_submission_format(user_impressions, current_time, past_clicked_articles, past_impressed_articles)

            # Write prediction result to file in required format
            f.write(f"{impression_id} {json.dumps(ranked_positions)}\n")

            # Remember this row's outcome; it is added to the stats on the next iteration
            past_clicked_articles = [news_id for news_id, label in parsed if label == '1']
            past_impressed_articles = (user_impressions, current_time)

    print(f"✅ Prediction file '{output_file}' successfully created.")


# Write the validation-set rankings produced by the rolling-CTR baseline
generate_prediction_file(
    behaviors_val,
    output_file="prediction_val_baseline_ctr_1w.txt",
)

✅ Prediction file 'prediction_val_baseline_ctr_1w.txt' successfully created.


# Carbon Emissions Report

In [33]:
# Stop tracking and get the emissions data
emissions = tracker.stop()
# NOTE(review): CodeCarbon's stop() is documented as possibly returning None;
# format defensively so the report cell does not fail with a TypeError.
emissions_text = f"{emissions:.6f} kg CO2eq" if emissions is not None else "Not available"
print(f"💡 Carbon emissions from this run: {emissions_text}")

# Display detailed emissions information and write to txt.
# This is best-effort: any failure is reported on the console instead of raised.
try:
    # Load latest emissions entry (last row of the CSV in the tracker's output directory)
    df = pd.read_csv("emissions/emissions.csv")
    emissions_data = df.iloc[-1]

    # Diagnose available columns (kept for debugging)
    available_columns = df.columns.tolist()
    # print(f"📂 Available columns: {available_columns}")

    # Prepare values
    duration_hr = emissions_data['duration'] / 3600  # 'duration' is divided by 3600 to report hours
    energy_kwh = emissions_data['energy_consumed']
    cpu_power = emissions_data['cpu_power']

    # Optional columns fall back to a placeholder when absent or NaN
    gpu_power = (
        f"{emissions_data['gpu_power']:.2f} W"
        if 'gpu_power' in emissions_data and not pd.isna(emissions_data['gpu_power'])
        else "Not available"
    )

    country = emissions_data['country_name'] if 'country_name' in emissions_data else "Not available"

    # Computed for reference; currently not included in the console output or the report
    carbon_intensity = (
        f"{emissions_data['country_co2_eq_electricity']:.2f} gCO2eq/kWh"
        if 'country_co2_eq_electricity' in emissions_data and not pd.isna(emissions_data['country_co2_eq_electricity'])
        else "Not available"
    )

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Print to console
    print("\nDetailed emissions data:")
    print(f"- Duration: {duration_hr:.2f} hours")
    print(f"- Energy consumed: {energy_kwh:.4f} kWh")
    print(f"- CPU Power: {cpu_power:.2f} W")
    print(f"- GPU Power: {gpu_power}")
    print(f"- Country: {country}")

    # Create structured report text
    report = f"""\
📄 Emissions Report – {timestamp}
====================================
🌱 Total Emissions:     {emissions_text}

🕒 Duration:            {duration_hr:.2f} hours
⚡ Energy Consumed:     {energy_kwh:.4f} kWh
🧠 CPU Power:           {cpu_power:.2f} W
🎮 GPU Power:           {gpu_power}

🌍 Country:             {country}
====================================
"""

    # Ensure output directory exists
    os.makedirs("emissions", exist_ok=True)

    # Save to .txt file. The report contains emoji, so the encoding must be
    # explicit: the platform default (e.g. cp1252) cannot encode them.
    with open("emissions/emissions_report_baseline_ctr_1w.txt", "w", encoding="utf-8") as f:
        f.write(report)

except Exception as e:
    print(f"\n❗ Could not load detailed emissions data: {str(e)}")


💡 Carbon emissions from this run: 0.000062 kg CO2eq

Detailed emissions data:
- Duration: 0.26 hours
- Energy consumed: 0.0021 kWh
- CPU Power: 5.00 W
- GPU Power: 0.00 W
- Country: Norway


# Generate a truth file for evaluation

In [34]:
# Ground-truth writer for the validation set
def generate_truth_file(validation_impressions, output_file="truth.txt"):
    """
    Write the ground-truth click labels, one impression per line.

    validation_impressions: object whose ``.items()`` yields
        (impression_id, list of "<news_id>-<label>" strings).
    output_file: path of the text file to create.

    Each output line has the form "<impression_id> <JSON list of int labels>".
    """
    def format_line(impression_id, news_list):
        # The label is the part after the dash in each entry
        clicks = [int(item.split("-")[1]) for item in news_list]
        return f"{impression_id} {json.dumps(clicks)}\n"

    with open(output_file, "w") as f:
        f.writelines(
            format_line(impression_id, news_list)
            for impression_id, news_list in validation_impressions.items()
        )

    print(f"✅ Truth file '{output_file}' successfully created.")

# Map each ImpressionId to its list of "<news_id>-<label>" entries and write the truth file
generate_truth_file(
    behaviors_val.set_index('ImpressionId')['Impressions'].apply(
        lambda impressions: impressions.split()
    ),
    output_file="truth_val.txt",
)

✅ Truth file 'truth_val.txt' successfully created.
