# Import necessary libraries

In [4]:
from pathlib import Path
import os
import pandas as pd
from collections import Counter, deque
import time
import json
import zipfile
from codecarbon import EmissionsTracker

# Read the data

In [5]:
# Column names for the header-less TSV files
col_behaviors = ['ImpressionId', 'User', 'Time', 'History', 'Impressions']
col_news = ['NewsId', 'Category', 'SubCat', 'Title', 'Abstract', 'url', 'TitleEnt', 'AbstractEnt']

# Read TSV files with Pandas
behaviors_train = pd.read_csv("data/train/behaviors.tsv", sep="\t", header=None, names=col_behaviors)
news_train = pd.read_csv("data/train/news.tsv", sep="\t", header=None, names=col_news)

behaviors_val = pd.read_csv("data/validation/behaviors.tsv", sep="\t", header=None, names=col_behaviors)
news_val = pd.read_csv("data/validation/news.tsv", sep="\t", header=None, names=col_news)

behaviors_test = pd.read_csv("data/test/behaviors.tsv", sep="\t", header=None, names=col_behaviors)
news_test = pd.read_csv("data/test/news.tsv", sep="\t", header=None, names=col_news)

# Concatenate the train and validation splits row-wise (original row labels are kept)
behaviors_train_val = pd.concat([behaviors_train, behaviors_val])
news_train_val = pd.concat([news_train, news_val])

# Layout of the 'Time' column: month/day/year, 12-hour clock with AM/PM marker
TIME_FORMAT = "%m/%d/%Y %I:%M:%S %p"


def to_epoch_seconds(time_column):
    """
    Convert a Series of 'Time' strings into float seconds since 1970-01-01.

    The wall-clock strings are taken at face value (no timezone is applied), so
    the result does not depend on the timezone or DST rules of the machine
    running the notebook. Only differences and ordering of these values are
    meaningful; they are not guaranteed to be true UTC epoch seconds.

    Args:
        time_column: pandas Series of strings formatted as TIME_FORMAT.

    Returns:
        Float Series with the same index as ``time_column``.
    """
    parsed = pd.to_datetime(time_column, format=TIME_FORMAT)
    return (parsed - pd.Timestamp("1970-01-01")).dt.total_seconds()


# Add a numeric timestamp to the combined train+validation behaviors and sort chronologically
behaviors_train_val['Timestamp'] = to_epoch_seconds(behaviors_train_val['Time'])
behaviors_train_val = behaviors_train_val.sort_values(by='Timestamp')

# Same for the validation behaviors on their own
behaviors_val['Timestamp'] = to_epoch_seconds(behaviors_val['Time'])
behaviors_val = behaviors_val.sort_values(by='Timestamp')




# Setup Carbon Emissions Tracking

In [6]:
# Create the emissions tracker for the baseline run; its output goes to the
# "emissions" directory and only critical log messages are shown
tracker = EmissionsTracker(
    project_name="news_recommendation_ctr_baseline",
    output_dir="emissions",
    log_level="critical",
)

# Begin measuring from this point on
tracker.start()

# Implement baseline model

In [7]:
# Length of the rolling popularity window, in seconds (24 hours)
TIME_WINDOW = 24 * 60 * 60

# news_id -> deque of click timestamps, oldest on the left
news_clicks = {}


def update_news_clicks(current_time, past_clicked_articles):
    """
    Refresh the global ``news_clicks`` store for a new point in time.

    First drops every stored click older than ``current_time - TIME_WINDOW``
    (removing articles that end up with no clicks), then records the clicks
    carried over from the previous impression.

    Args:
        current_time: timestamp (seconds) of the impression being processed.
        past_clicked_articles: falsy value for "nothing to add", otherwise a
            pair ``(clicked_news_ids, click_timestamp)``.
    """
    cutoff = current_time - TIME_WINDOW

    # Expire stale clicks; iterate over a snapshot because entries may be deleted
    for article_id, stamps in list(news_clicks.items()):
        while stamps and stamps[0] < cutoff:
            stamps.popleft()
        if not stamps:
            del news_clicks[article_id]

    if not past_clicked_articles:
        return

    # Register the clicks observed in the previous impression
    for article_id in past_clicked_articles[0]:
        news_clicks.setdefault(article_id, deque()).append(past_clicked_articles[1])


def rank_news(user_impressions, current_time, past_clicked_articles):
    """
    Order the articles of one impression by their recent click counts.

    The global click store is refreshed first (stale clicks dropped, clicks
    from the previous impression added). Articles are then sorted by how many
    clicks they currently have in the store, most clicked first; articles with
    no stored clicks count as 0. Ties keep their original impression order.

    Args:
        user_impressions: news IDs shown in the current impression.
        current_time: timestamp (seconds) of the current impression.
        past_clicked_articles: clicks of the previous impression, passed
            through to ``update_news_clicks``.

    Returns:
        List of news IDs, most popular first.
    """
    update_news_clicks(current_time, past_clicked_articles)

    def recent_click_count(article_id):
        # Articles absent from the store have had no clicks inside the window
        if article_id in news_clicks:
            return len(news_clicks[article_id])
        return 0

    # sorted() is stable, also with reverse=True, so equal counts keep input order
    return sorted(user_impressions, key=recent_click_count, reverse=True)


def rank_submission_format(user_impressions, current_time, past_clicked_articles):
    """
    Return the 1-based rank of each article, in the order of the impression list.

    Args:
        user_impressions: news IDs shown in the current impression.
        current_time: timestamp (seconds) of the current impression.
        past_clicked_articles: clicks of the previous impression, passed
            through to ``rank_news``.

    Returns:
        List of ints; element ``i`` is the rank (1 = most popular) assigned to
        ``user_impressions[i]``.
    """
    ranked_news = rank_news(user_impressions, current_time, past_clicked_articles)

    # Build the id -> rank lookup once instead of calling list.index() per
    # article. setdefault keeps the first (best) position for an ID that occurs
    # more than once, which is exactly what list.index() would return.
    position_of = {}
    for position, news_id in enumerate(ranked_news, start=1):
        position_of.setdefault(news_id, position)

    return [position_of[news_id] for news_id in user_impressions]


def generate_prediction_file(behaviors_df, output_file="prediction.txt"):
    """
    Generate a prediction file with click-based news rankings for each impression.

    Rows are processed in the order they appear in ``behaviors_df``. Each
    impression is ranked before its own clicks are recorded; those clicks are
    handed to the ranking step when the next row is processed.

    Args:
        behaviors_df: DataFrame with 'ImpressionId', 'Timestamp' and
            'Impressions' columns. 'Impressions' is a whitespace-separated
            string of "<news_id>-<label>" tokens. A token without a label part
            is ranked normally and treated as not clicked.
        output_file: path of the text file to write; one line per impression in
            the form "<ImpressionId> <JSON list of ranks>".
    """
    past_clicked_articles = []  # Stores clicked articles from the previous row

    with open(output_file, "w") as f:
        for _, row in behaviors_df.iterrows():
            impression_id = row['ImpressionId']
            current_time = row['Timestamp']

            # Split every impression token once; parts[0] is the news ID and
            # parts[1], when present, is the click label
            parsed = [news.split("-") for news in row['Impressions'].split()]
            user_impressions = [parts[0] for parts in parsed]

            # Compute ranking positions based on past 24h click data
            ranked_positions = rank_submission_format(user_impressions, current_time, past_clicked_articles)

            # Write results to file in required format
            f.write(f"{impression_id} {json.dumps(ranked_positions)}\n")

            # Prepare click data from this row for use in the next iteration.
            # Tokens lacking a label no longer raise IndexError; they simply
            # contribute no click.
            past_clicked_articles = (
                [parts[0] for parts in parsed if len(parts) > 1 and parts[1] == '1'],
                current_time,
            )

    print(f"✅ Prediction file '{output_file}' successfully created.")


# Generate predictions for the validation set using the click-based popularity model.
# Rows are consumed in the DataFrame's current order and the result is written to
# "prediction_val_baseline.txt" in the working directory.
generate_prediction_file(behaviors_val, output_file="prediction_val_baseline.txt")


✅ Prediction file 'prediction_val_baseline.txt' successfully created.


# Carbon Emissions Report

In [8]:
# Stop tracking and get the emissions data
emissions = tracker.stop()

# Format the total once. If stop() hands back None instead of a number, the
# float format spec would raise, so fall back to a placeholder in that case.
emissions_text = f"{emissions:.6f} kg CO2eq" if emissions is not None else "Not available"
print(f"💡 Carbon emissions from this run: {emissions_text}")

# Display detailed emissions information and write to txt
from datetime import datetime

try:
    # Load latest emissions entry (last row of the tracker's CSV output)
    df = pd.read_csv("emissions/emissions.csv")
    emissions_data = df.iloc[-1]

    # Column names, kept available for interactive inspection
    available_columns = df.columns.tolist()

    # Prepare values
    duration_hr = emissions_data['duration'] / 3600
    energy_kwh = emissions_data['energy_consumed']
    cpu_power = emissions_data['cpu_power']

    gpu_power = (
        f"{emissions_data['gpu_power']:.2f} W"
        if 'gpu_power' in emissions_data and not pd.isna(emissions_data['gpu_power'])
        else "Not available"
    )

    country = emissions_data['country_name'] if 'country_name' in emissions_data else "Not available"

    # Computed for inspection only; not part of the printed or saved report
    carbon_intensity = (
        f"{emissions_data['country_co2_eq_electricity']:.2f} gCO2eq/kWh"
        if 'country_co2_eq_electricity' in emissions_data and not pd.isna(emissions_data['country_co2_eq_electricity'])
        else "Not available"
    )

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Print to console
    print("\nDetailed emissions data:")
    print(f"- Duration: {duration_hr:.2f} hours")
    print(f"- Energy consumed: {energy_kwh:.4f} kWh")
    print(f"- CPU Power: {cpu_power:.2f} W")
    print(f"- GPU Power: {gpu_power}")
    print(f"- Country: {country}")

    # Create structured report text
    report = f"""\
📄 Emissions Report – {timestamp}
====================================
🌱 Total Emissions:     {emissions_text}

🕒 Duration:            {duration_hr:.2f} hours
⚡ Energy Consumed:     {energy_kwh:.4f} kWh
🧠 CPU Power:           {cpu_power:.2f} W
🎮 GPU Power:           {gpu_power}

🌍 Country:             {country}
====================================
"""

    # Ensure output directory exists
    os.makedirs("emissions", exist_ok=True)

    # Save to .txt file. The report contains emoji, so the encoding is pinned to
    # UTF-8 rather than relying on the platform default, which may not be able
    # to encode them.
    with open("emissions/emissions_report_baseline.txt", "w", encoding="utf-8") as f:
        f.write(report)

except Exception as e:
    # Best-effort reporting: a missing/short CSV or unexpected columns must not
    # abort the notebook
    print(f"\n❗ Could not load detailed emissions data: {str(e)}")


💡 Carbon emissions from this run: 0.000001 kg CO2eq

Detailed emissions data:
- Duration: 0.01 hours
- Energy consumed: 0.0000 kWh
- CPU Power: 5.00 W
- GPU Power: 0.00 W
- Country: Norway


  df = pd.concat([df, pd.DataFrame.from_records([dict(total.values)])])


# Generate a truth file for evaluation

In [9]:
# Generate ground truth file for validation set
def generate_truth_file(validation_impressions, output_file="truth.txt"):
    """
    Write the ground-truth click labels of every impression to a text file.

    Args:
        validation_impressions: mapping-like object whose ``items()`` yields
            ``(impression_id, news_list)`` pairs, where ``news_list`` holds
            "<news_id>-<label>" tokens.
        output_file: path of the file to write; one line per impression in the
            form "<impression_id> <JSON list of integer labels>".
    """
    with open(output_file, "w") as truth_out:
        for impression_id, news_list in validation_impressions.items():
            # The label is the part after the first "-" of each token
            labels = []
            for token in news_list:
                labels.append(int(token.split("-")[1]))
            truth_out.write(f"{impression_id} {json.dumps(labels)}\n")

    print(f"✅ Truth file '{output_file}' successfully created.")

# Index the validation impressions by ImpressionId and split each impression
# string into its individual tokens before writing the truth file
impressions_by_id = behaviors_val.set_index('ImpressionId')['Impressions']
tokenized_impressions = impressions_by_id.apply(lambda tokens: tokens.split())
generate_truth_file(tokenized_impressions, output_file="truth_val.txt")

✅ Truth file 'truth_val.txt' successfully created.
