# Import necessary libraries


In [5]:
from pathlib import Path
import os
import pandas as pd
import time
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from codecarbon import EmissionsTracker
from datetime import datetime


# Download NLTK Resources

In [6]:
# import nltk
# import ssl

# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# nltk.download('stopwords')
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('maxent_ne_chunker_tab')
# nltk.download('words')
  

# Read the data

In [7]:
# Column names for the two kinds of headerless TSV files
col_behaviors = ["ImpressionId", "User", "Time", "History", "Impressions"]
col_news = [
    "NewsId", "Category", "SubCat", "Title",
    "Abstract", "url", "TitleEnt", "AbstractEnt",
]


def _load_tsv(path, columns):
    """Read a headerless tab-separated file, labelling its columns with `columns`."""
    return pd.read_csv(path, sep="\t", header=None, names=columns)


def _to_epoch_seconds(stamp):
    """Convert an 'MM/DD/YYYY hh:mm:ss AM/PM' string to epoch seconds (interpreted as local time)."""
    return time.mktime(time.strptime(stamp, "%m/%d/%Y %I:%M:%S %p"))


# Load the behaviors and news tables of every split
behaviors_train = _load_tsv("data/train/behaviors.tsv", col_behaviors)
news_train = _load_tsv("data/train/news.tsv", col_news)

behaviors_val = _load_tsv("data/validation/behaviors.tsv", col_behaviors)
news_val = _load_tsv("data/validation/news.tsv", col_news)

behaviors_test = _load_tsv("data/test/behaviors.tsv", col_behaviors)
news_test = _load_tsv("data/test/news.tsv", col_news)

# Concatenate the train and validation splits row-wise (row labels are not reset)
behaviors_train_val = pd.concat([behaviors_train, behaviors_val])
news_train_val = pd.concat([news_train, news_val])

# Combined behaviors: add a numeric timestamp and order the rows chronologically
behaviors_train_val["Timestamp"] = behaviors_train_val["Time"].apply(_to_epoch_seconds)
behaviors_train_val = behaviors_train_val.sort_values(by="Timestamp")

# Validation behaviors: same timestamp conversion and chronological ordering
behaviors_val["Timestamp"] = behaviors_val["Time"].apply(_to_epoch_seconds)
behaviors_val = behaviors_val.sort_values(by="Timestamp")


# Step 1: Setup Carbon Emissions Tracking

In [8]:
# Configure the emissions tracker; its output directory is "emissions"
tracker = EmissionsTracker(
    project_name="news_recommendation_ctr_baseline",
    output_dir="emissions",
    log_level="critical",
)
# Everything executed from here until tracker.stop() is measured
tracker.start()

# Step 2: Preprocessing

### Step 2.1: Check for missing values and fill them

In [9]:
# Show the number of missing values in each column of the validation news table
print(news_val.isnull().sum())

# Replace every missing value with an empty string so that the text columns
# can be concatenated later without producing NaN
news_val = news_val.fillna(value='')

NewsId            0
Category          0
SubCat            0
Title             0
Abstract       2021
url               0
TitleEnt          2
AbstractEnt       2
dtype: int64


### Step 2.2: Create a combined feature of title, abstract, and categories


In [10]:
# Build one text field per article: title, abstract, category and sub-category,
# joined by single spaces
news_val["Combined"] = (
    news_val["Title"]
    + " " + news_val["Abstract"]
    + " " + news_val["Category"]
    + " " + news_val["SubCat"]
)

### Step 2.3: Tokenization, stopword and punctuation removal 

In [11]:
# English stopwords from the NLTK corpus, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Individual punctuation characters. A set makes the membership test operate on
# single characters; testing `token in string.punctuation` would be a substring
# test and would let tokens such as "..." or "--" slip through.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def preprocess_text(text):
    """
    Tokenizes a text and removes punctuation tokens and English stopwords.

    Args:
        text: The raw text to process (a string).

    Returns:
        A list of lowercase tokens in their original order, without stopwords
        and without tokens that consist solely of punctuation characters.
    """
    # Lowercase first so that the stopword comparison is case-insensitive
    tokens = word_tokenize(text.lower())
    return [
        token
        for token in tokens
        # Drop tokens made up only of punctuation, including multi-character
        # ones; tokens that merely contain punctuation (e.g. "'s") are kept
        if not all(char in _PUNCTUATION_CHARS for char in token)
        and token not in stop_words
    ]


# Apply the preprocessing function to the 'Combined' column and create a new column 'ProcessedCombined'
news_val['ProcessedCombined'] = news_val['Combined'].apply(preprocess_text)

# Step 3: TF-IDF Vectorization

In [12]:
# TF-IDF over uni-, bi- and tri-grams. The fractional min_df / max_df values are
# document-frequency proportions: terms outside that range are ignored.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=0.001,
    max_df=0.999,
    ngram_range=(1, 3),
)

# The vectorizer works on strings, so join every token list back into one string
news_val['ProcessedCombined'] = news_val['ProcessedCombined'].map(' '.join)

# Learn the vocabulary and produce the sparse document-term matrix in one step
text_vectors = tfidf.fit_transform(news_val['ProcessedCombined'])

# (number of documents, number of features)
print(text_vectors.shape)

(42416, 4724)


# Step 4: Compute similarity matrix

In [13]:
# Pairwise cosine similarity between all article vectors; with the second
# argument omitted, the matrix is compared against itself.
# NOTE(review): the result has one entry per pair of articles, so its size grows
# quadratically with the number of articles.
similarity_matrix = cosine_similarity(text_vectors)

# Step 5: Rank news articles based on similarity

In [14]:
def rank_news_for_user(user_id, impression_news, news_ids, similarity_matrix, behaviors=None):
    """
    Ranks news articles in an impression based on similarity to the user's previous clicked news.

    Args:
        user_id: Identifier of the user, matched against the "User" column.
        impression_news: List of news IDs shown in the impression.
        news_ids: Either a list of news IDs whose positions correspond to the
            rows/columns of `similarity_matrix`, or a ready-made dict mapping
            news ID -> position (avoids rebuilding the lookup on every call).
        similarity_matrix: 2-D array indexable as `matrix[row, [columns]]`.
        behaviors: Optional DataFrame with "User" and "History" columns.
            Defaults to the module-level `behaviors_val`.

    Returns:
        The news IDs of the impression ordered by descending mean similarity to
        the user's clicked news. Ties keep their original relative order. When
        the user is unknown or has no usable history, `impression_news` is
        returned unchanged.
    """
    if behaviors is None:
        behaviors = behaviors_val

    # A single scan finds the user's rows; an unknown user simply yields none
    user_histories = behaviors.loc[behaviors["User"] == user_id, "History"].values

    # Missing histories are NaN (a float), so one isinstance check covers both
    # "no history" and "invalid history type"
    if len(user_histories) == 0 or not isinstance(user_histories[0], str):
        return impression_news

    # Constant-time ID -> position lookups instead of repeated list scans
    if isinstance(news_ids, dict):
        index_of = news_ids
    else:
        index_of = {}
        for position, nid in enumerate(news_ids):
            # setdefault keeps the first position of a repeated ID, like list.index
            index_of.setdefault(nid, position)

    # Positions of the clicked articles that are present in the news table
    clicked_indices = [
        index_of[nid] for nid in user_histories[0].split() if nid in index_of
    ]

    scores = []
    for news_id in impression_news:
        news_idx = index_of.get(news_id)
        if news_idx is None:
            scores.append((news_id, 0))  # Default score if news is missing
            continue

        # Similarities between this candidate and every clicked article
        similarity_scores = similarity_matrix[news_idx, clicked_indices]
        # Mean similarity; 0 when none of the clicked articles is known
        avg_score = np.mean(similarity_scores) if len(similarity_scores) > 0 else 0
        scores.append((news_id, avg_score))

    # Stable sort: candidates with equal scores keep their impression order
    scores.sort(key=lambda pair: pair[1], reverse=True)

    return [news_id for news_id, _ in scores]


def rank_submission_format(user_id, impression_news, news_ids, similarity_matrix):
    """
    Formats the ranked news articles for submission.

    Args:
        user_id: Identifier of the user the impression belongs to.
        impression_news: List of news IDs in their original impression order.
        news_ids: News ID collection forwarded to `rank_news_for_user`.
        similarity_matrix: Similarity matrix forwarded to `rank_news_for_user`.

    Returns:
        A list with one 1-based rank per entry of `impression_news`, in the
        original impression order.
    """
    # Rank the news articles for the user
    ranked_news = rank_news_for_user(user_id, impression_news, news_ids, similarity_matrix)

    # Map each news ID to its rank once instead of searching the ranked list for
    # every article; setdefault keeps the first (best) rank of a repeated ID,
    # which is what list.index would return
    rank_of = {}
    for rank, news_id in enumerate(ranked_news, start=1):  # Rank starts from 1
        rank_of.setdefault(news_id, rank)

    return [rank_of[news_id] for news_id in impression_news]


# Step 6: Create submission file

In [15]:
def generate_prediction_file(similarity_matrix, output_file="prediction.txt"):
    """
    Generates a prediction.txt file with ranked news for each impression.

    Reads the module-level `behaviors_val` and `news_val` tables and writes one
    line per impression: the impression ID, a space, and the JSON list of
    1-based ranks in the original impression order.

    Args:
        similarity_matrix: Similarity matrix whose rows/columns follow the row
            order of `news_val`.
        output_file: Path of the file to write.

    Raises:
        ValueError: If the impression IDs are not unique.
    """
    impression_ids = behaviors_val["ImpressionId"]
    # Each impression must map to exactly one output line
    if not impression_ids.is_unique:
        raise ValueError("ImpressionId values must be unique")

    # News IDs in table order; positions correspond to similarity_matrix indices
    news_ids = news_val["NewsId"].tolist()

    with open(output_file, "w") as f:
        # Walk the needed columns in lockstep; no intermediate copies or dicts
        for impression_id, user_id, impressions in zip(
            impression_ids, behaviors_val["User"], behaviors_val["Impressions"]
        ):
            # Keep only the news ID of each entry (remove any suffix after '-')
            cleaned_news_list = [entry.split("-")[0] for entry in impressions.split()]

            # Rank the news articles for the user and get their positions
            ranked_positions = rank_submission_format(user_id, cleaned_news_list, news_ids, similarity_matrix)

            f.write(f"{impression_id} {json.dumps(ranked_positions)}\n")

    # Print a success message after the file is created
    print(f"✅ Prediction file '{output_file}' successfully created.")


# Step 7: Execute the code

In [16]:
# Rank every validation impression and write the result file
generate_prediction_file(
    similarity_matrix=similarity_matrix,
    output_file="prediction_val_tf_idf.txt",
)

✅ Prediction file 'prediction_val_tf_idf.txt' successfully created.


# Step 8: Output carbon emission report

In [17]:
# Stop tracking and get the emissions data.
# NOTE(review): stop() may return None when no measurement is available, so the
# value is formatted defensively instead of crashing on the ":.6f" format.
emissions = tracker.stop()
emissions_text = f"{emissions:.6f} kg CO2eq" if emissions is not None else "Not available"
print(f"💡 Carbon emissions from this run: {emissions_text}")

# Display detailed emissions information and write to txt.
# This part is best-effort: any failure is reported and does not abort the run.
try:
    # Load latest emissions entry (last row of the CSV)
    df = pd.read_csv("emissions/emissions.csv")
    emissions_data = df.iloc[-1]

    # Column names of the CSV, kept for interactive inspection
    available_columns = df.columns.tolist()

    # Prepare values
    duration_hr = emissions_data['duration'] / 3600  # 'duration' is divided by 3600 to report hours
    energy_kwh = emissions_data['energy_consumed']
    cpu_power = emissions_data['cpu_power']

    # Optional columns fall back to "Not available" when absent or NaN
    gpu_power = (
        f"{emissions_data['gpu_power']:.2f} W"
        if 'gpu_power' in emissions_data and not pd.isna(emissions_data['gpu_power'])
        else "Not available"
    )

    country = emissions_data['country_name'] if 'country_name' in emissions_data else "Not available"

    # NOTE(review): computed but not included in the console output or the report
    carbon_intensity = (
        f"{emissions_data['country_co2_eq_electricity']:.2f} gCO2eq/kWh"
        if 'country_co2_eq_electricity' in emissions_data and not pd.isna(emissions_data['country_co2_eq_electricity'])
        else "Not available"
    )

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Print to console
    print("\nDetailed emissions data:")
    print(f"- Duration: {duration_hr:.2f} hours")
    print(f"- Energy consumed: {energy_kwh:.4f} kWh")
    print(f"- CPU Power: {cpu_power:.2f} W")
    print(f"- GPU Power: {gpu_power}")
    print(f"- Country: {country}")

    # Create structured report text
    report = f"""\
📄 Emissions Report – {timestamp}
====================================
🌱 Total Emissions:     {emissions_text}

🕒 Duration:            {duration_hr:.2f} hours
⚡ Energy Consumed:     {energy_kwh:.4f} kWh
🧠 CPU Power:           {cpu_power:.2f} W
🎮 GPU Power:           {gpu_power}

🌍 Country:             {country}
====================================
"""

    # Ensure output directory exists
    os.makedirs("emissions", exist_ok=True)

    # Save to .txt file. The report contains non-ASCII characters, so the
    # encoding is fixed to UTF-8 rather than left to the platform default.
    with open("emissions/emissions_report_content_tf_idf.txt", "w", encoding="utf-8") as f:
        f.write(report)

except Exception as e:
    print(f"\n❗ Could not load detailed emissions data: {str(e)}")

💡 Carbon emissions from this run: 0.000295 kg CO2eq

Detailed emissions data:
- Duration: 1.22 hours
- Energy consumed: 0.0098 kWh
- CPU Power: 5.00 W
- GPU Power: 0.00 W
- Country: Norway


# Step 9: Create a truth file

In [18]:
# Generate ground truth file for validation set
def generate_truth_file(impressions, output_file="truth.txt"):
    """
    Writes the ground-truth click labels of every impression to a text file.

    Args:
        impressions: Object providing `.items()` that yields
            (impression ID, list of "<news id>-<label>" strings) pairs.
        output_file: Path of the file to write.

    Each output line holds the impression ID, a space, and the JSON list of
    integer labels taken from the part after the first "-" of every entry.
    """
    def _format_line(identifier, entries):
        # The click label is the second "-"-separated field of each entry
        clicks = [int(entry.split("-")[1]) for entry in entries]
        return f"{identifier} {json.dumps(clicks)}\n"

    with open(output_file, "w") as handle:
        handle.writelines(
            _format_line(identifier, entries)
            for identifier, entries in impressions.items()
        )

    print(f"✅ Truth file '{output_file}' successfully created.")

# Split every validation impression string into its "<news id>-<label>" entries,
# keyed by impression ID, and write the labels to the truth file
generate_truth_file(
    behaviors_val.set_index('ImpressionId')['Impressions'].map(
        lambda impression: impression.split()
    ),
    output_file="truth_val_1000.txt",
)


✅ Truth file 'truth_val_1000.txt' successfully created.
