In [None]:
# 01_data_cleaning.ipynb - Colab-Compatible Version (No Drive, Local Runtime)

import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
import os
from tqdm import tqdm

nltk.download('punkt')
from nltk.tokenize import sent_tokenize

tqdm.pandas()

# === PATH SETUP FOR LOCAL COLAB RUNTIME ===
# Directory that receives the cleaned CSV files written at the end of this cell.
PROCESSED_DIR = "/home/data/processed"
# One raw-data directory per split; news.tsv and behaviors.tsv are read from each.
RAW_DIRS = ["/home/data/raw/train", "/home/data/raw/dev", "/home/data/raw/test"]
# exist_ok=True keeps a re-run from failing when the directory is already there.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# === HELPER: CLEAN TEXT ===
def clean_text(text):
    """Return a plain-text version of ``text`` without markup or bracketed spans.

    Args:
        text: Raw cell value; may be a string, NaN/None, or anything ``str()`` accepts.

    Returns:
        The cleaned string. ``""`` is returned for missing values and for
        values whose text starts with ``"http"``.
    """
    if pd.isna(text):
        return ""
    raw = str(text)
    if raw.startswith("http"):
        return ""  # skip URLs mistakenly passed
    # Parse as HTML and keep only the text content.
    plain = BeautifulSoup(raw, "html.parser").get_text()
    # Remove "[...]" spans BEFORE collapsing whitespace: doing it afterwards
    # left a double space wherever a bracketed span sat between two words.
    plain = re.sub(r'\[[^\]]*\]', '', plain)
    plain = re.sub(r'\s+', ' ', plain)
    return plain.strip()

# === LOAD AND COMBINE ARTICLES ===
# MIND's news.tsv has EIGHT tab-separated fields per row. With only seven
# names, pandas silently used the first field (the real article id) as the
# index and shifted every other column one place to the left: "id" then held
# the category and "abstract" held the URL. The recorded run shows exactly
# that symptom (17 "unique articles", URL values in the abstract column).
# The former single "entity" column is replaced by the two entity columns.
# NOTE(review): rows containing unbalanced double quotes may still be merged or
# skipped under the default quoting rules - consider quoting=csv.QUOTE_NONE.
NEWS_COLUMNS = [
    "id", "category", "subcategory", "title", "abstract", "url",
    "title_entities", "abstract_entities",
]

print("📥 Loading all article files...")
all_articles = []
for dir_path in RAW_DIRS:
    path = os.path.join(dir_path, "news.tsv")
    print(f"🔹 Reading from: {path}")
    # header=None: the first line is data; column names come from NEWS_COLUMNS.
    df = pd.read_csv(path, sep='\t', header=None,
                     names=NEWS_COLUMNS,
                     on_bad_lines='skip', encoding='utf-8')
    print(f"  ✔️ Loaded {df.shape[0]} rows.")
    all_articles.append(df)
# The same article can appear in several splits; keep the first occurrence of each id.
articles_df = pd.concat(all_articles).drop_duplicates(subset='id').reset_index(drop=True)
print("🧹 Cleaning abstracts...")
articles_df['abstract_clean'] = articles_df['abstract'].progress_apply(clean_text)
print("📊 Total unique articles:", articles_df.shape)

# === LOAD AND COMBINE BEHAVIORS ===
# Read behaviors.tsv from every raw directory and stack the frames into one table.
print("📥 Loading all behavior logs...")
all_behaviors = []
for dir_path in RAW_DIRS:
    path = os.path.join(dir_path, "behaviors.tsv")
    print(f"🔹 Reading from: {path}")
    df = pd.read_csv(
        path,
        sep='\t',
        header=None,  # first line is treated as data; names are supplied below
        names=["impression_id", "user_id", "timestamp", "history", "impressions"],
        on_bad_lines='skip',  # malformed rows are dropped without an error
        encoding='utf-8',
    )
    print(f"  ✔️ Loaded {len(df)} rows.")
    all_behaviors.append(df)
# ignore_index=True renumbers the rows 0..n-1 across the stacked frames.
behaviors_df = pd.concat(all_behaviors, ignore_index=True)
print("📊 Total behavior logs:", behaviors_df.shape)

# === PARSE CLICKS ===
# NOTE: the log message below says "fully vectorized", but this is a plain
# Python loop over the 'impressions' column.
print("🔄 Parsing impressions to extract clicked and all articles (fully vectorized)...")

clicked_articles = []
all_articles_list = []

for impression in tqdm(behaviors_df['impressions'], desc="Parsing impressions"):
    # A missing cell is NaN; str(NaN) is "nan", which used to be recorded as
    # a bogus article id. Treat a missing impression as "no items" instead.
    items = [] if pd.isna(impression) else str(impression).split()
    # Each item is split on '-' and the first part is kept as the article id;
    # items ending in "-1" are the ones counted as clicked.
    clicked = [i.split('-')[0] for i in items if i.endswith('-1')]
    all_ids = [i.split('-')[0] for i in items]
    clicked_articles.append(clicked)
    all_articles_list.append(all_ids)

# One list per behavior row, in row order.
behaviors_df['clicked_articles'] = clicked_articles
behaviors_df['all_articles'] = all_articles_list

# === EXPORT CLEANED DATA ===
# Write both frames to PROCESSED_DIR as CSV without the row index.
# List-valued columns are stored in their string form and must be parsed back on load.
articles_csv_path = os.path.join(PROCESSED_DIR, "cleaned_articles.csv")
behaviors_csv_path = os.path.join(PROCESSED_DIR, "cleaned_behaviors.csv")
articles_df.to_csv(articles_csv_path, index=False)
behaviors_df.to_csv(behaviors_csv_path, index=False)

print("✅ Cleaned articles and behavior logs saved to:", PROCESSED_DIR)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


📥 Loading all article files...
🔹 Reading from: /home/data/raw/train/news.tsv
  ✔️ Loaded 38189 rows.
🔹 Reading from: /home/data/raw/dev/news.tsv
  ✔️ Loaded 67583 rows.
🔹 Reading from: /home/data/raw/test/news.tsv
  ✔️ Loaded 92132 rows.
🧹 Cleaning abstracts...


100%|██████████| 17/17 [00:00<00:00, 28016.96it/s]

📊 Total unique articles: (17, 8)
📥 Loading all behavior logs...
🔹 Reading from: /home/data/raw/train/behaviors.tsv





  ✔️ Loaded 47954 rows.
🔹 Reading from: /home/data/raw/dev/behaviors.tsv
  ✔️ Loaded 1747 rows.
🔹 Reading from: /home/data/raw/test/behaviors.tsv
  ✔️ Loaded 110667 rows.
📊 Total behavior logs: (160368, 5)
🔄 Parsing impressions to extract clicked and all articles (fully vectorized)...


Parsing impressions: 100%|██████████| 160368/160368 [00:03<00:00, 48865.47it/s]


✅ Cleaned articles and behavior logs saved to: /home/data/processed


In [None]:


# Import required libraries
import pandas as pd
import os
from transformers import BertTokenizer
from tqdm import tqdm

# Set up directories (if not already set)
RAW_DIRS = [
    "/home/data/raw/train",
    "/home/data/raw/dev",
    "/home/data/raw/test"
]
PROCESSED_DIR = "/home/data/processed"
os.makedirs(PROCESSED_DIR, exist_ok=True)

# === Load Cleaned Data ===
# Load the cleaned articles and behaviors from PROCESSED_DIR.
articles_df = pd.read_csv(os.path.join(PROCESSED_DIR, "cleaned_articles.csv"))
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The chunked default raised a warning on this read
# in the recorded run (presumably a mixed-dtype DtypeWarning - confirm).
behaviors_df = pd.read_csv(os.path.join(PROCESSED_DIR, "cleaned_behaviors.csv"), low_memory=False)

print("📊 Loaded data:", articles_df.shape, behaviors_df.shape)

# === Tokenizer Setup ===
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # Using BERT base uncased tokenizer

# Tokenization function with NaN handling
def tokenize_article(text):
    """Encode ``text`` with the module-level BERT ``tokenizer``.

    Returns an empty list for NaN/None or whitespace-only input. Otherwise
    returns the tokenizer's encoding, truncated and padded to 512 entries.
    """
    # pd.isna is checked first so .strip() is never called on a missing value.
    if not pd.isna(text) and text.strip() != "":
        return tokenizer.encode(text, truncation=True, padding='max_length', max_length=512)
    return []

# Apply tokenization to the article abstracts.
# Missing abstracts are passed through as NaN so tokenize_article returns [].
# Wrapping every value in str() turned NaN into the literal text "nan", which
# slipped past the NaN check and was tokenized like a real abstract.
articles_df['tokenized_abstract'] = articles_df['abstract_clean'].apply(
    lambda x: tokenize_article(x if pd.isna(x) else str(x))
)

# === Map User Histories to Tokenized Articles ===
# Create a mapping of article ID to tokenized article
article_mapping = dict(zip(articles_df['id'], articles_df['tokenized_abstract']))

# Function to map user article history to tokenized articles
def get_article_tokens_for_user(article_ids):
    """Look up every id of ``article_ids`` in the module-level ``article_mapping``.

    Returns a list with one entry per input id, in input order; ids missing
    from the mapping contribute an empty list.
    """
    token_lists = []
    for current_id in article_ids:
        token_lists.append(article_mapping.get(current_id, []))
    return token_lists

# Apply the mapping for clicked and all articles.
# The list columns come back from the CSV as strings (e.g. "['N1', 'N2']").
# ast.literal_eval only parses Python literals, so unlike eval() it cannot
# execute code that happens to be embedded in the file.
import ast

behaviors_df['clicked_articles_tokens'] = behaviors_df['clicked_articles'].apply(
    lambda x: get_article_tokens_for_user(ast.literal_eval(x))
)
behaviors_df['all_articles_tokens'] = behaviors_df['all_articles'].apply(
    lambda x: get_article_tokens_for_user(ast.literal_eval(x))
)

# === Save Processed Data ===
# Save the processed articles and behaviors
articles_df.to_csv(os.path.join(PROCESSED_DIR, "tokenized_articles.csv"), index=False)
behaviors_df.to_csv(os.path.join(PROCESSED_DIR, "processed_behaviors_with_tokens.csv"), index=False)

print("✅ Tokenized articles and behavior logs saved to:", PROCESSED_DIR)


  behaviors_df = pd.read_csv(os.path.join(PROCESSED_DIR, "cleaned_behaviors.csv"))


📊 Loaded data: (17, 8) (160368, 7)
✅ Tokenized articles and behavior logs saved to: /home/data/processed


In [None]:
import pandas as pd
import ast

# Step 1: Load the cleaned articles and make the 'abstract_clean' column NaN-free.
cleaned_articles_df = pd.read_csv('/home/data/processed/cleaned_articles.csv')  # Replace with actual path

# Empty abstracts are read back from CSV as NaN; turn them into empty strings
# so the whitespace tokenizer below can call .split() on every value.
abstract_column = cleaned_articles_df['abstract_clean']
cleaned_articles_df['abstract_clean'] = abstract_column.fillna('')

# Whitespace tokenizer for the 'abstract_clean' column.
# NOTE: this reuses the name of the BERT-based tokenize_article defined in an
# earlier cell and replaces it once this cell has run.
def tokenize_article(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens

# Tokenize every cleaned abstract into a list of whitespace-separated tokens.
clean_abstracts = cleaned_articles_df['abstract_clean']
cleaned_articles_df['tokenized_abstract'] = clean_abstracts.apply(tokenize_article)

# Save the tokenized articles to CSV (replace with actual path).
# NOTE: the BERT tokenization cell writes to this same location, so this file replaces that one.
tokenized_csv_path = '/home/data/processed/tokenized_articles.csv'
cleaned_articles_df.to_csv(tokenized_csv_path, index=False)

print("✅ Tokenization complete. Tokenized articles saved.")

# Step 2: Load the cleaned user behavior DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The default raised a warning on this read in the
# recorded run (presumably a mixed-dtype DtypeWarning - confirm).
cleaned_behaviours_df = pd.read_csv('/home/data/processed/cleaned_behaviors.csv', low_memory=False)  # Replace with actual path

# Step 3: Clean and convert clicked_articles and history into lists
# Convert 'clicked_articles' from string representation of a list into an actual list
cleaned_behaviours_df['clicked_articles'] = cleaned_behaviours_df['clicked_articles'].apply(ast.literal_eval)

# Handle NaN values in 'history' column and convert to list
cleaned_behaviours_df['history'] = cleaned_behaviours_df['history'].fillna('')

# Convert 'history' from space-separated string of article IDs into a list
# (the isinstance check keeps any non-string value from reaching .split()).
cleaned_behaviours_df['history'] = cleaned_behaviours_df['history'].apply(lambda x: x.split() if isinstance(x, str) else [])

# Preview the cleaned user histories
print("✅ Cleaned User Histories Sample:")
print(cleaned_behaviours_df[['user_id', 'clicked_articles', 'history']].head())

# Step 4: Extract User Histories and create a sample
user_histories = cleaned_behaviours_df[['user_id', 'clicked_articles', 'history']]

print("✅ User Histories Sample:")
print(user_histories.head())


✅ Tokenization complete. Tokenized articles saved.


  cleaned_behaviours_df = pd.read_csv('/home/data/processed/cleaned_behaviors.csv')  # Replace with actual path


✅ Cleaned User Histories Sample:
   user_id                  clicked_articles  \
0   U87243  [N94157, N78699, N71090, N31174]   
1  U598644                  [N25587, N36266]   
2  U532401                          [N47925]   
3  U593596                         [N114935]   
4  U239687                          [N86258]   

                                             history  
0  [N8668, N39081, N65259, N79529, N73408, N43615...  
1  [N56056, N8726, N70353, N67998, N83823, N11110...  
2  [N128643, N87446, N122948, N9375, N82348, N129...  
3  [N31043, N39592, N4104, N8223, N114581, N92747...  
4  [N65250, N122359, N71723, N53796, N41663, N414...  
✅ User Histories Sample:
   user_id                  clicked_articles  \
0   U87243  [N94157, N78699, N71090, N31174]   
1  U598644                  [N25587, N36266]   
2  U532401                          [N47925]   
3  U593596                         [N114935]   
4  U239687                          [N86258]   

                                  

In [None]:
import pandas as pd
import ast
import re
import os

# Paths: this cell's output file first, then the two CSV inputs it reads.
output_path = '/home/data/processed/cleaned_user_histories.csv'
articles_path = '/home/data/processed/cleaned_articles.csv'
behaviors_path = '/home/data/processed/cleaned_behaviors.csv'

# --- Safe tokenizer (no nltk) ---
def simple_tokenizer(text):
    """Lower-case ``text`` and return every run of word characters as a list."""
    lowered = text.lower()
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(lowered)

# Load cleaned articles
cleaned_articles_df = pd.read_csv(articles_path)

if 'abstract_clean' in cleaned_articles_df.columns:
    # Empty abstracts come back from CSV as NaN; normalise to '' and force str
    # so simple_tokenizer can call .lower() on every value.
    cleaned_articles_df['abstract_clean'] = cleaned_articles_df['abstract_clean'].fillna('').astype(str)
    cleaned_articles_df['abstract_tokens'] = cleaned_articles_df['abstract_clean'].apply(simple_tokenizer)
    # The tokens are only kept in memory: this cell never writes the articles
    # frame to disk, so the message no longer claims that it was saved.
    print("✅ Tokenization complete using regex.")
else:
    print("⚠️ 'abstract_clean' column not found in article data.")

# Load cleaned behaviors (low_memory=False: dtypes are inferred from the whole file)
cleaned_behaviours_df = pd.read_csv(behaviors_path, low_memory=False)
available_columns = cleaned_behaviours_df.columns

# Convert strings to lists
if 'clicked_articles' in available_columns:
    # Stored as the string form of a list; anything that is not a string becomes [].
    cleaned_behaviours_df['clicked_articles'] = cleaned_behaviours_df['clicked_articles'].apply(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else []
    )

if 'history' in available_columns:
    # Missing history -> '' -> [] after splitting on whitespace.
    history_text = cleaned_behaviours_df['history'].fillna('').astype(str)
    cleaned_behaviours_df['history'] = history_text.apply(str.split)

# Preview
history_columns = ['user_id', 'clicked_articles', 'history']
print("✅ Cleaned User Histories Sample:")
print(cleaned_behaviours_df[history_columns].head())

# Save output
cleaned_behaviours_df[history_columns].to_csv(output_path, index=False)
print(f"✅ Cleaned user histories saved to: {output_path}")


✅ Tokenization complete using regex. Tokenized articles saved.
✅ Cleaned User Histories Sample:
   user_id                  clicked_articles  \
0   U87243  [N94157, N78699, N71090, N31174]   
1  U598644                  [N25587, N36266]   
2  U532401                          [N47925]   
3  U593596                         [N114935]   
4  U239687                          [N86258]   

                                             history  
0  [N8668, N39081, N65259, N79529, N73408, N43615...  
1  [N56056, N8726, N70353, N67998, N83823, N11110...  
2  [N128643, N87446, N122948, N9375, N82348, N129...  
3  [N31043, N39592, N4104, N8223, N114581, N92747...  
4  [N65250, N122359, N71723, N53796, N41663, N414...  
✅ Cleaned user histories saved to: /home/data/processed/cleaned_user_histories.csv


In [None]:
# Uses the google.colab package, so this cell presumably only runs inside a Colab runtime.
from google.colab import files
# Offer the cleaned behavior log written by the data-cleaning cell as a download.
files.download('/home/data/processed/cleaned_behaviors.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install evaluate
!pip install transformers evaluate



Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.18-py311-none-any.whl.metadata (7.5 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [None]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
from tqdm import tqdm

# Load the cleaned articles CSV (make sure this file is uploaded).
# NOTE(review): this path differs from the /home/data/processed directory the
# earlier cells write to - confirm it points at the intended file.
cleaned_articles_df = pd.read_csv(
    '/content/sample_data/processed/cleaned_articles.csv'
)

# Load the BART tokenizer and model from the same checkpoint name.
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Define summarization function
def generate_summary(text, model, tokenizer, max_input_length=1024, max_output_length=100):
    """Summarize ``text`` with ``model`` and return the decoded summary string.

    Args:
        text: Text to summarize. NaN/None, non-string, or whitespace-only
            values yield ``""`` without calling the model.
        model: Object exposing ``generate(inputs, **options)``.
        tokenizer: Object exposing ``encode(...)`` and ``decode(...)``.
        max_input_length: ``max_length`` passed to ``tokenizer.encode`` (with truncation).
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, or ``""`` for unusable input.
    """
    # Guards run in this order so .strip() is only reached for real strings.
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        return ""
    if text.strip() == "":
        return ""

    encoded_input = tokenizer.encode(
        text, return_tensors="pt", max_length=max_input_length, truncation=True
    )
    generation_options = dict(
        max_length=max_output_length,
        min_length=25,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    summary_ids = model.generate(encoded_input, **generation_options)
    # Only the first generated sequence is decoded.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize every 'abstract_clean' value, one article at a time, with a progress bar.
print("⚙️ Generating summaries using BART...")
summaries = [
    generate_summary(abstract_text, model, tokenizer)
    for abstract_text in tqdm(cleaned_articles_df['abstract_clean'])
]

# Attach the summaries as a new column (same order as the rows).
cleaned_articles_df['summary'] = summaries

# Save to a new CSV file (relative path: lands in the current working directory)
output_path = 'summarized_articles_bart.csv'
cleaned_articles_df.to_csv(output_path, index=False)

print(f"✅ All summaries saved to {output_path}")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

⚙️ Generating summaries using BART...


100%|██████████| 17/17 [00:00<00:00, 63550.06it/s]

✅ All summaries saved to summarized_articles_bart.csv





In [None]:
import pandas as pd

# Sanity check: list the columns and compare the first raw vs. cleaned abstracts.
df = pd.read_csv("/content/sample_data/processed/cleaned_articles.csv")
print(df.columns)
abstract_preview = df[['abstract', 'abstract_clean']].head()
print(abstract_preview)


Index(['id', 'category', 'subcategory', 'title', 'abstract', 'url', 'entity',
       'abstract_clean'],
      dtype='object')
                                        abstract  abstract_clean
0  https://assets.msn.com/labs/mind/AAGH0ET.html             NaN
1  https://assets.msn.com/labs/mind/AABmf2I.html             NaN
2  https://assets.msn.com/labs/mind/AAB19MK.html             NaN
3  https://assets.msn.com/labs/mind/AAJ4lap.html             NaN
4  https://assets.msn.com/labs/mind/AAJwoxD.html             NaN
