In [4]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
import os
from tqdm import tqdm

# Download necessary NLTK data
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

tqdm.pandas()

# === PATH SETUP FOR LOCAL COLAB RUNTIME ===
RAW_DIRS = [
    "/content/sample_data/raw/train",
    "/content/sample_data/raw/dev",
    "/content/sample_data/raw/test"
]
PROCESSED_DIR = "/content/sample_data/processed"
os.makedirs(PROCESSED_DIR, exist_ok=True)

# === HELPER: CLEAN TEXT ===
def clean_text(text):
    if pd.isna(text): return ""
    if str(text).startswith("http"):
        return ""  # skip URLs mistakenly passed
    text = BeautifulSoup(str(text), "html.parser").get_text()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[[^\]]*\]', '', text)
    return text.strip()

# === LOAD AND COMBINE ARTICLES ===
print("📥 Loading all article files...")
all_articles = []

for dir_path in RAW_DIRS:
    path = os.path.join(dir_path, "news.tsv")
    print(f"🔹 Reading from: {path}")

    temp_df = pd.read_csv(path, sep='\t', header=None, on_bad_lines='skip', encoding='utf-8')
    col_count = temp_df.shape[1]

    if col_count == 6:
        temp_df.columns = ["id", "category", "subcategory", "title", "url", "entity"]
        temp_df["abstract"] = ""  # If abstract column is missing
    elif col_count == 7:
        temp_df.columns = ["id", "category", "subcategory", "title", "abstract", "url", "entity"]
    elif col_count == 8:
        temp_df.columns = ["id", "category", "subcategory", "title", "abstract", "url", "entity", "misc"]
    else:
        raise ValueError(f"❌ Unexpected number of columns: {col_count} in file {path}")

    print(f"  ✔️ Loaded {temp_df.shape[0]} rows with {col_count} columns.")
    all_articles.append(temp_df)

# Concatenate all article dataframes and remove duplicates
articles_df = pd.concat(all_articles).drop_duplicates(subset='id').reset_index(drop=True)

# === CLEAN THE ABSTRACTS ===
print("🧹 Cleaning abstracts...")
articles_df['abstract_clean'] = articles_df['abstract'].progress_apply(clean_text)
print(f"📊 Total unique articles: {articles_df.shape}")

# === LOAD AND COMBINE BEHAVIORS ===
print("📥 Loading all behavior logs...")
all_behaviors = []
for dir_path in RAW_DIRS:
    path = os.path.join(dir_path, "behaviors.tsv")
    print(f"🔹 Reading from: {path}")
    df = pd.read_csv(path, sep='\t', header=None,
                     names=["impression_id", "user_id", "timestamp", "history", "impressions"],
                     on_bad_lines='skip', encoding='utf-8')
    print(f"  ✔️ Loaded {df.shape[0]} rows.")
    all_behaviors.append(df)
behaviors_df = pd.concat(all_behaviors).reset_index(drop=True)
print(f"📊 Total behavior logs: {behaviors_df.shape}")

# === PARSE CLICKS (Fully Vectorized) ===
print("🔄 Parsing impressions to extract clicked and all articles (fully vectorized)...")

clicked_articles = []
all_articles_list = []

for impression in tqdm(behaviors_df['impressions'], desc="Parsing impressions"):
    items = str(impression).split()
    clicked = [i.split('-')[0] for i in items if i.endswith('-1')]
    all_ids = [i.split('-')[0] for i in items]
    clicked_articles.append(clicked)
    all_articles_list.append(all_ids)

behaviors_df['clicked_articles'] = clicked_articles
behaviors_df['all_articles'] = all_articles_list

# === EXPORT CLEANED DATA ===
articles_df.to_csv(os.path.join(PROCESSED_DIR, "cleaned_articles.csv"), index=False)
behaviors_df.to_csv(os.path.join(PROCESSED_DIR, "cleaned_behaviors.csv"), index=False)

print("✅ Cleaned articles and behavior logs saved to:", PROCESSED_DIR)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


📥 Loading all article files...
🔹 Reading from: /content/sample_data/raw/train/news.tsv
  ✔️ Loaded 8335 rows with 8 columns.
🔹 Reading from: /content/sample_data/raw/dev/news.tsv
  ✔️ Loaded 15019 rows with 8 columns.
🔹 Reading from: /content/sample_data/raw/test/news.tsv
  ✔️ Loaded 9710 rows with 8 columns.
🧹 Cleaning abstracts...


100%|██████████| 16893/16893 [00:01<00:00, 12974.06it/s]


📊 Total unique articles: (16893, 9)
📥 Loading all behavior logs...
🔹 Reading from: /content/sample_data/raw/train/behaviors.tsv
  ✔️ Loaded 194664 rows.
🔹 Reading from: /content/sample_data/raw/dev/behaviors.tsv
  ✔️ Loaded 18817 rows.
🔹 Reading from: /content/sample_data/raw/test/behaviors.tsv
  ✔️ Loaded 10291 rows.
📊 Total behavior logs: (223772, 5)
🔄 Parsing impressions to extract clicked and all articles (fully vectorized)...


Parsing impressions: 100%|██████████| 223772/223772 [00:04<00:00, 50206.00it/s]


✅ Cleaned articles and behavior logs saved to: /content/sample_data/processed


In [3]:
pd.read_csv("/content/sample_data/raw/train/news.tsv", sep='\t', header=None).head()


Unnamed: 0,0,1,2,3,4,5,6,7
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [5]:
# Install transformers if you haven't already
!pip install transformers

# Import necessary libraries
from transformers import BertTokenizer
import torch

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the cleaned abstracts
def tokenize_abstracts(text):
    if pd.isna(text) or text == "":
        return []
    # Tokenizing the cleaned text
    tokens = tokenizer.tokenize(text)
    return tokens

# Apply tokenization to the 'abstract_clean' column
articles_df['abstract_tokens'] = articles_df['abstract_clean'].apply(tokenize_abstracts)

# Optional: Tokenize into input IDs (for model input)
def tokenize_to_ids(text):
    if pd.isna(text) or text == "":
        return []
    # Convert tokens into token ids for BERT input
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    return input_ids

# Apply to get input IDs for each abstract
articles_df['abstract_input_ids'] = articles_df['abstract_clean'].apply(tokenize_to_ids)

# Display the first few rows to check the tokenization
print(articles_df[['id', 'abstract_clean', 'abstract_tokens', 'abstract_input_ids']].head())




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (597 > 512). Running this sequence through the model will result in indexing errors


       id                                     abstract_clean  \
0  N88753  Shop the notebooks, jackets, and more that the...   
1  N45436  Apple's new iPad releases bring big deals on l...   
2  N23144  These seemingly harmless habits are holding yo...   
3  N86255                                                      
4  N93187  Lt. Ivan Molchanets peeked over a parapet of s...   

                                     abstract_tokens  \
0  [shop, the, notebook, ##s, ,, jackets, ,, and,...   
1  [apple, ', s, new, ipad, releases, bring, big,...   
2  [these, seemingly, harmless, habits, are, hold...   
3                                                 []   
4  [lt, ., ivan, mo, ##lch, ##ane, ##ts, peeked, ...   

                                  abstract_input_ids  
0  [101, 4497, 1996, 14960, 2015, 1010, 17764, 10...  
1  [101, 6207, 1005, 1055, 2047, 25249, 7085, 328...  
2  [101, 2122, 9428, 19741, 14243, 2024, 3173, 20...  
3                                                 []  
4  

In [7]:
!pip install transformers
!pip install torch  # if PyTorch is not already installed


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [15]:
!pip install rouge_score



Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9a9f5d8590085879423b7756a3608e48b687d92ae369d3850202e15d329a1266
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [16]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import torch
import evaluate

# Load metrics

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# Model
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load data
cleaned_articles_df = pd.read_csv('/content/sample_data/processed/cleaned_articles.csv')
texts = cleaned_articles_df['abstract_clean'].fillna("").astype(str).tolist()

# Batched summarization
def generate_batch_summaries(texts, batch_size=16, max_input_len=512, max_output_len=100):
    summaries = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        inputs = ["summarize: " + text for text in batch_texts]
        encodings = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_input_len).to(device)
        with torch.no_grad():
            outputs = model.generate(
                **encodings,
                max_length=max_output_len,
                min_length=25,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        summaries.extend(decoded)
    return summaries

print("⚡ Generating summaries using T5-small...")
summaries = generate_batch_summaries(texts)

# Save summaries
cleaned_articles_df['summary'] = summaries
output_path = 'summarized_articles_t5.csv'
cleaned_articles_df.to_csv(output_path, index=False)
print(f"✅ Summaries saved to {output_path}")

# Evaluation
references = [ref.strip() for ref in cleaned_articles_df['abstract_clean']]
predictions = [pred.strip() for pred in summaries]

# ROUGE
rouge_result = rouge.compute(predictions=predictions, references=references, use_stemmer=True)
print("\n📊 ROUGE Scores:")
for k, v in rouge_result.items():
    print(f"{k}: {v.mid.fmeasure:.4f}")

# BLEU (requires list of references)
bleu_refs = [[ref.split()] for ref in references]
bleu_preds = [pred.split() for pred in predictions]
bleu_result = bleu.compute(predictions=bleu_preds, references=bleu_refs)
print(f"\n📊 BLEU Score: {bleu_result['bleu']:.4f}")

# Summary length
summary_lengths = [len(s.split()) for s in summaries]
avg_length = sum(summary_lengths) / len(summary_lengths)
print(f"\n✏️ Average Summary Length: {avg_length:.2f} words")


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

⚡ Generating summaries using T5-small...


100%|██████████| 1056/1056 [16:21<00:00,  1.08it/s]


✅ Summaries saved to summarized_articles_t5.csv


AttributeError: 'float' object has no attribute 'strip'

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>