# Final Project Codebook - Summarization

Aditya Kumar, Matthew Shull and Irina Lee

## Importing Libraries




In [None]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import torch

# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

#Import for Llama Model
from huggingface_hub import login
# Authenticate with the Hugging Face Hub without embedding the secret in the
# notebook: the token is read from the HF_TOKEN environment variable. When the
# variable is unset, `token=None` is passed and `login` asks for the token
# interactively instead.
login(token=os.environ.get("HF_TOKEN"))


Mounted at /content/drive


# Data Import and Visualizing

In [None]:
#Import CSV Files
# Location of train.csv inside the Drive folder mounted under /content/drive.
file_path_train = '/content/drive/My Drive/W266_Final Project/train.csv'
# Read the whole CSV into a DataFrame with pandas' default parsing options.
train_df = pd.read_csv(file_path_train)

In [None]:
# Display columns
print("Columns in train_df:", train_df.columns)
print()  # blank line separating the two reports

# Check for missing data
# isnull().sum() yields the number of null entries in each column.
print("Missing values:\n", train_df.isnull().sum())

Columns in train_df: Index(['id', 'article', 'highlights'], dtype='object')

Missing values:
 id            0
article       0
highlights    0
dtype: int64


In [None]:
# DataFrame.size is the total number of cells (rows x columns), not the row
# count; the row and column counts are reported separately by .shape.
print("Train Size: ", train_df.size)
print("Train Shape: ", train_df.shape)

Train Size:  861339
Train Shape:  (287113, 3)


# Creating 4 batches of 500 randomly selected articles from the main dataset

In [None]:
# Shuffle every row reproducibly (fixed seed) and renumber the index from 0
shuffled_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Keep only the first 2000 shuffled rows: enough for 4 batches of 500
desired_rows = 2000
shuffled_df = shuffled_df.iloc[:desired_rows]

# Cut the kept rows into consecutive 500-row slices
batches = [shuffled_df.iloc[start:start + 500] for start in range(0, 2000, 500)]

# Expose each slice under its own name
batch_1, batch_2, batch_3, batch_4 = batches

# Report the row count of every batch, one line per batch
print("\n".join(
    f"Batch {number} size: {frame.shape[0]}"
    for number, frame in enumerate(batches, start=1)
))

Batch 1 size: 500
Batch 2 size: 500
Batch 3 size: 500
Batch 4 size: 500


In [None]:
import re
def clean_text(text):
    """Collapse non-ASCII characters and whitespace into single spaces.

    Any maximal run made up of non-ASCII characters and/or whitespace
    characters (in any mix) is replaced by exactly one space, so a non-ASCII
    character sitting next to a space no longer leaves a double space behind.
    Case is preserved and the result is not stripped.

    Args:
        text: Value to clean; it is converted with ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        str: The cleaned text.
    """
    # One combined repeated group (rather than two alternated runs) guarantees
    # that adjacent non-ASCII and whitespace runs yield a single space.
    return re.sub(r"(?:[^\x00-\x7F]|\s)+", " ", str(text))

In [None]:
# Cleaned versions of the batches are collected here, in batch order
clean_batches = []
# Raw columns removed once their cleaned counterparts exist
columns_to_drop = ['id', 'article', 'highlights']

for i, batch in enumerate(batches, 1):
    print(f"Processing Batch {i}...")

    # Guard clause: both text columns must be present before cleaning
    if not {'article', 'highlights'}.issubset(batch.columns):
        raise KeyError("Required columns 'article' and 'highlights' are missing from the DataFrame.")

    # assign() builds a new frame, so the source batch is left untouched;
    # the raw columns are dropped in the same chain.
    batch = batch.assign(
        article_cleaned=batch['article'].apply(clean_text),
        highlights_cleaned=batch['highlights'].apply(clean_text),
    ).drop(columns=columns_to_drop, errors="ignore")

    clean_batches.append(batch)

# Bind each cleaned batch to its own variable
clean_batch_1, clean_batch_2, clean_batch_3, clean_batch_4 = clean_batches

# Final status line
print("All batches have been cleaned and stored successfully.")

Processing Batch 1...
Processing Batch 2...
Processing Batch 3...
Processing Batch 4...
All batches have been cleaned and stored successfully.


In [None]:
# Preview the first rows of the second cleaned batch (head() defaults to 5 rows).
clean_batch_2.head()

Unnamed: 0,article_cleaned,highlights_cleaned
500,By . Lizzie Parry for MailOnline . Experts hav...,Trading laws from 20th century limited hours s...
501,"While not everything that glitters is gold, th...",Deborah Lippmann unveils Gold Digger polish . ...
502,Demand is once again outweighing supply at App...,Apple has broken its record set in 2012 by sel...
503,Arsenal forward Theo Walcott has expressed del...,Arsenal beat Hull City 2-0 in their FA Cup cla...
504,"By . Talal Musa . PUBLISHED: . 12:39 EST, 22 M...","Crisp, clear menus and packed full of game mod..."


In [None]:
# (rows, columns) of the fourth cleaned batch.
clean_batch_4.shape

(500, 2)

# Llama 3.1 Model

In [None]:
# Define quantization configuration for efficiency
# IPython shell escape: quietly installs/upgrades bitsandbytes and flash_attn.
# No versions are pinned, so `-U` pulls whatever release is current.
# NOTE(review): bitsandbytes is presumably the backend behind BitsAndBytesConfig;
# nothing in the visible cells references flash_attn directly — confirm it is needed.
!pip install -q -U bitsandbytes flash_attn
from transformers import BitsAndBytesConfig
import torch
# 4-bit loading settings; this object is handed to the model loader through
# model_kwargs["quantization_config"] in the model-loading cell.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    # NOTE(review): the compute dtype is float16 here, while the pipeline is
    # created with torch_dtype=torch.bfloat16 — confirm the mismatch is intended.
    bnb_4bit_compute_dtype=torch.float16,
)

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.7/2.7 MB[0m [31m133.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for flash_attn (setup.py) ... [?25l[?25hdone


In [None]:
import os
from huggingface_hub import model_info, login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# The extra unpinned `!pip install -U bitsandbytes` that used to sit here was
# dropped: the quantization-config cell already installs bitsandbytes, and an
# upgrade issued after transformers has been imported cannot affect the
# modules that are already loaded.

# Log into Hugging Face Hub
# The token comes from the HF_TOKEN environment variable instead of a literal
# in the notebook; when it is unset, `token=None` makes `login` prompt for it.
login(token=os.environ.get("HF_TOKEN"))
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Note: this rebinds the name `pipeline` from the transformers factory function
# to the loaded text-generation pipeline object, which summarize_article uses.
pipeline = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16, "quantization_config": quantization_config},
    device_map="auto",
)
# Define the summarization function
def summarize_article(article_text):
    """Summarize one article with the module-level text-generation `pipeline`.

    A two-message chat (system instruction + the article as the user turn) is
    rendered to a prompt string with the tokenizer's chat template, passed to
    the pipeline with sampling enabled, and the generated text after the
    prompt is returned.

    Args:
        article_text: The article content used as the user message.

    Returns:
        str: The generated text with the prompt prefix removed and
        surrounding whitespace stripped.
    """
    system_instruction = (
        "You are an expert on summarizing. "
        "Please summarize the following content in 4-5 lines. "
        "Please provide a concise summary."
    )
    chat = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": article_text},
    ]

    tokenizer = pipeline.tokenizer
    # Render the chat as plain text, ending with the assistant-turn header
    prompt_text = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )

    # Generation stops on the tokenizer's EOS token only.
    # NOTE(review): an additional "<|eot_id|>" terminator was previously
    # commented out here — confirm it is not required for this checkpoint.
    stop_ids = [tokenizer.eos_token_id]

    generation = pipeline(
        prompt_text,
        max_new_tokens=256,
        eos_token_id=stop_ids,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    # The returned text starts with the prompt; keep only what follows it
    full_text = generation[0]["generated_text"]
    return full_text[len(prompt_text):].strip()

print("LLama model has been loaded")

LLama model has been loaded


## Function to generate summaries (with a progress count) for each batch

In [None]:
def generate_summaries(dataframe, text_column, summary_column, summarizer=None):
    """
    Generates summaries for a given column in a DataFrame.

    Each value of ``text_column`` is passed to ``summarizer``. A failure on an
    individual row is logged and recorded as the placeholder string
    "Error generating summary" so that one bad row does not abort the batch.

    Args:
        dataframe (pd.DataFrame): The DataFrame containing the text to summarize.
            It is modified in place (the summary column is added to it).
        text_column (str): The name of the column containing text to summarize.
        summary_column (str): The name of the column to store generated summaries.
        summarizer (callable, optional): Function mapping one text to its summary.
            Defaults to the notebook's ``summarize_article`` (the LLaMA model).
    Returns:
        pd.DataFrame: The same DataFrame object, with the summaries column added.
    Raises:
        KeyError: If ``text_column`` is not a column of ``dataframe``.
    """
    if summarizer is None:
        # Resolved at call time so the default follows the current definition
        summarizer = summarize_article

    # Select the column once, outside the try block: a wrong column name now
    # raises KeyError immediately instead of being swallowed for every row and
    # silently producing a column full of error placeholders.
    texts = dataframe[text_column]
    total = len(dataframe)

    summaries = []  # List to store summaries
    for counter, (idx, text) in enumerate(texts.items(), start=1):
        print(f"Processing summary {counter}/{total}...")  # Log progress
        try:
            summary = summarizer(text)
        except Exception as e:
            # Deliberate best-effort: keep going and mark the failed row
            print(f"Error at row {idx}: {e}")
            summary = "Error generating summary"  # Placeholder for errors
        summaries.append(summary)

    # Add the summaries as a new column to the DataFrame
    dataframe[summary_column] = summaries
    print(f"\nSummarization completed. Added '{summary_column}' column.")
    return dataframe

# Batch 1 Summaries

In [None]:
# Summarize every cleaned article of Batch 1
clean_batch_1 = generate_summaries(
    dataframe=clean_batch_1,
    text_column="article_cleaned",
    summary_column="llama_summary",
)
print("LLaMA summaries have been generated for Batch 1.")

# Export the summarized batch to Drive (the mount call is repeated before writing)
from google.colab import drive
drive.mount('/content/drive')
clean_batch_1.to_csv(
    "/content/drive/My Drive/W266_Final Project/clean_batch_1_output.csv",
    index=False,
)
print("Summaries for Batch 1 are generated and CSV is exported")

Summaries for Batch 1 are generated and CSV is exported


# Batch 2 Summaries




In [None]:
# Summarize every cleaned article of Batch 2
clean_batch_2 = generate_summaries(
    dataframe=clean_batch_2,
    text_column="article_cleaned",
    summary_column="llama_summary",
)
print("LLaMA summaries have been generated for Batch 2.")

# Export the summarized batch to Drive (the mount call is repeated before writing)
from google.colab import drive
drive.mount('/content/drive')
clean_batch_2.to_csv(
    "/content/drive/My Drive/W266_Final Project/clean_batch_2_output.csv",
    index=False,
)
print("Summaries for Batch 2 are generated and CSV is exported")

Summaries for Batch 2 are generated and CSV is exported


# Batch 3 Summaries


In [None]:
# Summarize every cleaned article of Batch 3
clean_batch_3 = generate_summaries(
    dataframe=clean_batch_3,
    text_column="article_cleaned",
    summary_column="llama_summary",
)
print("LLaMA summaries have been generated for Batch 3.")

# Export the summarized batch to Drive (the mount call is repeated before writing)
from google.colab import drive
drive.mount('/content/drive')
clean_batch_3.to_csv(
    "/content/drive/My Drive/W266_Final Project/clean_batch_3_output.csv",
    index=False,
)
print("Summaries for Batch 3 are generated and CSV is exported")

Summaries for Batch 3 are generated and CSV is exported


# Batch 4 Summaries

In [None]:
# Summarize every cleaned article of Batch 4
clean_batch_4 = generate_summaries(
    dataframe=clean_batch_4,
    text_column="article_cleaned",
    summary_column="llama_summary",
)
print("LLaMA summaries have been generated for Batch 4.")

# Export the summarized batch to Drive (the mount call is repeated before writing)
from google.colab import drive
drive.mount('/content/drive')
clean_batch_4.to_csv(
    "/content/drive/My Drive/W266_Final Project/clean_batch_4_output.csv",
    index=False,
)
print("Summaries for Batch 4 are generated and CSV is exported")

Summaries for Batch 4 are generated and CSV is exported


**END OF SUMMARIZATION NOTEBOOK**