# Data Science HW4
### Report: https://hackmd.io/@ohmygod0193/HJgdf3k7R
PyTorch pruning (torch.nn.utils.prune) tutorial: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html

In [3]:
import os
# Set WANDB_MODE to "disabled" so Weights & Biases logging is turned off for this run.
os.environ["WANDB_MODE"] = "disabled"
# Expose only GPU 0 to this process; change the index to use another GPU on a multi-GPU machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
import warnings
# Suppress all Python warnings for the rest of the session (this hides deprecation notices too).
warnings.filterwarnings('ignore')

In [5]:
import torch
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [6]:
from transformers import AutoTokenizer

# Path of the released full model; the model itself is loaded from it in a later cell.
checkpoint = "t5small_TextSummarization/"
# Identifier used only to load the tokenizer.
TK_ckpt = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(TK_ckpt)  # use the tokenizer from Hugging Face

In [7]:
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: mapping with a "text" list (source documents) and a
            "summary" list (reference summaries).

    Returns:
        The tokenizer output for the prefixed documents (truncated to 512),
        with an extra "labels" entry holding the input ids of the tokenized
        summaries (truncated to 128).
    """
    # Every document gets the task prefix prepended before tokenization.
    prompted_docs = list(map(lambda doc: prefix + doc, examples["text"]))
    model_inputs = tokenizer(prompted_docs, max_length=512, truncation=True)

    target_encoding = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [8]:
from transformers import DataCollatorForSeq2Seq

# NOTE(review): `model` receives the checkpoint path string here, not a loaded model object —
# confirm whether the collator needs the actual model instance.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [9]:
import evaluate

# ROUGE metric object; compute_metrics() calls rouge.compute on decoded text.
rouge = evaluate.load("rouge")

In [10]:
import numpy as np


def compute_metrics(eval_pred):
    """Decode predicted and reference token ids and score them with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays in which
            -100 marks positions to be ignored.

    Returns:
        Dict of the values returned by ``rouge.compute`` plus ``"gen_len"``
        (mean count of non-pad tokens per prediction), each rounded to 4
        decimal places.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    # -100 is not a real token id, so swap it for the pad token before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    result = rouge.compute(
        predictions=tokenizer.batch_decode(predictions, skip_special_tokens=True),
        references=tokenizer.batch_decode(labels, skip_special_tokens=True),
        use_stemmer=True,
    )

    # Length of each prediction, counted as its number of non-pad tokens.
    result["gen_len"] = np.mean([np.count_nonzero(row != pad_id) for row in predictions])

    rounded = {}
    for key, value in result.items():
        rounded[key] = round(value, 4)
    return rounded

In [19]:
from datasets import load_dataset

# Hold out 20% of the BillSum training split for evaluation. The split is seeded so that
# re-running the notebook produces the same train/test partition.
billsum = load_dataset('billsum', split='train').train_test_split(test_size=0.2, seed=42)
# Tokenize both partitions in batches with preprocess_function.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map: 100%|██████████| 15159/15159 [00:06<00:00, 2367.51 examples/s]
Map: 100%|██████████| 3790/3790 [00:01<00:00, 2118.37 examples/s]


## Ratio of non-zero parameters

In [12]:
def show_param_ratio(model):
    """Print the parameter count (in millions) and the fraction left unmasked.

    The total is the number of elements over ``model.parameters()``. Masked
    elements are the zero entries of every buffer whose name contains "mask".

    Args:
        model: module exposing ``parameters()`` and ``named_buffers()``.
    """
    total = sum(tensor.numel() for tensor in model.parameters())
    masked_out = sum(
        int((buffer == 0).sum())
        for buffer_name, buffer in model.named_buffers()
        if "mask" in buffer_name
    )
    print(total/1e6,'M')
    print((total - masked_out) / total)

In [14]:
import torch.nn.utils.prune as prune
from transformers.models.t5.modeling_t5 import T5LayerSelfAttention, T5LayerCrossAttention, T5LayerFF

def puring(pruned_model,parameters_to_prune,amounts_to_prune):
    """Apply global L1 unstructured pruning to groups of Linear weights, in place.

    Every ``torch.nn.Linear`` found inside a ``T5LayerSelfAttention``,
    ``T5LayerCrossAttention`` or ``T5LayerFF`` module is appended to the group
    ``"<first component of the module name>_<block class name>"`` (for example
    ``"encoder_T5LayerFF"``). The Linear module named ``lm_head`` is appended to
    the ``"lm_head"`` group. Each group is then pruned with
    ``prune.global_unstructured`` using ``prune.L1Unstructured`` and the amount
    paired with it, and the resulting parameter ratio is printed.

    Args:
        pruned_model: model to prune; it is modified in place.
        parameters_to_prune: dict mapping group name to a list. The lists are
            extended in place with ``(module, 'weight')`` tuples, so pass fresh
            lists on every call. A key must exist for every group the model
            contains.
        amounts_to_prune: one pruning amount per group, in the key order of
            ``parameters_to_prune``.

    Returns:
        The same ``pruned_model`` object.

    Raises:
        ValueError: if the number of amounts differs from the number of groups.
        KeyError: if the model contains a block whose group key is missing.
    """
    # zip() would silently drop the unmatched tail, leaving some groups unpruned.
    if len(amounts_to_prune) != len(parameters_to_prune):
        raise ValueError(
            "amounts_to_prune has %d entries but parameters_to_prune has %d groups"
            % (len(amounts_to_prune), len(parameters_to_prune))
        )

    # Checked in this order; the label is the suffix of the group key.
    block_groups = (
        (T5LayerSelfAttention, 'T5LayerSelfAttention'),
        (T5LayerCrossAttention, 'T5LayerCrossAttention'),
        (T5LayerFF, 'T5LayerFF'),
    )

    for name, module in pruned_model.named_modules():
        part = name.split('.')[0]+'_'
        label = next((lbl for cls, lbl in block_groups if isinstance(module, cls)), None)
        if label is not None:
            linear_weights = [
                (layer, 'weight')
                for layer in module.modules()
                if isinstance(layer, torch.nn.Linear)
            ]
            # Only touch the dict when there is something to add.
            if linear_weights:
                parameters_to_prune[part+label].extend(linear_weights)
        elif isinstance(module, torch.nn.Linear) and name=="lm_head":
            parameters_to_prune['lm_head'].append((module, 'weight'))

    for name,amount in zip(parameters_to_prune,amounts_to_prune):
        targets = parameters_to_prune[name]
        # A group with no collected weights has nothing to prune; skip it
        # rather than hand global_unstructured an empty parameter list.
        if not targets:
            continue
        prune.global_unstructured(
            targets,
            pruning_method=prune.L1Unstructured,
            amount=amount,
        )
    show_param_ratio(pruned_model)
    return pruned_model

In [17]:
def training_pruned_model(pruned_model,output_dir,tokenized_billsum,tokenizer,data_collator,compute_metrics,epochs):
    """Fine-tune a pruned model with ``Seq2SeqTrainer``.

    Args:
        pruned_model: model to train; it is moved to the global ``device``.
        output_dir: directory passed to the training arguments as ``output_dir``.
        tokenized_billsum: mapping whose 'train' entry is the training set and
            whose "test" entry is the evaluation set.
        tokenizer: tokenizer handed to the trainer.
        data_collator: batch collator handed to the trainer.
        compute_metrics: metric callback handed to the trainer.
        epochs: value used for ``num_train_epochs``.

    Returns:
        Tuple ``(pruned_model, pruned_trainer)`` after ``train()`` has finished.
    """
    # Logging, evaluation and checkpointing all happen on the same step interval.
    step_interval = 10000

    pruned_model.to(device)
    pruned_training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="steps",
        eval_steps=step_interval,
        logging_steps=step_interval,
        save_steps=step_interval,
        save_total_limit=3,
        learning_rate=3e-5,
        lr_scheduler_type="linear",
        weight_decay=0.01,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=epochs,
        seed=42,
        # Request mixed precision only when the model actually sits on a CUDA
        # device; `device` falls back to CPU when CUDA is unavailable.
        fp16=(device.type == "cuda"),
        predict_with_generate=True,
    )
    pruned_trainer = Seq2SeqTrainer(
        model=pruned_model,
        args=pruned_training_args,
        train_dataset=tokenized_billsum['train'],
        eval_dataset=tokenized_billsum["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    pruned_trainer.train()
    return pruned_model, pruned_trainer

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
def load_pruned_model(model_name):
    """Rebuild a pruned model from a saved directory.

    The directory must hold what ``AutoModelForSeq2SeqLM.from_pretrained`` needs
    plus a ``model_state_dict.pth`` file containing the pruned state dict.

    Args:
        model_name: path of the saved model directory.

    Returns:
        The model with the pruning reparametrization applied to every Linear
        layer and the saved state dict loaded into it.
    """
    pruned_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Apply prune.identity to every Linear layer so each one has the pruning
    # reparametrization in place before the saved state dict is loaded.
    for module in pruned_model.modules():
        if isinstance(module, torch.nn.Linear):
            prune.identity(module, 'weight')

    # map_location keeps a state dict that was saved from a GPU loadable on a
    # CPU-only machine. NOTE: torch.load unpickles the file, so only load
    # checkpoints that you created yourself.
    state_dict = torch.load(model_name+'/model_state_dict.pth', map_location='cpu')
    pruned_model.load_state_dict(state_dict)
    return pruned_model

In [25]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the model from `checkpoint` and print its parameter count and unmasked ratio before pruning.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
show_param_ratio(model)

In [26]:
# Five rounds of prune -> fine-tune -> save; each round continues from the previous round's model.
for i in range(1,6):
    # Fresh, empty group lists every round: puring() appends to these lists in place.
    parameters_to_prune = {
        'encoder_T5LayerSelfAttention':[], 
        'encoder_T5LayerFF':[],
        'decoder_T5LayerSelfAttention':[], 
        'decoder_T5LayerCrossAttention':[],
        'decoder_T5LayerFF':[],
        'lm_head':[],
    }
    # One pruning amount per group, paired with the dict keys above by position.
    amounts_to_prune = [
        0.1,
        0.1,
        0.3,
        0.2,
        0.3,
        0.3,
    ]
    pruned_model = puring(model,parameters_to_prune,amounts_to_prune)
    model_name = 'pruned_model_V'+str(i)
    # NOTE: this message is printed after puring() has already run for this round.
    print("Pruning pruned_model_V"+str(i))
    # Fine-tune for 20 epochs; the trainer writes to 'pruned_billsum_model_V<i>'.
    pruned_model, pruned_trainer = training_pruned_model(pruned_model,'pruned_billsum_model_V'+str(i),tokenized_billsum,tokenizer,data_collator,compute_metrics,20)
    # 1. save T5 model and config
    # NOTE(review): `from_pt` looks like a loading option rather than a saving one — confirm it is needed here.
    pruned_model.save_pretrained(model_name, from_pt=True)
    pruned_model.config.save_pretrained(model_name, from_pt=True)
    # 2. save model_state_dict (save mask)
    torch.save(pruned_model.state_dict(), model_name+'/model_state_dict.pth')
    # The next round prunes the model that was just fine-tuned.
    model = pruned_model

60.506624 M
0.772869496073686
60.506624 M
0.6055597978826253
60.506624 M
0.48116442589822894
60.506624 M
0.38770331988775314
60.506624 M
0.31666774203102127


In [27]:
# Sixth round, run on the `model` left by the loop above: the two encoder groups get
# amount 0, the decoder groups 0.1 and lm_head 0.2, followed by 40 epochs of fine-tuning.
parameters_to_prune = {
    'encoder_T5LayerSelfAttention':[], 
    'encoder_T5LayerFF':[],
    'decoder_T5LayerSelfAttention':[], 
    'decoder_T5LayerCrossAttention':[],
    'decoder_T5LayerFF':[],
    'lm_head':[],
}
# One pruning amount per group, paired with the dict keys above by position.
amounts_to_prune = [
        0,
        0,
        0.1,
        0.1,
        0.1,
        0.2,
    ]
# Alternative starting point (disabled): reload the round-5 model from disk.
#model = load_pruned_model('pruned_model_V5')
pruned_model = puring(model,parameters_to_prune,amounts_to_prune)
model_name = 'pruned_model_V6'
pruned_model, pruned_trainer = training_pruned_model(pruned_model,'pruned_billsum_model_V6',tokenized_billsum,tokenizer,data_collator,compute_metrics,40)
# 1. save T5 model and config
# NOTE(review): `from_pt` looks like a loading option rather than a saving one — confirm it is needed here.
pruned_model.save_pretrained(model_name, from_pt=True)
pruned_model.config.save_pretrained(model_name, from_pt=True)
# 2. save model_state_dict (save mask)
torch.save(pruned_model.state_dict(), model_name+'/model_state_dict.pth')

60.506624 M
0.29887937558704314


In [303]:
# Reload the saved V6 model from disk and print its parameter ratio again.
# NOTE(review): this rebinds `checkpoint`, which earlier cells use for the original model path.
checkpoint = 'pruned_model_V6'
pruned_model = load_pruned_model(checkpoint)
show_param_ratio(pruned_model)

Some weights of the model checkpoint at pruned_model_V5 were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.0.SelfAttention.k.weight_mask', 'decoder.block.0.layer.0.SelfAttention.k.weight_orig', 'decoder.block.0.layer.0.SelfAttention.o.weight_mask', 'decoder.block.0.layer.0.SelfAttention.o.weight_orig', 'decoder.block.0.layer.0.SelfAttention.q.weight_mask', 'decoder.block.0.layer.0.SelfAttention.q.weight_orig', 'decoder.block.0.layer.0.SelfAttention.v.weight_mask', 'decoder.block.0.layer.0.SelfAttention.v.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.k.weight_mask', 'decoder.block.0.layer.1.EncDecAttention.k.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.o.weight_mask', 'decoder.block.0.layer.1.EncDecAttention.o.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.q.weight_mask', 'decoder.block.0.layer.1.EncDecAttention.q.weight_orig', 'decoder.block.0.layer.1.EncDecAttention.v.weight_mask', 'decoder.block.0.layer.1.EncDecAttention.v.w

60.506624 M
0.29938692332264316


## Prediction Part

In [None]:
# NOTE(review): later cells index the result as tokenized_billsum_test['train'] and
# billsum_test['train'], while split="test" presumably yields a single dataset rather
# than a dict of splits — confirm this cell matches the version that was actually run.
billsum_test = load_dataset("billsum", split="test")
# Tokenize the evaluation data with the same preprocessing used for training.
tokenized_billsum_test = billsum_test.map(preprocess_function, batched=True)

Map: 100%|██████████| 32/32 [00:00<00:00, 1056.34 examples/s]
Map: 100%|██████████| 3237/3237 [00:01<00:00, 2105.63 examples/s]


In [261]:
# Evaluate with max_length=20.
pruned_trainer.evaluate(tokenized_billsum_test['train'],max_length = 20)

{'eval_loss': 3.982614517211914,
 'eval_rouge1': 0.1934,
 'eval_rouge2': 0.0898,
 'eval_rougeL': 0.1611,
 'eval_rougeLsum': 0.1608,
 'eval_gen_len': 19.0,
 'eval_runtime': 4.4159,
 'eval_samples_per_second': 7.247,
 'eval_steps_per_second': 3.623,
 'epoch': 20.0}

In [236]:
# Evaluate with max_length=50.
pruned_trainer.evaluate(tokenized_billsum_test['train'],max_length = 50)

{'eval_loss': 4.0735673904418945,
 'eval_rouge1': 0.3314,
 'eval_rouge2': 0.1116,
 'eval_rougeL': 0.2501,
 'eval_rougeLsum': 0.2496,
 'eval_gen_len': 48.875,
 'eval_runtime': 7.9999,
 'eval_samples_per_second': 4.0,
 'eval_steps_per_second': 2.0}

In [244]:
# Evaluate with max_length=70.
pruned_trainer.evaluate(tokenized_billsum_test['train'],max_length = 70)

{'eval_loss': 4.0735673904418945,
 'eval_rouge1': 0.3312,
 'eval_rouge2': 0.1059,
 'eval_rougeL': 0.2434,
 'eval_rougeLsum': 0.2436,
 'eval_gen_len': 68.25,
 'eval_runtime': 10.1828,
 'eval_samples_per_second': 3.143,
 'eval_steps_per_second': 1.571}

In [230]:
# Evaluate with max_length=100.
pruned_trainer.evaluate(tokenized_billsum_test['train'],max_length=100)

{'eval_loss': 4.0735673904418945,
 'eval_rouge1': 0.2946,
 'eval_rouge2': 0.0913,
 'eval_rougeL': 0.2288,
 'eval_rougeLsum': 0.2294,
 'eval_gen_len': 97.3125,
 'eval_runtime': 10.4055,
 'eval_samples_per_second': 3.075,
 'eval_steps_per_second': 1.538}

In [231]:
# Evaluate with max_length=200.
pruned_trainer.evaluate(tokenized_billsum_test['train'],max_length=200)

{'eval_loss': 4.0735673904418945,
 'eval_rouge1': 0.2624,
 'eval_rouge2': 0.0801,
 'eval_rougeL': 0.2106,
 'eval_rougeLsum': 0.2108,
 'eval_gen_len': 142.0938,
 'eval_runtime': 13.9898,
 'eval_samples_per_second': 2.287,
 'eval_steps_per_second': 1.144}

In [232]:
# Evaluate with max_length=300.
pruned_trainer.evaluate(tokenized_billsum_test['train'],max_length=300)

{'eval_loss': 4.0735673904418945,
 'eval_rouge1': 0.2497,
 'eval_rouge2': 0.0757,
 'eval_rougeL': 0.2006,
 'eval_rougeLsum': 0.2007,
 'eval_gen_len': 170.2188,
 'eval_runtime': 16.8414,
 'eval_samples_per_second': 1.9,
 'eval_steps_per_second': 0.95}

In [None]:
# Run prediction on the test examples with max_length=128.
results = pruned_trainer.predict(tokenized_billsum_test['train'],max_length=128)

In [182]:
# Take element 0 of the predict() output and replace every -100 entry with the pad token id.
# NOTE: this rebinds `results`, so re-running this cell on its own operates on the wrong object.
results = np.where(results[0] != -100, results[0], tokenizer.pad_token_id)

In [183]:
# Decode the predicted token ids back to text, skipping special tokens.
decoded_prediction = tokenizer.batch_decode(results, skip_special_tokens=True)


In [163]:
import pandas as pd
import csv

In [184]:
# Build the prediction table in a single constructor call; growing a DataFrame with
# pd.concat inside a loop re-copies every earlier row on each iteration.
# Commas in each prediction are replaced with periods.
df_results = pd.DataFrame(
    [
        {'ID': i, 'Predict': prediction.replace(',', '.')}
        for i, prediction in enumerate(decoded_prediction)
    ],
    columns=['ID', 'Predict'],
)

# Print the resulting DataFrame
print(df_results)

    ID                                            Predict
0    0  Transportation Security Act of 2007 - Amends t...
1    1  Refuges Act of 2016 This bill amends the Immig...
2    2  Foster Foster Foster Foster Act - Directs the ...
3    3  Directs the Secretary of Defense to establish ...
4    4  Government Government Accountability Act - Req...
5    5  National Coin Coin Act - Directs the Secretary...
6    6  National Housing Housing Act of 2013 - Directs...
7    7  Directs the Attorney General to establish a pr...
8    8  Micro Microuse Act of 2001 - Amends the Small ...
9    9  Private Private Private Privat Privat Privatiz...
10  10  Cyber Cyber Cyber Cyber Cyber Cyber Cyber Cybe...
11  11  Amends the Internal Revenue Code to establish ...
12  12  National Neuro Neuro Neuroforensicforensicfore...
13  13  Amends the Internal Revenue Code to allow a dr...
14  14  Energy Energy Energy Act of 2007 - Amends the ...
15  15  Fairness Act - Directs the Attorney General to...
16  16  Insura

In [185]:
# Function to escape double quotes and handle newlines
def escape_special_characters(text):
    """Return ``text`` with each double quote doubled and each newline replaced by a space."""
    quotes_doubled = '""'.join(text.split('"'))
    return ' '.join(quotes_doubled.split('\n'))

# Apply escaping to the 'Predict' column (the column is overwritten, so re-running this cell escapes it again)
df_results['Predict'] = df_results['Predict'].apply(escape_special_characters)

### Dump Prediction

In [176]:
# Write the predictions to test.csv without the index, quoting every field.
# NOTE(review): double quotes in 'Predict' were already doubled by escape_special_characters —
# check whether the CSV writer escapes them a second time.
df_results.to_csv('test.csv', index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
#df_results = pd.read_csv('test.csv',encoding='utf-8')

### Calculating ROUGE-Lsum with built-in Python functions

In [186]:
def calculate_lcs(X, Y):
    """
    Return the length of the longest common subsequence of sequences X and Y.
    """
    # Only the previous DP row is needed to compute the current one.
    previous = [0] * (len(Y) + 1)
    for x_item in X:
        current = [0]
        for col, y_item in enumerate(Y, start=1):
            if x_item == y_item:
                current.append(previous[col - 1] + 1)
            else:
                current.append(max(previous[col], current[col - 1]))
        previous = current
    return previous[len(Y)]

class ParticipantVisibleError(Exception):
    """Raised when a submission cannot be scored, e.g. an ID has no prediction."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Computes a sentence-level, LCS-based recall score averaged over all rows.

    Both summaries of a row are split into sentences on '.'. For every reference
    sentence, the longest common subsequence (over lower-cased whitespace tokens)
    against each predicted sentence is computed and the best one is kept. The row
    score is the sum of these best LCS lengths divided by the number of reference
    tokens, or 0 when the reference has no tokens.

    Args:
    solution (pd.DataFrame): Frame with the row ID column and a 'Label' column.
    submission (pd.DataFrame): Frame with the row ID column and a 'Predict' column.
    row_id_column_name (str): The column name for the row ID in both DataFrames.

    Returns:
    float: The mean row score across all rows of `solution`.

    Raises:
    ParticipantVisibleError: If an ID of `solution` is missing from `submission`.
    """
    # Index by row ID on new frames so the caller's DataFrames keep their ID
    # column and can be passed to score() again.
    solution = solution.set_index(row_id_column_name)
    submission = submission.set_index(row_id_column_name)

    total_score = 0

    for idx in solution.index:
        if idx not in submission.index:
            raise ParticipantVisibleError(f"Missing prediction for ID {idx}.")

        ref_summary = solution.loc[idx, 'Label']
        pred_summary = submission.loc[idx, 'Predict']

        # Tokenize the predicted sentences once per row; they are compared
        # against every reference sentence below.
        pred_token_lists = [
            pred_sent.strip().lower().split() for pred_sent in pred_summary.split('.')
        ]

        lcs_sum = 0
        ref_length = 0
        for ref_sent in ref_summary.split('.'):
            ref_tokens = ref_sent.strip().lower().split()
            ref_length += len(ref_tokens)
            # Best match of this reference sentence among all predicted sentences.
            lcs_sum += max(
                (calculate_lcs(ref_tokens, pred_tokens) for pred_tokens in pred_token_lists),
                default=0,
            )

        # Recall-style ratio for this row; an empty reference scores 0.
        rouge_l = lcs_sum / ref_length if ref_length > 0 else 0
        total_score += rouge_l

    # Average over all rows of the solution.
    mean_rouge_lsum = total_score / len(solution)

    return mean_rouge_lsum

In [187]:
# Build the reference table in a single constructor call; growing a DataFrame with
# pd.concat inside a loop re-copies every earlier row on each iteration.
# Commas in each reference summary are replaced with periods.
df_label = pd.DataFrame(
    [
        {'ID': i, 'Label': label['summary'].replace(',', '.')}
        for i, label in enumerate(billsum_test['train'])
    ],
    columns=['ID', 'Label'],
)

# Print the resulting DataFrame
print(df_label)

    ID                                              Label
0    0  Securing America's Facilities. Equipment and R...
1    1  Liberian Refugee Immigration Fairness Act of 2...
2    2  Every Child Deserves a Family Act - Prohibits ...
3    3  Directs the United States to: (1) take all nec...
4    4  Spending Reduction Act - Requires the head of ...
5    5  National Purple Heart Hall of Honor Commemorat...
6    6  Eleanor Smith Inclusive Home Design Act of 201...
7    7  Directs the Attorney General to:  (1) conduct ...
8    8  Amends the Riegle Community Development and Re...
9    9  Protection of Homes. Small Businesses. and Pri...
10  10  Seniors Financial Fraud Prevention Act of 2010...
11  11  Competitive and Open Markets That Protect and ...
12  12  National Neurological Diseases Surveillance Sy...
13  13  Deadly Driver Reduction Act - Amends Federal t...
14  14  Recovery Through Building Renovation Act of 20...
15  15  Data Broker Accountability and Transparency Ac...
16  16  Taxpay

In [180]:
# Score the predictions in df_results against the reference summaries in df_label.
score(df_label, df_results, 'ID')

0.2400331001863319

In [188]:
# Same scoring call as the previous cell.
score(df_label, df_results, 'ID')

0.2400331001863319