In [12]:
from transformers import pipeline
from datasets import load_dataset
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 
from tqdm import tqdm 
#from llama_cpp import Llama
from transformers import LlamaTokenizer, MistralForCausalLM
import bitsandbytes, flash_attn

In [2]:
news_dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")

In [3]:
news_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [4]:
news_dataset.keys()

dict_keys(['train', 'validation', 'test'])

In [5]:
news_dataset["train"][2]

{'article': 'MINNEAPOLIS, Minnesota (CNN) -- Drivers who were on the Minneapolis bridge when it collapsed told harrowing tales of survival. "The whole bridge from one side of the Mississippi to the other just completely gave way, fell all the way down," survivor Gary Babineau told CNN. "I probably had a 30-, 35-foot free fall. And there\'s cars in the water, there\'s cars on fire. The whole bridge is down." He said his back was injured but he determined he could move around. "I realized there was a school bus right next to me, and me and a couple of other guys went over and started lifting the kids off the bridge. They were yelling, screaming, bleeding. I think there were some broken bones."  Watch a driver describe his narrow escape » . At home when he heard about the disaster, Dr. John Hink, an emergency room physician, jumped into his car and rushed to the scene in 15 minutes. He arrived at the south side of the bridge, stood on the riverbank and saw dozens of people lying dazed on 

In [6]:
##Retrieve the dataset with 2000 data points

In [7]:
def retrieve_dataset(news_dataset,rows=2000):
    article=[]
    highlights=[]
    
    for articles in range(rows):
        article.append(news_dataset["train"][articles]["article"])
        highlights.append(news_dataset["train"][articles]["highlights"])
        
    extracted_info = pd.DataFrame({
    'article': article,
    'highlights': highlights})
    
    return extracted_info
                 

In [8]:
news_dataframe=retrieve_dataset(news_dataset)

In [9]:
##Run the First LLM "microsoft/Phi-3.5-MoE-instruct" for extractive summarization 

In [10]:
import torch
print(torch.cuda.is_available())


True


In [None]:
torch.random.manual_seed(0) 

model = AutoModelForCausalLM.from_pretrained( 
    "microsoft/Phi-3.5-MoE-instruct",  
    device_map="cuda",  
    torch_dtype="auto",  
    trust_remote_code=False,  
) 

model = model.to("cuda")

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct") 


In [15]:
generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    ##"temperature": 0.0, 
    "do_sample": False, 
} 
pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

In [None]:
summary_phi=[]
for i in tqdm(range(len(news_dataframe[:310]))):
    prompt_phi= f""" You are a helpful AI assistant that summarizes news articles accurately and concisely.
                
                 Given the article, generate a detailed 7-8 lines extractive summary in the form of a Python string.
                 Never ever explain yourself,just give me the summary. 
                 Ensure the summary captures all key points and details from the article.
                 DO NOT CONTINUE GENERATION AFTER GIVING THE OUTPUT.

                 Here is my article:
                 article: {news_dataframe.iloc[i]['article']} \n
                

                 """
    messages = [ {"role": "user", "content" : prompt_phi } ]

    output = pipe(messages, **generation_args) 
    
    summary_phi.append(output[0]['generated_text'])


In [None]:
#Convert the list of elements in Summary_Phi to a dataframe 
df_1 = pd.DataFrame(summary_phi, columns=["Summary_Phi"])

# Save the DataFrame as a CSV file without the index
df_1.to_csv("summary_phi.csv", index=False)

In [None]:
##Run the Second LLM "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" for abstractive summarization using Llama.cpp

In [23]:

torch.random.manual_seed(0) 

tokenizer = LlamaTokenizer.from_pretrained('NousResearch/Hermes-2-Pro-Mistral-7B', trust_remote_code=False)
model = MistralForCausalLM.from_pretrained(
    "NousResearch/Hermes-2-Pro-Mistral-7B",
    torch_dtype=torch.float16,
    device_map={"": "cuda:0"}, 
    load_in_8bit=True,  
    load_in_4bit=False,
    use_flash_attention_2=False,
    trust_remote_code=False
    
)
print(next(model.parameters()).device)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.38s/it]

cuda:0





In [24]:
generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    ##"temperature": 0.0, 
    "do_sample": False, 
} 
pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

In [None]:
summary_mistral=[]
for i in tqdm(range(len(news_dataframe[:310]))):
    prompt_mistral= f""" You are a helpful AI assistant that summarizes news articles accurately and concisely.
                
                 Given the article, generate a detailed 7-8 lines abstract summary in the form of a Python string.
                 Never ever explain yourself,just give me the summary. 
                 Ensure the summary captures all key points and details from the article.
                 DO NOT CONTINUE GENERATION AFTER GIVING THE OUTPUT.

                 Here is my article:
                 article: {news_dataframe.iloc[i]['article']} \n
                

                 """
    messages = [ {"role": "user", "content" : prompt_mistral } ]

    output = pipe(messages, **generation_args) 
    
    summary_mistral.append(output[0]['generated_text'])

In [30]:
#Convert the list of elements in Summary_mistral to a dataframe 
df_2 = pd.DataFrame(summary_mistral, columns=["Summary_mistral"])

# Save the DataFrame as a CSV file without the index
df_2.to_csv("summary_mistral.csv", index=False)

In [None]:
#deepseek-ai/DeepSeek-R1-Distill-Qwen-14B

In [None]:
torch.random.manual_seed(0) 

tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
     torch_dtype="auto", 
     device_map={"": "cuda:0"}, 
     trust_remote_code=True
    
)
print(next(model.parameters()).device)


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    ##"temperature": 0.0, 
    "do_sample": False, 
} 
pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

In [None]:
summary_deepseek_qwen=[]
for i in tqdm(range(len(news_dataframe[:310]))):
    prompt_deepseek_qwen= f""" You are a helpful AI assistant that summarizes news articles accurately and concisely.
                
                 Given the article, generate a detailed 7-8 lines abstract summary in the form of a Python string.
                 Never ever explain yourself,just give me the summary. 
                 Ensure the summary captures all key points and details from the article.
                 DO NOT CONTINUE GENERATION AFTER GIVING THE OUTPUT.

                 Here is my article:
                 article: {news_dataframe.iloc[i]['article']} \n
                

                 """
    messages = [ {"role": "user", "content" : prompt_deepseek_qwen } ]

    output = pipe(messages, **generation_args) 
    
    summary_deepseek_qwen.append(output[0]['generated_text'])