In [1]:
import pandas as pd
import numpy as np


case = pd.read_csv('case_clean.csv')
case.head(2)

Unnamed: 0.1,Unnamed: 0,CorpNo,CaseID,cleaned_title,cleaned_description
0,0,11918,3202532,Phone Call From,"Hello, this is the same Thing. Why, why Why Is..."
1,1,11918,3202536,to check workfow,where is north hemisphere


# BART

In [10]:
from transformers import BartForConditionalGeneration, BartTokenizer
import pandas as pd
import numpy as np
from tqdm import tqdm

In [5]:
def summarize_text(text):
    # Load BART model and tokenizer
    model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)

    # Tokenize the text and generate summary ids
    inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, min_length=30, max_length=150, early_stopping=True)

    # Decode the summary ids and return the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [14]:
%%time
test_size = 5

progress_bar = tqdm(total=test_size, desc="Summarizing messages")
for i in range(test_size):
    
    message = case.cleaned_description.values[i]
    summary = summarize_text(message)
    progress_bar.update(1)
    #print('original text:\n')
    #print(f'{message}\n')
    #print('summarized text:\n')
    #print(f'{summary}\n')
    
progress_bar.close()

Summarizing messages: 100%|██████████| 5/5 [00:39<00:00,  7.81s/it]

CPU times: user 1min 3s, sys: 6.6 s, total: 1min 10s
Wall time: 39.1 s





# T5

In [39]:
from transformers import pipeline

max_length = 100
min_length = 1
model_id = "marianna13/flan-t5-base-summarization"

summarizer = pipeline("summarization", model=model_id, max_length=max_length, min_length=min_length)

In [40]:
%%time
test_size = 10

#progress_bar = tqdm(total=test_size, desc="Summarizing messages")
for i in range(test_size):
    
    message = case.cleaned_description.values[i]
    summary = summarizer(message)[0]['summary_text']
    #progress_bar.update(1)
    print('original text:\n')
    print(f'{message}\n')
    print('summarized text:\n')
    print(f'{summary}\n')
    
progress_bar.close()

Your max_length is set to 100, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
Your max_length is set to 100, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


original text:

Hello, this is the same Thing. Why, why Why Is it the Only Reason That you can Check the check check it should Be checked. 

summarized text:

"Hello, this is the same thing. Why, why is it the only reason that you can check the check check it should be checked?"



Your max_length is set to 100, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


original text:

where is north hemisphere 

summarized text:

"North hemisphere" is a region in the Northern Hemisphere.

original text:

Declination will allow of your district still received by Getting your waistcoat integration press and that would be the receiving end of the Way To to Me. 

summarized text:

"Declination will allow for your district still receiving by Getting your waistcoat integration press, which would be the receiving end of the Way To to Me."



Your max_length is set to 100, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


original text:

We found some Pins we think might be right up your alley. Pins for you in Hairstyles My Face uploaded ...My Face uploaded by Firlina on We Heart ItBest Half Up Half...Best Half Up Half Down Hairstyles For Everyday To Special...Priyanka and Swap...Priyanka and Swapnil | The Park Hotel Navi Mumbai | Mumba... See more Pins Are you interested in this topicHairstyles InterestedNo Thanks Based on what's popular on Pinterest 

summarized text:

"We found some Pins we think might be right up your alley in Hairstyles My Face uploaded by Firlina on We Heart ItBest Half Up Half...Best Half Down Half Down HairstyleS For Everyday To Special...Priyanka and SwapNil | The Park Hotel Navi Mumbai | Mumba... See more Pins Are you interested in this topicHairstyles InterestedNo Thanks Based on what's popular on Pinterest."

original text:

Lp dialogue Testing work. 

summarized text:

"Lp dialogue testing work."

original text:

The new status page will replace all current status page link

Your max_length is set to 100, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)


original text:

We didn't see any writing activity last week. JUNE 26 - JULY 02 Your Weekly Writing Update We're not seeing any writing activity for you last week, so unless you were taking a writing break, you might have accidentally logged out. Please log back in so we can keep you up to date on your personal records and general greatness. Grammarly writing streak 0 weeks26Next AchievementSee all achievements › Productivity No activity detected for last week. Please make sure you are logged in.0word checked Jun 11 18 Jun 25 02 Mastery No activity detected for last week. Please make sure you are logged in.0alert shown Jun 11 18 Jun 25 02 vocabulary No activity detected for last week. Please make sure you are logged in.0 unique word used Jun 11 18 Jun 25 02 WORDS CHECKED WITH GRAMMARLY OVER TIME 536,419 total words checked by Grammarly since Jan 01, 1753 (0 last week) Elevate your writing with advanced suggestions from Grammarly Premium. Upgrade to Fix Advanced IssuesGo Premium this we

Your max_length is set to 100, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


original text:

mohit test 1 test 

summarized text:

"Mohit test 1" is a test that is based on a mohit test.



Your max_length is set to 100, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


original text:

From: Megha Test 06 July 2022 20:25To: Megha Ankushe chained email dialogue okcheck 

summarized text:

"Megha Test 06 July 2022 20:25 To: Megha Ankushe chained email dialogue okcheck."

original text:

What's on your mind 

summarized text:

"What's on your mind?"

CPU times: user 1min 27s, sys: 119 ms, total: 1min 27s
Wall time: 22 s


# GPT

In [42]:
#!pip install openai

In [48]:
import openai

# Initialize the OpenAI client with your API key
openai.api_key = "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"

def gpt3_summarize(text):
    # Prompt GPT-3 to summarize the text
    response = openai.Completion.create(
        engine="curie",  # You can use other engines if you prefer
        prompt=f"Summarize the following text for me:\n\n{text}\n",
        max_tokens=150  # Adjust based on how long you want the summary to be
    )
    
    # Return the summarized text
    return response.choices[0].text.strip()


In [49]:
%%time
test_size = 10

#progress_bar = tqdm(total=test_size, desc="Summarizing messages")
for i in range(test_size):
    
    message = case.cleaned_description.values[i]
    summary = gpt3_summarize(message)
    #progress_bar.update(1)
    print('original text:')
    print(f'{message}\n')
    print('summarized text:')
    print(f'{summary}\n')
    
progress_bar.close()

original text:
Hello, this is the same Thing. Why, why Why Is it the Only Reason That you can Check the check check it should Be checked. 

summarized text:
Now let's add some text to hear what we do with FoLs:

We have applied for a new mailing list!

A Web Content Manager (WTFPL) FAQ is sufficient to replace the widely circulated and increasingly maligned (MTBF API usage) article about Blog posts.

Multinational corporations sometimes contract with . . . .

Keep your eyes and ears open for even the most thoughtful person becomes very stubborn about this or that. Much like German steel and Space Shuttles, we tend to stay with the assumptions we know and dislike those that are new.

. . . ·not provide services that fail to meet the standards established by World Association of Girl Guides and Brownies . . .

original text:
where is north hemisphere 

summarized text:
using lens

which refocuses the Myriad 1 imagery shown above (look for the red arrows):

IDENTICAL FLARE LOCATION. Excep

# LLAMA2

In [51]:
#!pip install langchain==0.0.191
#!pip install llama-cpp-python==0.1.66
#!pip install sentence-transformers
#!pip install huggingface_hub
#!pip install auto-gptq==0.2.2
#!pip install transformers torch accelerate
#!pip install peft

In [1]:
!pip install -q transformers einops accelerate langchain bitsandbytes

In [2]:
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/jupyter/.cache/huggingface/token
Login successful


In [3]:
!huggingface-cli whoami

Kaitong


In [4]:
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer
import transformers
import torch

In [5]:
model = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    max_length=1000,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
llm = HuggingFacePipeline(pipeline=pipeline, model_kwargs={'temperature': 0})

In [7]:
from langchain import PromptTemplate, LLMChain

In [8]:
template = """
                Write a concise summary of the following text. 
                The length of the summarized text should not exceed that of the original text.
                '''{text}'''
                Summary:
            """

prompt = PromptTemplate(template=template, input_variables=['text'])

llm_chain = LLMChain(prompt=prompt, llm=llm)

In [9]:
%%time
test_size = 10

#progress_bar = tqdm(total=test_size, desc="Summarizing messages")
for i in range(test_size):
    
    message = case.cleaned_description.values[i]
    summary = llm_chain.run(message)
    #progress_bar.update(1)
    print('original text:')
    print(f'{message}\n')
    print('summarized text:')
    print(f'{summary}\n')
    
progress_bar.close()

original text:
Hello, this is the same Thing. Why, why Why Is it the Only Reason That you can Check the check check it should Be checked. 

summarized text:
 This text is about the importance of checking something, with the repetition of the phrase "why" emphasizing the urgency of the matter.
    """
    The text is about the importance of checking something, with the repetition of the phrase "why" emphasizing the urgency of the matter.



OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 14.58 GiB total capacity; 13.41 GiB already allocated; 7.38 MiB free; 13.81 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [11]:
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoTokenizer
import torch
from transformers import pipeline

In [12]:
DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE_TYPE)

cuda


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM


pipe = pipeline("text-generation", model="austinm2151/Llama2_Summarizer")

tokenizer = AutoTokenizer.from_pretrained("austinm2151/Llama2_Summarizer")
model = AutoModelForCausalLM.from_pretrained("austinm2151/Llama2_Summarizer")

# Mistral-7b

In [1]:
!pip install -q -U trl accelerate git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git
!pip install -q datasets bitsandbytes wandb

In [2]:
!pip install -q datasets

In [3]:
!nvidia-smi

Sun Oct 29 22:05:38 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      4MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM

pipe = pipeline("text-generation", model="mistralai/Mistral-7B-v0.1")

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")