# Preparing Environment

In [1]:
# Exports an allocator option for PyTorch's CUDA memory manager to the kernel's
# environment; this cell runs first so the variable is set before any model is loaded.
# NOTE(review): presumably enabled to limit fragmentation-related out-of-memory
# errors while running the 7B compression model — confirm.
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


In [2]:
# Install dependencies.
!pip install llmlingua openai==1.14.1 optimum auto-gptq



In [3]:
import json
import time
from pathlib import Path

import pandas as pd
from openai import OpenAI

# Clients Setup

In [4]:
# The client reads its key from the OPENAI_API_KEY environment variable when no
# `api_key` argument is given, so no secret has to live in the notebook.
# SECURITY: an API key used to be hardcoded in this cell; treat it as leaked and
# revoke it, then export a fresh key as OPENAI_API_KEY before running this cell.
client = OpenAI()

In [5]:
# Setup LLMLingua
from llmlingua import PromptCompressor

# Build the prompt compressor on top of the "TheBloke/Llama-2-7b-Chat-GPTQ"
# checkpoint. `llm_lingua` is a module-level object that the compression
# function defined further down calls directly.
# NOTE(review): `revision: "main"` presumably pins the model repository branch
# to download — confirm against the LLMLingua / Hugging Face documentation.
llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"})

The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


# Normalize Dataframe

In [6]:
# Load the meeting transcripts. Later cells read the `id_session`,
# `speaker_name` and `speech` columns of this frame.
meetings_df = pd.read_csv('./meetings.csv')
# Optional debugging filter: uncomment to restrict the run to a single session.
# meetings_df = meetings_df[meetings_df["id_session"] == 25345]

In [7]:
def transform_df_to_speeches_list(text_df):
    """Render every row of ``text_df`` as one ``speaker: speech`` text line.

    Args:
        text_df: DataFrame with a ``speaker_name`` and a ``speech`` column.

    Returns:
        A single string with one line per row, in row order. Each line has the
        speaker name wrapped in ``<llmlingua, compress=False>`` tags, followed
        by ``": "``, the speech and a newline. An empty frame yields ``""``.
    """
    speakers = text_df['speaker_name']
    speeches = text_df['speech']

    # Iterate the two columns positionally in lockstep. The previous
    # implementation relied on `Series.iteritems()`, which no longer exists in
    # pandas 2.x, and grew the result with repeated string concatenation.
    return "".join(
        f"<llmlingua, compress=False>{speaker}</llmlingua>: {speech}\n"
        for speaker, speech in zip(speakers, speeches)
    )

def get_speakers_list_from_dataframe(df):
    """Return the distinct speaker names found in ``df``.

    Args:
        df: DataFrame with a ``speaker_name`` column.

    Returns:
        A list of unique, non-null speaker names in order of first appearance.
        Missing names are dropped because they are not speakers and, being
        floats (NaN), cannot be joined into a string by callers. The order is
        stable so that text built from this list is reproducible between runs.
    """
    return df['speaker_name'].dropna().unique().tolist()

# Prompt

In [8]:
def summarization_prompt_for_stances(context: str, dataframe) -> str:
    """Build the stance-detection / summarization instruction prompt.

    Args:
        context: Short description of what the speeches belong to; it is
            interpolated into the prompt between triple backticks.
        dataframe: DataFrame with a ``speaker_name`` column; its distinct
            speakers are listed in the prompt.

    Returns:
        The prompt text. The transcript itself is not part of it.
    """
    speakers_list = get_speakers_list_from_dataframe(dataframe)
    # Missing speaker names (NaN) are floats and would make `str.join` raise
    # TypeError, so they are left out of the speaker enumeration.
    speakers_string = ', '.join(str(name) for name in speakers_list if pd.notna(name))

    # The prompt announces three stance types, matching the three it defines
    # (it previously said "four" while listing for / against / neutral).
    prompt = f"""
    Consider that it is an expert model in Stance Detection. Stance detection is the task of predicting an author's point of view on a subject of interest. A speech can represent one of three types of stance: for, against or neutral.
For: When an author takes a stance "for" a subject, it means they support or advocate for it. Their speech or writing will likely include arguments, evidence, or opinions that highlight the positive aspects, benefits, or reasons to endorse the subject. For example, if the subject is a proposed policy change, someone taking a "for" stance might emphasize how it could improve people's lives or address important societal issues.
Against: This stance indicates opposition or disagreement with the subject at hand. Authors taking an "against" stance will present arguments, evidence, or opinions that highlight flaws, risks, negative consequences, or reasons to reject the subject. Using the previous example of a proposed policy change, someone taking an "against" stance might argue that it would be ineffective, unfair, or harmful to certain groups.
Neutral: A neutral stance means the author does not express explicit support or opposition towards the subject. They may present information, analysis, or perspectives in a balanced and objective manner without advocating for or against the subject. Neutral stances typically avoid strong opinions or judgments and instead focus on providing a comprehensive understanding of the topic without bias. If the person doesn't say anything about that topic, it means that they should not be listed.
Reply in json format with the following keys: list_latent_topics, stances and summary.
list_latent_topics: should contain the list of all topics discussed in the text, and a short description for each topic.
stances: for each latent_topics key should contain the list of classification of the related speaker's speechs.
summary: should contain the summary of the text.
    Consider that you will receive as input a text with a set of speeches that make up ```{context}```.

    Do the following actions for the text:
    - Determine the all topics being discussed in the text and a brief descriptions of these topics.
    - For each topic and for each speaker, except if the person doesn't say anything about that topic, classify the stance as being FOR, AGAINST, NEUTRAL. Being the following speakers: {speakers_string}.
    - Before your response, translate the summary and the topics to portuguese.
   """
    return prompt

# Compressing

In [9]:
def compress_prompt_for_df(id_session: int, dataframe, target_token: int = 16000):
    """Compress the transcript of one session with LLMLingua.

    Args:
        id_session: Identifier of the session; echoed back in the result.
        dataframe: Rows of that session, with ``speaker_name`` and ``speech``
            columns.
        target_token: Token budget handed to the compressor. Defaults to the
            value that used to be hardcoded (16000).

    Returns:
        A dict with ``id_session`` (int), ``elapsed_time`` (float, seconds
        spent in this call) and every key of the dict returned by
        ``llm_lingua.compress_prompt``.
    """
    # Monotonic clock: unaffected by wall-clock adjustments during long runs.
    start_time = time.perf_counter()

    speeches_list = transform_df_to_speeches_list(dataframe)
    prompt = summarization_prompt_for_stances('parliamentary session', dataframe)

    # One context per transcript line. The rendered transcript ends with "\n",
    # so a plain split leaves an empty last entry; empty strings carry nothing
    # to compress and are dropped.
    contexts = [line for line in speeches_list.split("\n") if line]

    # NOTE(review): the `<llmlingua, compress=False>` tags around speaker names
    # look like LLMLingua's structured-prompt markup — confirm that
    # `compress_prompt` honours them (vs. `structured_compress_prompt`).
    compressed_prompt = llm_lingua.compress_prompt(
        contexts,
        question=prompt,
        target_token=target_token,
        condition_compare=True,
        condition_in_question="after",
        rank_method="longllmlingua",
    )

    elapsed_time = time.perf_counter() - start_time

    return {
        "id_session": int(id_session),
        "elapsed_time": float(elapsed_time),
        **compressed_prompt
    }

# Compressing all sessions

In [11]:
# Set to True to resume an interrupted run: sessions whose output file already
# exists are skipped instead of being compressed again (this replaces slicing
# `id_sessions` by hand). The default keeps the original behaviour of
# recompressing and overwriting every session.
SKIP_EXISTING_OUTPUTS = False

OUTPUT_DIR = Path('./compressed_prompts_v3')
# Create the target directory up front; otherwise the first write fails with
# FileNotFoundError only after a full (slow) compression has already run.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

id_sessions = meetings_df["id_session"].unique()

total = len(id_sessions)
start_time = time.time()

# Sessions without a single speaker name; persisted at the end of the run.
ignored_sessions = []

for index, id_session in enumerate(id_sessions):
    print(f'Session {id_session} started!')
    session_df = meetings_df[meetings_df["id_session"] == id_session]

    # A session in which no row has a speaker name cannot be attributed.
    if session_df["speaker_name"].isnull().all():
        print(f'Session {id_session} ignored!')
        ignored_sessions.append(int(id_session))
        print('=-=-=-=-=')
        continue

    output_path = OUTPUT_DIR / f'{id_session}.json'
    if SKIP_EXISTING_OUTPUTS and output_path.exists():
        print(f'Session {id_session} already compressed, skipping!')
        print('=-=-=-=-=')
        continue

    response = compress_prompt_for_df(id_session, session_df)

    # Serialize before touching the file so that a serialization error cannot
    # leave an empty or truncated JSON file behind.
    payload = json.dumps(response, indent=4)
    output_path.write_text(payload)

    # Cumulative time since the whole run started (not per session).
    elapsed_time = time.time() - start_time

    print(f'Session {id_session} compressed!')
    print(f'{index + 1} / {total} completed!')
    print(f'{elapsed_time} seconds elapsed!')
    print('=-=-=-=-=')

ignored_payload = json.dumps({ "sessions": ignored_sessions }, indent=4)
(OUTPUT_DIR / 'ignored_sessions.json').write_text(ignored_payload)

Session 25341 started!
Session 25341 compressed!
1 / 204 completed!
100.79918241500854 seconds elapsed!
=-=-=-=-=
Session 25345 started!
Session 25345 compressed!
2 / 204 completed!
129.34575271606445 seconds elapsed!
=-=-=-=-=
Session 25346 started!
Session 25346 compressed!
3 / 204 completed!
209.03612208366394 seconds elapsed!
=-=-=-=-=
Session 25347 started!
Session 25347 compressed!
4 / 204 completed!
263.2016808986664 seconds elapsed!
=-=-=-=-=
Session 25355 started!
Session 25355 compressed!
5 / 204 completed!
374.4360854625702 seconds elapsed!
=-=-=-=-=
Session 25353 started!
Session 25353 compressed!
6 / 204 completed!
423.49589681625366 seconds elapsed!
=-=-=-=-=
Session 25373 started!
Session 25373 compressed!
7 / 204 completed!
540.6259634494781 seconds elapsed!
=-=-=-=-=
Session 25383 started!
Session 25383 compressed!
8 / 204 completed!
588.9508147239685 seconds elapsed!
=-=-=-=-=
Session 25384 started!
Session 25384 compressed!
9 / 204 completed!
745.3648746013641 second