In [1]:
import os 
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd # type: ignore

In [2]:
from utils import get_dataset, get_model_by_tag, get_prompt_template
from LLMAnnotator import LLMAnnotator

In [3]:
# Credentials and tracing configuration are read from the process environment.
# The previous `os.environ[name] = os.getenv(name)` self-assignments were no-ops
# when the variable existed and failed with an opaque
# "TypeError: str expected, not NoneType" when it did not, so check explicitly
# and report which variables are missing.
_required_env_vars = ("OPENAI_API_KEY_CLARIN", "LANGCHAIN_API_KEY", "LANGCHAIN_PROJECT")
_missing_env_vars = [name for name in _required_env_vars if os.getenv(name) is None]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

# Turn on LangChain tracing for this kernel session.
os.environ["LANGCHAIN_TRACING_V2"]="true"

# API token handed to get_model_by_tag() further down in the notebook.
token = os.getenv("OPENAI_API_KEY_CLARIN")

In [4]:
import os
import pandas as pd
import re

def extract_bracketed_word(df, read_col, write_col):
    """Store the last ``[bracketed]`` token of ``read_col`` in ``write_col``.

    Each value of ``df[read_col]`` is converted with ``str()`` and searched for
    non-empty ``[...]`` groups that contain no nested brackets. The content of
    the last such group is written to ``df[write_col]``; values without any
    group get ``None``.

    Args:
        df: DataFrame to read from and write to (modified in place).
        read_col: Name of the column holding the raw text.
        write_col: Name of the column that receives the extracted token.

    Returns:
        The same ``df`` object, with ``write_col`` added or overwritten.
    """
    bracket_pattern = re.compile(r'\[([^\[\]]+?)\]')

    def _last_group(value):
        found = bracket_pattern.findall(str(value))
        if not found:
            return None
        return found[-1]

    extracted = df[read_col].apply(_last_group)
    df[write_col] = extracted
    return df

def validate_text_column(df, df_reference):
    """Flag rows whose text appears in a reference set and extract their label.

    Adds two columns to ``df`` (modified in place):

    * ``was_in_selected_samples`` - True where ``df['text']`` also occurs in
      ``df_reference['text']``.
    * ``extracted_label`` - produced by ``extract_bracketed_word`` from the
      ``output`` column.

    Args:
        df: DataFrame with at least ``text`` and ``output`` columns.
        df_reference: DataFrame with a ``text`` column to compare against.

    Returns:
        The DataFrame returned by ``extract_bracketed_word``.
    """
    known_texts = set(df_reference['text'].unique())
    membership = df['text'].isin(known_texts)
    df['was_in_selected_samples'] = membership
    return extract_bracketed_word(df, 'output', 'extracted_label')

# Resumable annotation run: for each experiment number, annotate every dataset
# row that has no successful result yet and merge the new results into the
# experiment's CSV file.
for i in [2]:
    print('\n\nMUMER', i, '\n\n')

    # Experiment configuration.
    model_tag = 'llama3'
    dataset_tag = '20_newsgroups'
    prompt = 'cot_random_samples_cohere'
    temp = 0.3
    number_of_experimet = i
    selected_samples = f'{model_tag}_random42_80'

    model = get_model_by_tag(model_tag, token, temp)
    prompt_txt = get_prompt_template(dataset_tag, prompt)
    examples_for_prompt = pd.read_csv(f'./selected_samples/{dataset_tag}/{selected_samples}.csv')

    output_path = f'./results/{prompt}_temp{temp}/{model_tag}/{dataset_tag}_{model_tag}_{prompt}_{selected_samples}_temp{temp}_exp{number_of_experimet}.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Add 'original_index' so the dataset order can be restored later.
    full_dataset = get_dataset(dataset_tag=dataset_tag)
    full_dataset = full_dataset.reset_index().rename(columns={'index': 'original_index'})

    # Load previous results, or start from an empty frame on the first run.
    try:
        df_old = pd.read_csv(output_path)
    except FileNotFoundError:
        df_old = pd.DataFrame(columns=['text', 'label', 'output', 'error'])

    # Rows still to annotate:
    # - texts that are missing from df_old, or
    # - texts whose 'error' value in df_old is non-empty.
    df_old = validate_text_column(df_old, examples_for_prompt)

    if 'error' not in df_old.columns:
        df_old['error'] = None
    has_error = df_old['error'].notna() & (df_old['error'].astype(str) != '')
    texts_with_errors = df_old.loc[has_error, 'text']
    texts_already_done = set(df_old['text']) - set(texts_with_errors)
    df_to_annotate = full_dataset[~full_dataset['text'].isin(texts_already_done)]

    print('Do zaannotowania:', len(df_to_annotate))

    if not df_to_annotate.empty:
        annotator = LLMAnnotator(
            model=model,
            dataset=df_to_annotate,
            examples_for_prompt=examples_for_prompt,
            prompt_template=prompt_txt,
            column_text="text",
            column_label="label",
            column_output="output"
        )
        df_new = annotator.get_results()
        df_new = validate_text_column(pd.DataFrame(df_new), examples_for_prompt)

        # Attach 'original_index' to the new results. Skip the merge when the
        # column is already there: merging an overlapping column would produce
        # 'original_index_x' / 'original_index_y' instead.
        if 'original_index' not in df_new.columns:
            df_new = df_new.merge(df_to_annotate[['text', 'original_index']], on='text', how='left')

        # The CSV written by a previous run already contains 'original_index'.
        # Drop it before re-deriving it from the dataset; otherwise the merge
        # yields suffixed columns, every old row ends up with NaN in
        # 'original_index' after the concat, and drop_duplicates() below
        # collapses all of them into a single row.
        df_old = df_old.drop(columns='original_index', errors='ignore')
        df_old = df_old.merge(full_dataset[['text', 'original_index']], on='text', how='left')

        # New rows come last so that keep='last' prefers a retried annotation
        # over the earlier (errored) one for the same dataset row.
        df_combined = pd.concat([df_old, df_new], ignore_index=True)
        df_combined = df_combined.drop_duplicates(subset='original_index', keep='last')
        df_combined = df_combined.sort_values(by='original_index')

        # Drop the 'error' column when every value is missing or an empty string.
        if 'error' in df_combined.columns and df_combined['error'].replace('', pd.NA).isna().all():
            df_combined = df_combined.drop(columns='error')

        df_combined.to_csv(output_path, index=False)
        print("Zaktualizowano plik:", output_path)

    else:
        print("Brak nowych przykładów do anotacji.")




MUMER 2 


Do zaannotowania: 1250


  self.chain = LLMChain(llm=self.model, prompt=self.prompt)


Nr: 0 Predicted label: To determine the most suitable category for the provided text, let's break down the steps as suggested:

**Step 1: Summarize the main topic of the text.**
The text discusses economics, focusing on supply-side economics, deficits, GNP (Gross National Product), tax spending, and trends during various presidential administrations (Carter, Reagan, Bush). It touches on the concept of balancing budgets through freezing spending and maintaining tax pledges.

**Step 2: Identify any key terms or phrases that hint at a specific category.**
Key terms include "supply side economic," "deficits," "GNP," "tax spending," "budget," and references to U.S. presidents and their economic policies.

**Step 3: Compare the topic to the definitions of each category.**
Given the topics discussed, this text aligns with discussions about politics, particularly those involving economic policies and governmental financial management.

**Step 4: Choose the most appropriate category based on yo

Process Process-4:
Traceback (most recent call last):
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/julita/Desktop/anote-skrpyty 19.04/LLMAnnotator.py", line 13, in call_chain_process
    result = chain.invoke({"text": text})
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/langchain/chains/base.py", line 170, in invoke
    raise e
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/langchain/chains/base.py", line 160, in invoke
    self._call(inputs, run_manager=run_manager)
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/langchain/chains/llm.py", line 126, in _call
    response = self.generate([inputs], run_manager=run_manager)
              

KeyboardInterrupt: 

  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/langchain_core/language_models/llms.py", line 950, in generate
    output = self._generate_helper(
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/langchain_core/language_models/llms.py", line 792, in _generate_helper
    raise e
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/langchain_core/language_models/llms.py", line 779, in _generate_helper
    self._generate(
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/langchain_core/language_models/llms.py", line 1504, in _generate
    else self._call(prompt, stop=stop, **kwargs)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/julita/Desktop/anote-skrpyty 19.04/Amodels/Llama3_1LLM.py", line 31, in _call
    response = requests.post(
               ^^^^^^^^^^^^^^
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/requests/api.py", lin

  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/requests/adapters.py", line 667, in send
    resp = conn.urlopen(
           ^^^^^^^^^^^^^
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/urllib3/connectionpool.py", line 789, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/urllib3/connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/site-packages/urllib3/connection.py", line 507, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/http/client.py", line 1428, in getresponse
    response.begin()
  File "/home/julita/anaconda3/envs/anote/lib/python3.12/http/client.py", line 331, in begin
    version, status, reason = 

In [None]:
# Show the results-file path left in `output_path` by the annotation loop.
output_path

'./results/cot_random_samples_cohere_temp0.3/llama3/20_newsgroups_llama3_cot_random_samples_cohere_llama3_random42_80_temp0.3_exp1.csv'

In [None]:
# Show the merged results frame; `df_combined` is only assigned when the
# annotation loop found rows to annotate.
df_combined

Unnamed: 0,text,output,logprobs,top_logprobs,original_label,original_index,was_in_selected_samples,extracted_label
0,article ashish arora writes excerpts netnewssc...,To determine the most appropriate category for...,"{'content': [{'token': 'To', 'bytes': [84, 111...","[{'token': 'To', 'bytes': [84, 111], 'logprob'...",talk.politics.misc,0,,
1,gateway telepath modem month actually one woul...,Let's analyze the text step by step:\n\n**Step...,"{'content': [{'token': 'Let', 'bytes': [76, 10...","[{'token': 'Let', 'bytes': [76, 101, 116], 'lo...",comp.sys.ibm.pc.hardware,1,,
2,anybody provide advice concerning following tw...,Let's break down the text step by step:\n\n**S...,"{'content': [{'token': 'Let', 'bytes': [76, 10...","[{'token': 'Let', 'bytes': [76, 101, 116], 'lo...",sci.med,2,,
3,article mike silverman writes anybody know goi...,Let's analyze the text step by step:\n\n**Step...,"{'content': [{'token': 'Let', 'bytes': [76, 10...","[{'token': 'Let', 'bytes': [76, 101, 116], 'lo...",rec.sport.baseball,3,,
4,article stich christian e writes installed mot...,Let's break down the analysis step by step:\n\...,"{'content': [{'token': 'Let', 'bytes': [76, 10...","[{'token': 'Let', 'bytes': [76, 101, 116], 'lo...",sci.electronics,4,,
...,...,...,...,...,...,...,...,...
2019,david sternlight writes article karl barrus wr...,Let's analyze the text step by step:\n\n**Step...,"{'content': [{'token': 'Let', 'bytes': [76, 10...","[{'token': 'Let', 'bytes': [76, 101, 116], 'lo...",sci.crypt,1995,False,sci.crypt
2020,hello im looking information alphanumeric page...,Let's analyze the text step by step.\n\n**Step...,"{'content': [{'token': 'Let', 'bytes': [76, 10...","[{'token': 'Let', 'bytes': [76, 101, 116], 'lo...",sci.electronics,1996,False,sci.electronics
2021,john r daker writes would like offocially nomi...,Let's analyze the text step by step.\n\n1. Sum...,"{'content': [{'token': 'Let', 'bytes': [76, 10...","[{'token': 'Let', 'bytes': [76, 101, 116], 'lo...",rec.motorcycles,1997,False,rec.motorcycles
2022,article writes looking information concerning ...,Let's analyze the text step by step.\n\n**Step...,"{'content': [{'token': 'Let', 'bytes': [76, 10...","[{'token': 'Let', 'bytes': [76, 101, 116], 'lo...",sci.space,1998,False,sci.space
