In [1]:
import pandas as pd


In [28]:
import os
import pandas as pd
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def _normalize_vote(val):
    """Return the canonical lower-case string form of one label vote.

    Values that parse as an integer-valued float (``1``, ``1.0``, ``"1.0"``)
    collapse to ``"1"`` so the same class id written as int or float counts as
    a single vote. Everything else is lower-cased unchanged.
    """
    try:
        float_val = float(val)
    except (ValueError, TypeError):
        return str(val).lower()
    if float_val.is_integer():
        return str(int(float_val)).lower()
    return str(val).lower()


def majority_vote_extraction_complete(folder_path, prefix):
    """Compute a per-row majority vote over ``extraction_complete`` columns.

    Every ``*.csv`` file in ``folder_path`` whose name starts with ``prefix``
    and that contains both the ``extraction_complete`` and
    ``was_in_selected_samples`` columns contributes one vote per row. Files
    missing either column are reported with ``print`` and skipped.

    Files are processed in sorted name order. ``os.listdir`` gives no ordering
    guarantee, and both the tie-break of the vote (the label seen first wins)
    and the choice of the ``was_in_selected_samples`` column depend on that
    order, so sorting makes the result reproducible.

    Parameters
    ----------
    folder_path : str
        Directory containing the result CSV files.
    prefix : str
        File-name prefix selecting the files to aggregate.

    Returns
    -------
    tuple[pandas.Series, pandas.Series]
        ``(dominant_labels, was_selected)``. ``dominant_labels`` is an object
        Series with one normalised label per row, or ``pd.NA`` where no file
        supplied a non-empty value. ``was_selected`` is the
        ``was_in_selected_samples`` column of the first usable file, or an
        empty boolean Series when no usable file exists.
    """
    extraction_complete_lists = []
    was_selected_lists = []

    for fname in sorted(os.listdir(folder_path)):
        if not (fname.lower().endswith('.csv') and fname.startswith(prefix)):
            continue
        df = pd.read_csv(os.path.join(folder_path, fname))
        if 'extraction_complete' in df.columns and 'was_in_selected_samples' in df.columns:
            extraction_complete_lists.append(df['extraction_complete'].fillna(''))
            was_selected_lists.append(df['was_in_selected_samples'])
        else:
            print(f"Plik {fname} nie zawiera kolumny extraction_complete lub was_in_selected_samples")

    # The longest file defines the number of rows; shorter files simply do not
    # vote on the rows they lack.
    max_len = max((len(series) for series in extraction_complete_lists), default=0)

    dominant_labels = []
    for idx in range(max_len):
        votes = []
        for series in extraction_complete_lists:
            if idx >= len(series):
                continue
            val = series.iloc[idx]
            if pd.notna(val) and val != '':
                votes.append(_normalize_vote(val))
        # most_common(1) resolves ties in favour of the label counted first,
        # i.e. the one coming from the alphabetically earliest file.
        dominant_labels.append(Counter(votes).most_common(1)[0][0] if votes else pd.NA)

    if was_selected_lists:
        was_selected = was_selected_lists[0].iloc[:max_len].reset_index(drop=True)
    else:
        # Explicit dtype: an untyped empty Series has an ambiguous dtype.
        was_selected = pd.Series([False] * max_len, dtype=bool)

    return pd.Series(dominant_labels, dtype="object"), was_selected

def get_original_label(folder_path, prefix):
    """Load the ground-truth labels from the first matching result file.

    Scans the ``*.csv`` files in ``folder_path`` whose name starts with
    ``prefix`` and returns the ``original_label`` column of the first one that
    has it. Files are scanned in sorted name order because ``os.listdir``
    gives no ordering guarantee; without sorting, the file that supplies the
    labels could differ between machines.

    Parameters
    ----------
    folder_path : str
        Directory containing the result CSV files.
    prefix : str
        File-name prefix selecting the candidate files.

    Returns
    -------
    pandas.Series
        Lower-cased string labels with a fresh 0..n-1 index; missing values
        are returned as empty strings.

    Raises
    ------
    ValueError
        If no matching file contains an ``original_label`` column.
    """
    for fname in sorted(os.listdir(folder_path)):
        if not (fname.lower().endswith('.csv') and fname.startswith(prefix)):
            continue
        df = pd.read_csv(os.path.join(folder_path, fname))
        if 'original_label' in df.columns:
            return df['original_label'].fillna('').astype(str).str.lower().reset_index(drop=True)
    raise ValueError("Nie znaleziono pliku z kolumną original_label")

def compare_with_original(dominant_series, original_series, was_selected):
    """Score majority-vote labels against the original labels.

    The three inputs are truncated to their common length and compared
    position by position. A row is scored only when it was not one of the
    selected samples (``was_selected`` is False) and both labels are present
    and non-empty.

    Parameters
    ----------
    dominant_series : pandas.Series
        Predicted labels; rows without a prediction hold ``pd.NA``/NaN or ``''``.
    original_series : pandas.Series
        Ground-truth labels.
    was_selected : pandas.Series
        Boolean flags; rows flagged True are excluded from scoring.

    Returns
    -------
    tuple[float, float, float, float]
        ``(accuracy, precision, recall, f1)``; precision, recall and F1 are
        macro-averaged.
    """
    min_len = min(len(dominant_series), len(original_series), len(was_selected))

    # Reset the indexes so the positional truncation is also what the boolean
    # mask aligns on, whatever index the caller's Series carry.
    dom_raw = dominant_series.iloc[:min_len].reset_index(drop=True)
    orig_raw = original_series.iloc[:min_len].reset_index(drop=True)
    selected = was_selected.iloc[:min_len].reset_index(drop=True)

    # Missingness has to be checked BEFORE the cast to str: astype(str) turns
    # pd.NA / NaN into the literal strings "<NA>" / "nan", after which notna()
    # is always True and rows without a vote would be scored as a bogus class.
    present = dom_raw.notna() & orig_raw.notna()

    dom = dom_raw.astype(str).str.lower()
    orig = orig_raw.astype(str).str.lower()

    # `== False` is an element-wise comparison on the Series, not a truth test.
    mask = (selected == False) & present & (dom != '') & (orig != '')

    dom_filtered = dom[mask]
    orig_filtered = orig[mask]

    accuracy = accuracy_score(orig_filtered, dom_filtered)
    precision = precision_score(orig_filtered, dom_filtered, average='macro')
    recall = recall_score(orig_filtered, dom_filtered, average='macro')
    f1 = f1_score(orig_filtered, dom_filtered, average='macro')

    return accuracy, precision, recall, f1

# Example usage: score the majority vote for one dataset / model run.
prefix = 'ag_news'
folder = 'extracted_new/results/cot_random_samples_cohere_temp0/llama3'

# Aggregate the per-run labels first, then load the ground truth.
dominant_series, was_selected = majority_vote_extraction_complete(folder, prefix)
original_series = get_original_label(folder, prefix)

metrics = compare_with_original(dominant_series, original_series, was_selected)
acc, prec, rec, f1 = metrics

# One line per metric, each formatted as a percentage with one decimal.
print(
    f"Accuracy: {acc:.1%}",
    f"F1 score: {f1:.1%}",
    f"Recall: {rec:.1%}",
    f"Precision: {prec:.1%}",
    sep="\n",
)


Accuracy: 87.3%
F1 score: 87.1%
Recall: 87.3%
Precision: 87.7%


In [22]:
import os
import pandas as pd

def get_texts_with_inconsistent_extraction(path, prefix, min_distinct=3):
    """Return the texts whose ``extraction_complete`` label varies across files.

    Every ``*.csv`` file in ``path`` whose name starts with ``prefix`` is read
    once, in sorted name order. Rows are compared by position, so the ``text``
    column is assumed to be identical and identically ordered in every file
    (the function does not verify this).

    Side effect: each loaded DataFrame is also published in this module's
    global namespace as ``df1``, ``df2``, ... (1-based, sorted file order).
    Later notebook cells read those names, so the behaviour is kept.

    Parameters
    ----------
    path : str
        Directory containing the result CSV files.
    prefix : str
        File-name prefix selecting the files to compare.
    min_distinct : int, default 3
        Minimum number of distinct non-missing labels a row must have across
        the files to be reported. The default reproduces the original
        hard-coded ``nunique > 2`` test.
        NOTE(review): the original comment described the check as "labels
        differ between files", which would be ``min_distinct=2`` -- confirm
        which threshold is intended.

    Returns
    -------
    list
        Values of the first file's ``text`` column for the reported rows;
        an empty list when no file matches.
    """
    # Restrict to CSV files, as the sibling loaders do, so a stray non-CSV
    # file sharing the prefix is not parsed as a result file.
    files = sorted(
        f for f in os.listdir(path)
        if f.startswith(prefix) and f.lower().endswith('.csv')
    )

    # Read every file exactly once; the full frames feed both the comparison
    # and the df1..dfN globals.
    frames = [pd.read_csv(os.path.join(path, file)) for file in files]
    dfs = [frame[['text', 'extraction_complete']].reset_index(drop=True) for frame in frames]

    for i, frame in enumerate(frames, start=1):
        globals()[f'df{i}'] = frame

    if not dfs:
        return []

    # 'text' is taken from the first file only (see the assumption above).
    texts = dfs[0]['text']

    # One column of extraction_complete values per file, aligned by row.
    extraction_values = pd.concat([df['extraction_complete'] for df in dfs], axis=1)
    extraction_values.columns = files

    # nunique() ignores missing values, so a row only counts the labels that
    # are actually present.
    inconsistent_mask = extraction_values.nunique(axis=1) >= min_distinct

    return texts[inconsistent_mask].tolist()


In [None]:
# Number of texts that get_texts_with_inconsistent_extraction() reports for
# this folder/prefix. The call also (re)creates the df1..dfN globals.
len(get_texts_with_inconsistent_extraction(folder, prefix))

In [None]:
# Preview the first result file. `df1` is not assigned in any cell: it is
# injected into globals by get_texts_with_inconsistent_extraction(), so that
# function must have been called first.
df1.head()


Unnamed: 0,text,output,logprobs,top_logprobs,original_label,was_in_selected_samples,extracted_label,extracted_label_by_llm,extraction_complete
0,Title: i have a three month old baby and she g...,[Health],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.7...",Health,False,Health,,Health
1,Title: i want to view waec2006 question paper?...,[Education & Reference],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.4...",Science & Mathematics,False,Education & Reference,,Education & Reference
2,"Title: (6a squared+1)+(8a squared + 3a + 2=?, ...",[Science & Mathematics],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.1...",Science & Mathematics,False,Science & Mathematics,,Science & Mathematics
3,"Title: Robert F. Kennedy?, Content: I need him...",[Politics & Government],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -1.1...",Politics & Government,False,Politics & Government,,Politics & Government
4,"Title: I'm trying not to call my ex...!?, Cont...",Summarize the text: The text is about the post...,"{'content': [{'token': 'Summ', 'bytes': [83, 1...","[{'token': 'Summ', 'bytes': [83, 117, 109, 109...",Family & Relationships,False,Family & Relationships,,Family & Relationships


In [None]:
# Preview the second result file (`df2` is injected into globals by
# get_texts_with_inconsistent_extraction()).
df2.head()

Unnamed: 0,text,output,logprobs,top_logprobs,original_label,was_in_selected_samples,extracted_label,extracted_label_by_llm,extraction_complete
0,Title: i have a three month old baby and she g...,[Health],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.8...",Health,False,Health,,Health
1,Title: i want to view waec2006 question paper?...,[Education & Reference],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.4...",Science & Mathematics,False,Education & Reference,,Education & Reference
2,"Title: (6a squared+1)+(8a squared + 3a + 2=?, ...",[Science & Mathematics],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.1...",Science & Mathematics,False,Science & Mathematics,,Science & Mathematics
3,"Title: Robert F. Kennedy?, Content: I need him...",Summarize the text: The text is about Robert F...,"{'content': [{'token': 'Summ', 'bytes': [83, 1...","[{'token': 'Summ', 'bytes': [83, 117, 109, 109...",Politics & Government,False,Politics & Government,,Politics & Government
4,"Title: I'm trying not to call my ex...!?, Cont...",Summarize the text: The text is about the post...,"{'content': [{'token': 'Summ', 'bytes': [83, 1...","[{'token': 'Summ', 'bytes': [83, 117, 109, 109...",Family & Relationships,False,Family & Relationships,,Family & Relationships


In [None]:
# Preview the third result file (`df3` is injected into globals by
# get_texts_with_inconsistent_extraction()).
df3.head()

Unnamed: 0,text,output,logprobs,top_logprobs,original_label,was_in_selected_samples,extracted_label,extracted_label_by_llm,extraction_complete
0,Title: i have a three month old baby and she g...,[Health],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.8...",Health,False,Health,,Health
1,Title: i want to view waec2006 question paper?...,[Education & Reference],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.4...",Science & Mathematics,False,Education & Reference,,Education & Reference
2,"Title: (6a squared+1)+(8a squared + 3a + 2=?, ...",[Science & Mathematics],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.1...",Science & Mathematics,False,Science & Mathematics,,Science & Mathematics
3,"Title: Robert F. Kennedy?, Content: I need him...",[Politics & Government],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.8...",Politics & Government,False,Politics & Government,,Politics & Government
4,"Title: I'm trying not to call my ex...!?, Cont...",Summarize the text: The text is about someone ...,"{'content': [{'token': 'Summ', 'bytes': [83, 1...","[{'token': 'Summ', 'bytes': [83, 117, 109, 109...",Family & Relationships,False,Family & Relationships,,Family & Relationships


In [None]:
# Preview the fourth result file (`df4` is injected into globals by
# get_texts_with_inconsistent_extraction()).
df4.head()

Unnamed: 0,text,output,logprobs,top_logprobs,original_label,was_in_selected_samples,extracted_label,extracted_label_by_llm,extraction_complete
0,Title: i have a three month old baby and she g...,The text is about a parent seeking advice on h...,"{'content': [{'token': 'The', 'bytes': [84, 10...","[{'token': 'The', 'bytes': [84, 104, 101], 'lo...",Health,False,Health,,Health
1,Title: i want to view waec2006 question paper?...,[Education & Reference],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.4...",Science & Mathematics,False,Education & Reference,,Education & Reference
2,"Title: (6a squared+1)+(8a squared + 3a + 2=?, ...",[Science & Mathematics],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.1...",Science & Mathematics,False,Science & Mathematics,,Science & Mathematics
3,"Title: Robert F. Kennedy?, Content: I need him...",Summarize the text: The text is about Robert F...,"{'content': [{'token': 'Summ', 'bytes': [83, 1...","[{'token': 'Summ', 'bytes': [83, 117, 109, 109...",Politics & Government,False,Politics & Government,,Politics & Government
4,"Title: I'm trying not to call my ex...!?, Cont...",Summarize the text: The text is about the chal...,"{'content': [{'token': 'Summ', 'bytes': [83, 1...","[{'token': 'Summ', 'bytes': [83, 117, 109, 109...",Family & Relationships,False,Family & Relationships,,Family & Relationships


In [None]:
# Preview the fifth result file (`df5` is injected into globals by
# get_texts_with_inconsistent_extraction()).
df5.head()

Unnamed: 0,text,output,logprobs,top_logprobs,original_label,was_in_selected_samples,extracted_label,extracted_label_by_llm,extraction_complete
0,Title: i have a three month old baby and she g...,[Health],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.8...",Health,False,Health,,Health
1,Title: i want to view waec2006 question paper?...,[Education & Reference],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.4...",Science & Mathematics,False,Education & Reference,,Education & Reference
2,"Title: (6a squared+1)+(8a squared + 3a + 2=?, ...",[Science & Mathematics],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.1...",Science & Mathematics,False,Science & Mathematics,,Science & Mathematics
3,"Title: Robert F. Kennedy?, Content: I need him...",[Politics & Government],"{'content': [{'token': '[', 'bytes': [91], 'lo...","[{'token': '[', 'bytes': [91], 'logprob': -0.8...",Politics & Government,False,Politics & Government,,Politics & Government
4,"Title: I'm trying not to call my ex...!?, Cont...",Summarize the text: The text is about someone ...,"{'content': [{'token': 'Summ', 'bytes': [83, 1...","[{'token': 'Summ', 'bytes': [83, 117, 109, 109...",Family & Relationships,False,Family & Relationships,,Family & Relationships
