In [11]:
import os
import pandas as pd
from collections import defaultdict

import os
import pandas as pd
from collections import defaultdict

import os
import pandas as pd

def find_inconsistent_texts_for_dataset(folder_path, prefix):
    """
    Find texts whose 'extraction_complete' value differs between a dataset's CSV files.

    Every CSV file in ``folder_path`` whose name starts with ``prefix`` is read,
    the usable files are stacked, and each distinct 'text' is checked for more
    than one distinct 'extraction_complete' value. Values are compared as
    strings, so a missing value (read as NaN) becomes the string 'nan' and
    counts as a value of its own.

    Files lacking the 'extraction_complete' or 'text' column are reported and
    skipped.

    Parameters:
    - folder_path (str): Directory containing the CSV files.
    - prefix (str): File-name prefix identifying the files of one dataset.
      For the prefix 'sst5', integer-valued entries are canonicalised first
      (see the inline comment below).

    Returns:
    - pd.Series: Series named 'text' holding the texts with inconsistent
      'extraction_complete' values; empty when no usable file was found.

    Raises:
    - OSError: if ``folder_path`` cannot be listed (propagated from os.listdir).
    """
    # Sorted only so that progress messages come out in a stable order.
    filepaths = [
        os.path.join(folder_path, filename)
        for filename in sorted(os.listdir(folder_path))
        if filename.startswith(prefix) and filename.endswith('.csv')
    ]

    if not filepaths:
        print(f"Brak plików dla prefiksu: {prefix}")
        return pd.Series(dtype=str, name='text')

    df_list = []
    for fp in filepaths:
        df = pd.read_csv(fp)

        if 'extraction_complete' not in df.columns:
            print(f"Brak kolumny 'extraction_complete' w pliku: {fp}")
            continue

        # Without a 'text' column the rows cannot be grouped at all.
        if 'text' not in df.columns:
            print(f"Brak kolumny 'text' w pliku: {fp}")
            continue

        labels = df['extraction_complete'].astype(str)

        if prefix == 'sst5':
            # Canonicalise integer-valued entries to a plain integer string.
            # pandas parses a column as float as soon as it holds one missing
            # value, so the same label can appear as "3" in one file and
            # "3.0" in another; both must compare equal. Anything else
            # (free text, 'nan', real fractions) is left untouched.
            stripped = labels.str.strip()
            numeric_mask = stripped.str.fullmatch(r'\d+(?:\.0+)?').astype(bool)
            canonical = (
                stripped[numeric_mask]
                .str.replace(r'\.0+$', '', regex=True)
                .astype(int)
                .astype(str)
            )
            labels = labels.copy()
            labels.loc[numeric_mask] = canonical

        # Only these two columns take part in the comparison.
        df_list.append(df[['text']].assign(extraction_complete=labels))

    if not df_list:
        print("Brak poprawnych danych.")
        return pd.Series(dtype=str, name='text')

    combined_df = pd.concat(df_list, ignore_index=True)

    # Number of distinct values seen for each text across all files.
    distinct_counts = combined_df.groupby('text')['extraction_complete'].nunique()
    inconsistent_texts = distinct_counts[distinct_counts > 1].index.tolist()

    return pd.Series(inconsistent_texts, name='text')




In [12]:
# Run configuration shared by the cells below.
model = 'cohere'  # alternatives kept from earlier runs: 'llama3', 'gpt4omini'
dataset_tag = '20_newsgroups'
N = 80  # value handed to select_mixed_examples_grouped as its N argument

# CSV read into entropy_df; the path is built from the model and dataset tag.
entropy_path = f'entropy/{model}/{dataset_tag}_{model}_no_samples_numbers_temp0_exp1.csv'

In [13]:
# Directory with the per-run result CSVs for the selected model.
# 'gpt4omini' is stored under a differently named run folder than the rest.
if model == 'gpt4omini':
    pathh = f'../COT/extracted_new/results/cot_random_samples_temp0.3/{model}'
else:
    pathh = f'../COT/extracted_new/results/cot_random_samples_cohere_temp0.3/{model}'


In [14]:
# Load the entropy table (one row per text with its original label and
# normalized entropy) from the path built in the configuration cell.
entropy_df = pd.read_csv(entropy_path)
# Bare expression: lets the notebook render the frame.
entropy_df

Unnamed: 0.1,Unnamed: 0,text,original_label,normalized_entropy
0,0,article ashish arora writes excerpts netnewssc...,talk.politics.misc,0.223335
1,1,gateway telepath modem month actually one woul...,comp.sys.ibm.pc.hardware,0.728170
2,2,anybody provide advice concerning following tw...,sci.med,0.018344
3,3,article mike silverman writes anybody know goi...,rec.sport.baseball,0.253000
4,4,article stich christian e writes installed mot...,sci.electronics,0.667292
...,...,...,...,...
1995,1995,david sternlight writes article karl barrus wr...,sci.crypt,0.350993
1996,1996,hello im looking information alphanumeric page...,sci.electronics,0.505037
1997,1997,john r daker writes would like offocially nomi...,rec.motorcycles,0.715838
1998,1998,article writes looking information concerning ...,sci.space,0.088816


In [15]:
# Texts whose 'extraction_complete' value differs between the CSV files in
# `pathh` whose names start with the dataset tag.
results = find_inconsistent_texts_for_dataset(pathh, dataset_tag)
# Bare expression: lets the notebook render the Series.
results


0      accounts antiarmenian human right violatins az...
1      accounts antiarmenian human right violations a...
2      actual algorithm classified however main thrus...
3      actually synth used jump oberheim watch video ...
4      actually two us henry fred tommy mary oh yeah ...
                             ...                        
644    xmotif gurus handling scaling x text performin...
645                    yawn church kibology first better
646    yes take interstate route exit go south miles ...
647    yet escape sequences speaking non standard dan...
648    youre willing little work make drawnbuttons wa...
Name: text, Length: 649, dtype: object

In [16]:
import numpy as np

def select_mixed_examples_grouped(entropy_df, inconsistent_texts, N=100,
                                  skip_fraction=0.1, random_state=11):
    """
    Pick up to N examples: one per entropy bucket, topped up with inconsistent texts.

    The rows of ``entropy_df`` are sorted by ascending 'normalized_entropy'
    and cut into N consecutive buckets of ``len(entropy_df) // N`` rows (rows
    left over after the last full bucket are not bucketed). The lowest-entropy
    buckets are skipped and one row is sampled from each remaining bucket.
    The skipped slots are then filled with rows whose text is listed in
    ``inconsistent_texts``, taken in order of descending entropy.

    Top-up rows never repeat a text already sampled from the buckets, nor each
    other, so the top-up cannot introduce duplicate texts. Fewer than N rows
    are returned when there are not enough eligible inconsistent texts.

    Parameters:
    - entropy_df (pd.DataFrame): Must contain the columns 'text',
      'original_label' and 'normalized_entropy'.
    - inconsistent_texts (iterable of str): Texts eligible for the top-up.
    - N (int): Target number of examples; must be even and positive.
    - skip_fraction (float): Fraction of the lowest-entropy buckets to skip;
      at least one bucket is always skipped.
    - random_state (int): Seed used for the one-row sample taken in each bucket.

    Returns:
    - pd.DataFrame: Columns 'text', 'label' (renamed from 'original_label')
      and 'normalized_entropy'; bucket samples first, top-up rows after.

    Raises:
    - AssertionError: if N is odd.
    - ValueError: if N is not positive or ``entropy_df`` has fewer than N rows.
    """
    assert N % 2 == 0, "N powinno być parzyste (dzielone przez 2)."
    if N <= 0:
        raise ValueError(f"N must be positive, got {N}.")

    num_groups = N
    group_size = len(entropy_df) // num_groups
    if group_size == 0:
        # Empty buckets cannot be sampled; fail with an explicit message.
        raise ValueError(
            f"entropy_df has {len(entropy_df)} rows; at least N={N} rows are "
            "needed to form N non-empty groups."
        )

    # Ascending entropy, so the first buckets hold the lowest-entropy rows.
    sorted_df = entropy_df.sort_values(by='normalized_entropy').reset_index(drop=True)

    groups = [
        sorted_df.iloc[i * group_size:(i + 1) * group_size]
        for i in range(num_groups)
    ]

    # Skip the lowest-entropy buckets, always at least one.
    skip_n = max(1, int(num_groups * skip_fraction))

    # One row from every remaining bucket.
    selected_df = pd.concat(
        [group.sample(n=1, random_state=random_state) for group in groups[skip_n:]],
        ignore_index=True,
    )

    # Top-up candidates: inconsistent texts that were not sampled above,
    # highest entropy first, one row per text.
    inconsistent_set = set(inconsistent_texts)
    is_candidate = (
        entropy_df['text'].isin(inconsistent_set)
        & ~entropy_df['text'].isin(selected_df['text'])
    )
    candidates = (
        entropy_df[is_candidate]
        .sort_values(by='normalized_entropy', ascending=False)
        .drop_duplicates(subset='text')
    )

    # Fill the slots left free by the skipped buckets.
    remaining = N - len(selected_df)
    uncertain_df = candidates.head(remaining)

    final_df = pd.concat([selected_df, uncertain_df], ignore_index=True)
    final_df = final_df.rename(columns={'original_label': 'label'})

    return final_df[['text', 'label', 'normalized_entropy']]



In [17]:
# Build the final sample: one text per entropy bucket plus top-up rows taken
# from the inconsistent texts in `results`, N examples at most.
final_df = select_mixed_examples_grouped(entropy_df, results, N)


In [18]:
# Bare expression: lets the notebook render the selected examples.
final_df

Unnamed: 0,text,label,normalized_entropy
0,article robert weiss writes gordon banks quote...,talk.religion.misc,0.020617
1,disclaimer fun computerized baseball game keep...,rec.sport.baseball,0.023588
2,driftwood writes totally agree point made jose...,rec.sport.baseball,0.027211
3,plan toronto area sometime summer late june ea...,rec.sport.baseball,0.033497
4,rick anderson replied letter ra article ra rob...,talk.religion.misc,0.039581
...,...,...,...
75,directory pd msdossysutl filename type length ...,comp.sys.ibm.pc.hardware,0.925905
76,article gerard odriscoll writes writes widget ...,comp.windows.x,0.899421
77,bennett todd salomon brothers inc ny wrote how...,sci.crypt,0.899076
78,nd call presentations navy scientific visualiz...,comp.graphics,0.894965


In [19]:
# Write the selected texts and labels to selected_samples/<model>/<dataset>.csv.
# The target folder is created first so the export also works on the first run
# for a model; to_csv does not create missing directories.
os.makedirs(f'selected_samples/{model}', exist_ok=True)
# The row index is written as the first, unnamed column (to_csv default).
final_df[['text', 'label']].to_csv(f'selected_samples/{model}/{dataset_tag}.csv')

In [20]:
# Number of distinct labels among the selected examples (a missing label,
# if any, is counted as its own value).
final_df['label'].nunique(dropna=False)

20