In [21]:
import os
import pandas as pd

from utils.io import _is_dir, _is_file, read_tabular

## Labeled data

In [30]:
# Root of the labeled data, relative to the notebook's working directory.
base_path = '..'
data_path = os.path.join(base_path, 'data', 'labeled')

# Collect one summary record per tabular file found exactly one directory
# level below `data_path`.
desc = []
for entry in os.listdir(data_path):
    paper_dir = os.path.join(data_path, entry)
    if not _is_dir(paper_dir):
        continue
    for fname in os.listdir(paper_dir):
        fp = os.path.join(paper_dir, fname)
        # Skip anything that is not a regular file with a known tabular extension.
        if not (_is_file(fp) and fp.endswith(('.csv', '.tsv', '.tab'))):
            continue
        try:
            df = read_tabular(fp)
        except Exception as e:
            # Report unreadable files and carry on with the remaining ones.
            print(f'Error reading {fp}: {e}')
            continue
        desc.append({
            'file': fp,
            'n_rows': len(df),
            # Relative frequency of each label value, rounded to two decimals.
            'label_counts': df.label.value_counts(normalize=True).round(2).to_dict(),
            # Number of columns whose name starts with 'metadata__'.
            'n_metadata_cols': sum(1 for c in df.columns if c.startswith('metadata__')),
        })

In [31]:
# Turn the per-file summary records collected in `desc` into a table.
datasets_overview = pd.DataFrame(data=desc)

In [38]:
# Split each file path into its sub-directory name (`paper_id`) and its file
# name (`dataset_id`).
# After removing the `data_path` prefix the remainder starts with a path
# separator, so the first split field is empty; it lands in the throwaway
# column `_`.
# The paths were built with os.path.join, so split on os.sep (as a literal
# string, not a regex) rather than a hard-coded '/', which only works on
# platforms whose separator is '/'.
datasets_overview[['_', 'paper_id', 'dataset_id']] = (
    datasets_overview['file']
    .str.removeprefix(data_path)
    .str.split(os.sep, expand=True, regex=False)
)

In [42]:
# Keep only the reporting columns, identifiers first and the source path last.
datasets_overview = datasets_overview[
    [
        'paper_id',
        'dataset_id',
        'n_rows',
        'label_counts',
        'n_metadata_cols',
        'file',
    ]
]

In [44]:
# Strip the tabular file extension from `dataset_id`.
# A single end-anchored pattern removes exactly one extension; the previous
# chain of removesuffix calls could strip several in a row
# (e.g. 'x.tab.csv' -> 'x').
# .assign() returns a new frame instead of writing into the existing one,
# which avoids the SettingWithCopyWarning that pandas < 3 can emit when
# `datasets_overview` is itself a column subset of another frame.
datasets_overview = datasets_overview.assign(
    dataset_id=datasets_overview['dataset_id'].str.replace(
        r'\.(?:csv|tsv|tab)$', '', regex=True
    )
)

In [46]:
# Display the assembled overview table.
datasets_overview

Unnamed: 0,paper_id,dataset_id,n_rows,label_counts,n_metadata_cols,file
0,gilardi_chatgpt_2023,gilardi_chatgpt_2023-content_moderation_frame,1156,"{'Neither': 0.67, 'Solution': 0.16, 'Problem':...",0,../data/labeled/gilardi_chatgpt_2023/gilardi_c...
1,gilardi_chatgpt_2023,gilardi_chatgpt_2023-section230_stance,780,"{'neutral': 0.54, 'negative': 0.42, 'positive'...",0,../data/labeled/gilardi_chatgpt_2023/gilardi_c...
2,gilardi_chatgpt_2023,gilardi_chatgpt_2023-content_moderation_relevance,1315,"{0: 0.53, 1: 0.47}",0,../data/labeled/gilardi_chatgpt_2023/gilardi_c...
3,miller_active_2020,miller_active_2020-tweets_refugeetopic,24255,"{0: 0.97, 1: 0.03}",2,../data/labeled/miller_active_2020/miller_acti...
4,miller_active_2020,miller_active_2020-news_articletag_muslim_iden...,23883,"{0: 0.91, 1: 0.09}",11,../data/labeled/miller_active_2020/miller_acti...
5,barbera_automated_2021,barbera_automated_2021-econ_news_sentiment,420,"{'negative': 0.61, 'positive': 0.39}",3,../data/labeled/barbera_automated_2021/barbera...
6,bonikowski_politics_2022,bonikowski_politics_2022-campaignspeech_populism,70946,"{0: 0.97, 1: 0.03}",17,../data/labeled/bonikowski_politics_2022/bonik...
7,bonikowski_politics_2022,bonikowski_politics_2022-campaignspeech_nation...,70946,"{0: 0.91, 1: 0.09}",17,../data/labeled/bonikowski_politics_2022/bonik...
8,bonikowski_politics_2022,bonikowski_politics_2022-campaignspeech_nation...,70946,"{0: 0.96, 1: 0.04}",17,../data/labeled/bonikowski_politics_2022/bonik...
9,bonikowski_politics_2022,bonikowski_politics_2022-campaignspeech_author...,70945,"{0: 0.98, 1: 0.02}",17,../data/labeled/bonikowski_politics_2022/bonik...
