In [24]:
import pandas as pd

In [25]:
dataset_name = "NIKL_NEWSPAPER_2023_CSV"
fname = "NEWSPAPER_2022_1"

df = pd.read_parquet(f"source/{dataset_name}/{fname}.parquet")
print(df.shape, df.columns)

(103464, 11) Index(['file_id', 'doc_id', 'title', 'author', 'publisher', 'date', 'topic',
       'original_topic', 'sentence_ids', 'sentence_offsets', 'text'],
      dtype='object')


In [26]:
topics = df.topic.unique().tolist()
print(topics)

['정치', '사회', '경제', '스포츠', '생활', '문화', '미용/건강', 'IT/과학', '연예']


In [27]:
topic_ratio = {}

for k,v in df.topic.value_counts().to_dict().items():
    print(k,v)
    topic_ratio[k] = v/df.shape[0]

print(topic_ratio)

사회 33513
경제 16340
정치 15751
생활 15104
스포츠 7564
IT/과학 5248
문화 4048
미용/건강 3819
연예 2077
{'사회': 0.32390976571561125, '경제': 0.15792932807546586, '정치': 0.15223652671460605, '생활': 0.14598314389546121, '스포츠': 0.07310755431841026, 'IT/과학': 0.050722956777236525, '문화': 0.03912471970927086, '미용/건강': 0.036911389468800744, '연예': 0.020074615325137248}


In [28]:
# By-Ratio
total_n = 15000
per_topic_n = {
    k: int(total_n*v) for k,v in topic_ratio.items()
}
print(per_topic_n)

{'사회': 4858, '경제': 2368, '정치': 2283, '생활': 2189, '스포츠': 1096, 'IT/과학': 760, '문화': 586, '미용/건강': 553, '연예': 301}


In [29]:
# # // n
# per_topic_n = {k: total_n//len(topic_ratio) for k in topic_ratio.keys()}
# print(per_topic_n)

In [30]:
sampled_df = []
for topic, count in per_topic_n.items():
    topic_df = df[df.topic==topic].sample(count)
    print(topic, topic_df.shape[0])
    
    sampled_df.append(topic_df)

sampled_df = pd.concat(sampled_df, axis=0)
print(sampled_df.shape)

사회 4858
경제 2368
정치 2283
생활 2189
스포츠 1096
IT/과학 760
문화 586
미용/건강 553
연예 301
(14994, 11)


In [31]:
sampled_df.to_parquet(f"data/{dataset_name}-{fname}-sample1.parquet")

In [32]:
sampled_df.columns

Index(['file_id', 'doc_id', 'title', 'author', 'publisher', 'date', 'topic',
       'original_topic', 'sentence_ids', 'sentence_offsets', 'text'],
      dtype='object')

In [33]:
metadata_df = sampled_df[['file_id', 'doc_id', 'topic', 'title']]
metadata_df.to_csv(f"data/{dataset_name}-{fname}-sample1-metadata.tsv", sep='\t', index=None)