In [5]:
import pandas as pd
import numpy as np
from typing import List, Tuple
import os

In [9]:
# Resolve the data directory under the current working directory: it holds
# the Excel workbook and an "articles" folder of text files.
main_path = os.path.join(os.getcwd(), 'cnbc_news_20240521000948', 'sk')
xlex_path = os.path.join(main_path, "info_sk.xlsx")
articles_path = os.path.join(main_path, "articles")

# Ids recorded in the workbook, cast to str so they compare equal to file stems.
df = pd.read_excel(xlex_path)
id_list = df["id"].astype(str).tolist()

# Stems of the ".txt" files on disk. splitext removes only the final
# extension, so a name that itself contains a dot is not truncated.
file_list = [os.path.splitext(file)[0]
             for file in os.listdir(articles_path) if file.endswith(".txt")]

print(f"id list sample: {id_list[:5]}")
print(f"file list sample: {file_list[:5]}")

id list sample: ['107406076', '107396582', '107364147', '107323331', '107314135']
file list sample: ['106965515', '107323331', '107406076', '106977162', '107412516']


In [10]:
# Report raw vs. de-duplicated sizes of both lists. The file count is taken
# from file_list (".txt" files only) so that it describes the same collection
# as its de-duplicated counterpart, not every entry in the directory.
print(f"id list length: {len(id_list)}, 중복제거: {len(set(id_list))}")
print(f"file list length: {len(file_list)}, "
      f"중복제거: {len(set(file_list))}")

id list length: 15, 중복제거: 15
file list length: 15, 중복제거: 15


In [11]:
def compare_lists(list1, list2):
    """Partition the distinct elements of two iterables by membership.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        A dict with three list values (duplicates removed, order unspecified):
        "Common" (items present in both), "Only in list1" and "Only in list2".
    """
    first, second = set(list1), set(list2)

    return {
        # Intersection: items found in both inputs.
        "Common": list(first.intersection(second)),
        # Differences: items exclusive to one side.
        "Only in list1": list(first.difference(second)),
        "Only in list2": list(second.difference(first)),
    }


# Run the comparison and print each group together with its size.
result = compare_lists(id_list, file_list)

shared_ids = result["Common"]
ids_without_file = result["Only in list1"]
files_without_id = result["Only in list2"]

print(f"공통 요소: ({len(shared_ids)}개)", shared_ids)
print(f"id_list에만 존재하는 요소: ({len(ids_without_file)}개)", ids_without_file)
print(f"file_list에만 존재: ({len(files_without_id)}개)", files_without_id)

공통 요소: (15개) ['107406076', '107364147', '107412516', '106965515', '106977162', '107314135', '107094416', '107323331', '107411941', '106947905', '107102994', '106921570', '107396582', '107007756', '106917216']
id_list에만 존재하는 요소: (0개) []
file_list에만 존재: (0개) []


In [6]:
# Among the files that have no matching id in the workbook, separate the
# zero-byte ones from those that actually contain text.
empty_file_list = [
    file for file in result["Only in list2"]
    # Files live under articles_path (not the working directory). A size check
    # detects an empty file without having to decode its content.
    if os.path.getsize(os.path.join(articles_path, f"{file}.txt")) == 0
]

# Unmatched files that are NOT empty: everything in "Only in list2" minus the
# empty ones (the subtraction must go in this direction to be meaningful).
not_empty_file_list = [
    file for file in result["Only in list2"] if file not in set(empty_file_list)
]
print(
    f"not empty file list: ({len(not_empty_file_list)}개) {not_empty_file_list}")

not empty file list: (0개) []


In [10]:
# Drop rows whose id repeats an earlier row (the first occurrence is kept).
print(f"df length before removing duplicated id: {len(df)}")
df = df.drop_duplicates(subset="id", keep='first')
print(f"df length after removing duplicated id: {len(df)}")
# Save back to the workbook df was loaded from, instead of a path relative to
# the working directory, so the cleaned data replaces the file that is read.
df.to_excel(xlex_path, index=False)

df length before removing duplicated id: 29
df length after removing duplicated id: 29


In [8]:
# remove if file is empty
def remove_if_file_is_empty(file_list: List[str]) -> None:
    """Delete every file in ``file_list`` that has a size of zero bytes.

    Files that contain data are left in place and reported as skipped, so a
    stale or wrong list cannot delete real content.

    Args:
        file_list: Paths of the candidate files.

    Raises:
        OSError: If a path does not exist or cannot be inspected/removed.
    """
    for file in file_list:
        # Re-check on disk right before deleting rather than trusting the caller.
        if os.path.getsize(file) != 0:
            print(f"{file} skipped (not empty)")
            continue
        os.remove(file)
        print(f"{file} removed")


# Build the paths from articles_path so the removal targets the same directory
# the file names were listed from, regardless of the working directory.
remove_if_file_is_empty(
    [os.path.join(articles_path, f"{file}.txt") for file in empty_file_list])