In [1]:
import pandas as pd
from urllib.parse import urlparse
import os
import gc


In [2]:
def load_or_create_csv(file_path, columns, **read_csv_kwargs):
    """Load a CSV file, or return an empty frame when there is nothing to load.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.
    columns : list of str
        Column labels of the empty DataFrame returned when the file is
        missing or has no content.
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to ``pandas.read_csv``
        (for example ``sep=";"`` for semicolon-delimited files).

    Returns
    -------
    pandas.DataFrame
        The parsed file, or an empty DataFrame whose columns are ``columns``.
    """
    if not os.path.exists(file_path):
        return pd.DataFrame(columns=columns)
    try:
        return pd.read_csv(file_path, **read_csv_kwargs)
    except pd.errors.EmptyDataError:
        # A file with no content has no header row to parse; treat it like a missing file.
        return pd.DataFrame(columns=columns)


In [3]:
# CSV that receives the running per-(site, owner) article counts.
output_file = "unique_sites_count.csv"

# CSV that receives the articles published by the trusted owners.
output_file_trusted = "../data/trusted_sites.csv"

# Owner names whose articles are kept; both capitalisations of the BBC name are listed.
trusted_owners = {
    "ТСН",
    "Суспільне | Новини",
    "Лачен пише",
    "Bihus.Info",
    "BBC News Україна",
    "BBC NEWS Україна",
}

In [None]:
# Input shards: news_part_1.csv ... news_part_23.csv inside ../data.
file_list = [os.path.join("..", "data", f"news_part_{i}.csv") for i in range(1, 24)]

# Start from the counts already saved in output_file, if that file exists.
site_counts = load_or_create_csv(output_file, ["site", "owner", "count"])

# The trusted-sites file is written with sep=";", so it must be parsed with the
# same delimiter; the default comma delimiter would split on commas inside the text.
trusted_sites = (
    pd.read_csv(output_file_trusted, sep=";")
    if os.path.exists(output_file_trusted)
    else pd.DataFrame(columns=["url", "owner", "text"])
)

In [None]:
# Fold every input file's (site, owner) article counts into the running totals
# and save the totals to output_file after each file.
for file in file_list:
    df = pd.read_csv(file).dropna()
    # The network location of the article URL identifies the site.
    df["site"] = df["url"].map(lambda link: urlparse(link).netloc)

    df_counts = df[["site", "owner"]].value_counts().reset_index()
    df_counts.columns = ["site", "owner", "count"]

    # Merge this file's counts with the totals so far, then order by count, largest first.
    site_counts = (
        pd.concat([site_counts, df_counts])
        .groupby(["site", "owner"], as_index=False)
        .sum()
        .dropna()
        .sort_values(by="count", ascending=False)
    )

    site_counts.to_csv(output_file, index=False)
    print(f"Processed {file}")

    # Release the per-file frames before the next file is read.
    del df, df_counts
    gc.collect()

print("Data preprocessing completed")

Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_1.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_2.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_3.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_4.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_5.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_6.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_7.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_8.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_9.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_10.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_11.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_12.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_13.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_14.csv
Processed D:/Лаби/3 курс/2 семестр/МЛ/Проект/data/news_part_15.csv
Proc

## Collect news from trusted sites

In [12]:
# Append the articles of the trusted owners from every input file to output_file_trusted.
for file in file_list:
    df = pd.read_csv(file).dropna()

    trusted_sites = df.loc[df["owner"].isin(trusted_owners), ["url", "owner", "text"]]

    # Write the column names once, when the target file is new or empty. Without a
    # header row, a later pd.read_csv(output_file_trusted, sep=";") would consume
    # the first article as the column names.
    write_header = (
        not os.path.exists(output_file_trusted)
        or os.path.getsize(output_file_trusted) == 0
    )
    trusted_sites.to_csv(
        output_file_trusted,
        sep=";",
        encoding="utf-8",
        index=False,
        mode="a",
        header=write_header,
    )

    print(f"Processed {file}")

    # Unbind the frames before collecting; gc.collect() cannot release objects
    # that are still referenced by these names.
    del df, trusted_sites
    gc.collect()

print("Фільтрація завершена, всі файли оброблено.")

Processed ..\data\news_part_1.csv
Processed ..\data\news_part_2.csv
Processed ..\data\news_part_3.csv
Processed ..\data\news_part_4.csv
Processed ..\data\news_part_5.csv
Processed ..\data\news_part_6.csv
Processed ..\data\news_part_7.csv
Processed ..\data\news_part_8.csv
Processed ..\data\news_part_9.csv
Processed ..\data\news_part_10.csv
Processed ..\data\news_part_11.csv
Processed ..\data\news_part_12.csv
Processed ..\data\news_part_13.csv
Processed ..\data\news_part_14.csv
Processed ..\data\news_part_15.csv
Processed ..\data\news_part_16.csv
Processed ..\data\news_part_17.csv
Processed ..\data\news_part_18.csv
Processed ..\data\news_part_19.csv
Processed ..\data\news_part_20.csv
Processed ..\data\news_part_21.csv
Processed ..\data\news_part_22.csv
Processed ..\data\news_part_23.csv
Фільтрація завершена, всі файли оброблено.


### Split the trusted news file into smaller parts (about 98 MB each)

In [8]:
# NOTE(review): this directory is relative to the working directory, while the
# other data paths in this notebook point to "../data" — confirm it is intended.
output_dir = "data/splited_trusted_news"
os.makedirs(output_dir, exist_ok=True)

# Shuffle the rows with a fixed seed so the split is reproducible.
df = (
    pd.read_csv(output_file_trusted, sep=";")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Target size per chunk. Rows per chunk are estimated from the average in-memory
# row size, so the size of the written CSV files is only approximate.
chunk_size = 99 * 1024 * 1024
avg_row_bytes = df.memory_usage(deep=True).sum() / len(df) if len(df) else 0
# Always keep at least one row per chunk: a step of 0 is invalid for range(),
# and an empty frame must not trigger a division by zero.
rows_per_chunk = max(1, int(chunk_size / avg_row_bytes)) if avg_row_bytes else 1

for i, start in enumerate(range(0, len(df), rows_per_chunk)):
    df.iloc[start:start + rows_per_chunk].to_csv(f"{output_dir}/trusted_news_part_{i+1}.csv", sep=";", index=False)
    print(f"Saved chunk {i+1}")

print("Data split completed")


Saved chunk 1
Saved chunk 2
Saved chunk 3
Saved chunk 4
Saved chunk 5
Saved chunk 6
Saved chunk 7
Saved chunk 8
Saved chunk 9
Data split completed
