In [3]:
import os
import pandas as pd
from collections import defaultdict
import re

# Base input path: one sub-folder per year, each holding raw cyberlib exports
base_input_folder = 'input/cyberlib'


def month_from_filename(filename, year):
    """Return the two-digit month encoded in a raw cyberlib export filename.

    A filename qualifies when it looks like ``cyberlib_raw_<year>-<MM>....xls``.

    Parameters:
        filename: bare file name (no directory part).
        year: year expected right after the ``cyberlib_raw_`` prefix.

    Returns:
        The month as a two-character string (e.g. ``'03'``), or ``None`` when
        the filename does not follow the expected pattern for that year.
    """
    # fullmatch anchors both ends, so a single pattern replaces the separate
    # extension check and the two prefix/suffix regexes.
    match = re.fullmatch(rf'cyberlib_raw_{year}-(\d{{2}}).*\.xls', filename)
    return match.group(1) if match else None


def read_cyberlib_export(file_path):
    """Load one raw export into a DataFrame.

    The file is first parsed as a legacy Excel workbook (xlrd engine). If that
    fails, it is parsed as HTML and the first table found is used.

    Parameters:
        file_path: path of the export to read.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: when both parsers fail; the message carries both the Excel
            and the HTML error so the root cause is not hidden.
    """
    try:
        return pd.read_excel(file_path, engine='xlrd')
    except Exception as excel_error:
        try:
            return pd.read_html(file_path)[0]
        except Exception as html_error:
            raise ValueError(
                f"Excel error: {excel_error}; HTML error: {html_error}"
            ) from html_error


# Row counts per year: {year: {month: rows}}
results = {}
# (year, filename) of every export that could not be read, so that incomplete
# totals can be recognised instead of being silently under-counted
failed_files = []

# Loop through years
for year in range(2021, 2026):
    year_path = os.path.join(base_input_folder, str(year))
    # isdir (not exists): a stray file with a year-like name must not reach listdir
    if not os.path.isdir(year_path):
        continue

    print(f"\nProcessing year: {year}")
    monthly_counts = defaultdict(int)

    # Sorted only to make the order of any failure messages reproducible
    for filename in sorted(os.listdir(year_path)):
        month = month_from_filename(filename, year)
        if month is None:
            continue

        file_path = os.path.join(year_path, filename)
        try:
            df = read_cyberlib_export(file_path)
        except ValueError as read_error:
            print(f"❌ Failed to read {filename}: {read_error}")
            failed_files.append((year, filename))
            continue  # Skip this file

        # Several files may belong to the same month; their rows are summed
        monthly_counts[month] += len(df)

    results[year] = dict(sorted(monthly_counts.items()))

# Print final summary
for year, months in results.items():
    print(f"\n📅 Year: {year}")
    for month, count in sorted(months.items()):
        print(f"  Month {month}: {count} rows")

if failed_files:
    print(f"\n⚠️ {len(failed_files)} file(s) could not be read; their rows are missing from the counts above.")


Processing year: 2021

Processing year: 2022

Processing year: 2023

Processing year: 2024

Processing year: 2025

📅 Year: 2021
  Month 01: 23895 rows
  Month 02: 22904 rows
  Month 03: 27701 rows
  Month 04: 26166 rows
  Month 05: 21324 rows
  Month 06: 25400 rows
  Month 07: 26485 rows
  Month 08: 27926 rows
  Month 09: 28874 rows
  Month 10: 28587 rows
  Month 11: 26889 rows
  Month 12: 25629 rows

📅 Year: 2022
  Month 01: 25005 rows
  Month 02: 22936 rows
  Month 03: 27615 rows
  Month 04: 24693 rows
  Month 05: 16195 rows
  Month 06: 18865 rows
  Month 07: 19206 rows
  Month 08: 19515 rows
  Month 09: 24675 rows
  Month 10: 23941 rows
  Month 11: 26064 rows
  Month 12: 22486 rows

📅 Year: 2023
  Month 01: 22897 rows
  Month 02: 23186 rows
  Month 03: 23613 rows
  Month 04: 15934 rows
  Month 05: 18327 rows
  Month 06: 16273 rows
  Month 07: 21672 rows
  Month 08: 21389 rows
  Month 09: 20168 rows
  Month 10: 22701 rows
  Month 11: 22678 rows
  Month 12: 21432 rows

📅 Year: 2024
 

In [4]:
# Report the per-month totals again, year by year, labelled as news items
for year in results:
    months = results[year]
    print(f"\n📅 Year: {year}")
    # Month keys are unique, so ordering by key matches ordering by (key, value)
    for month in sorted(months):
        count = months[month]
        print(f"  Month {month}: {count} news")


📅 Year: 2021
  Month 01: 23895 news
  Month 02: 22904 news
  Month 03: 27701 news
  Month 04: 26166 news
  Month 05: 21324 news
  Month 06: 25400 news
  Month 07: 26485 news
  Month 08: 27926 news
  Month 09: 28874 news
  Month 10: 28587 news
  Month 11: 26889 news
  Month 12: 25629 news

📅 Year: 2022
  Month 01: 25005 news
  Month 02: 22936 news
  Month 03: 27615 news
  Month 04: 24693 news
  Month 05: 16195 news
  Month 06: 18865 news
  Month 07: 19206 news
  Month 08: 19515 news
  Month 09: 24675 news
  Month 10: 23941 news
  Month 11: 26064 news
  Month 12: 22486 news

📅 Year: 2023
  Month 01: 22897 news
  Month 02: 23186 news
  Month 03: 23613 news
  Month 04: 15934 news
  Month 05: 18327 news
  Month 06: 16273 news
  Month 07: 21672 news
  Month 08: 21389 news
  Month 09: 20168 news
  Month 10: 22701 news
  Month 11: 22678 news
  Month 12: 21432 news

📅 Year: 2024
  Month 01: 20824 news
  Month 02: 17196 news
  Month 03: 26154 news
  Month 04: 16456 news
  Month 05: 18972 news
 

In [2]:
import os
import pandas as pd
from collections import defaultdict
import re

# Export to inspect
file_path = 'input/cyberlib/2023/cyberlib_raw_2023-11-01_2023-11-30.xls'

# First attempt: parse as a legacy Excel workbook. The error, if any, is kept
# so it can still be reported after the HTML fallback has been tried.
excel_error = None
try:
    dfread = pd.read_excel(file_path, engine='xlrd')
except Exception as e1:
    excel_error = e1

if excel_error is None:
    print("✅ Berhasil membaca file sebagai Excel.")
else:
    # Second attempt: parse as HTML and keep only the first table
    try:
        dfread = pd.read_html(file_path)[0]
    except Exception as e2:
        print("❌ Gagal membaca file sebagai Excel maupun HTML.")
        print("🔹 Error Excel:", excel_error)
        print("🔹 Error HTML:", e2)
        dfread = None  # neither parser worked; nothing was loaded
    else:
        print("⚠️ Gagal baca sebagai Excel, tapi berhasil sebagai HTML.")

⚠️ Gagal baca sebagai Excel, tapi berhasil sebagai HTML.


In [3]:
# Show the first five rows of the loaded export as plain text
preview_rows = dfread.head(5)
print(preview_rows)

   No         Date       Media   Author    Page  \
0   1  28 Nov 2023  okezonecom  Okezone  Online   
1   2  28 Nov 2023     Waspada      NaN      B8   
2   3  28 Nov 2023     Waspada    (hO2)      B6   
3   4  28 Nov 2023     Waspada    (cpb)      B3   
4   5  28 Nov 2023     Waspada    (m22)      B2   

                                               Title  \
0  Sederet Penyanyi Lokal dan Internasional akan ...   
1           Pemkab DS Percepat Akses Keuangan Daerah   
2  Peringatan HGN Di Al Azhar Dan SMP Ar Rahman P...   
3   Pemprovsu Diminta Percepat Pembangunan Di Daerah   
4  PGMI Sumut Beri Penghargaan Kepada Guru Honor ...   

                                             Content  Colomn  
0  SPOTIFY Wrapped Live Indonesia 2023 untuk pert...     NaN  
1  Pemkab DS Percepat Akses Keuangan Daerah LIJBU...     3.0  
2  Peringatan HGN Di Al Azhar Dan SMP Ar Rahman P...     4.0  
3  Pemprovsu Diminta Percepat Pembangunan Di Daer...     3.0  
4  PGMI Sumut Beri Penghargaan Kepada Gur

In [4]:
# Distinct values of the 'Media' column.
# Missing values are dropped first: unique() would otherwise list NaN as an
# entry while nunique() ignores it by default, so the printed list and the
# printed count could disagree.
unique_media = dfread['Media'].dropna().unique()

# Number of distinct media, derived from the same values that are listed
count_unique_media = len(unique_media)

# Print the results
print("🎯 Daftar media unik:")
print(unique_media)

print(f"\n📊 Jumlah total media unik: {count_unique_media}")

🎯 Daftar media unik:
['okezonecom' 'Waspada' 'infobanknews.com' 'liputan6com' 'jpnn.com'
 'Detikcom' 'Antaranewscom / LKBN Antara' 'Bisnis.com'
 'jurnalnasional.com' 'iqplus.info' 'dunia-energi.com' 'Jawa Pos' 'Kompas'
 'The Jakarta Post' 'Rakyat Merdeka' 'Harian Ekonomi Neraca'
 'Investor Daily Indonesia' 'Harian Kontan' 'Pikiran Rakyat'
 'Media Indonesia' 'Koran Jakarta' 'Bisnis Indonesia' 'Kedaulatan Rakyat'
 'Harian Singgalang' 'Tabloid Kontan']

📊 Jumlah total media unik: 25
