In [1]:
import os
import pandas as pd
import json
import re
from pathlib import Path

# Widen displayed columns to 400 characters so long text values are not cut off in cell output.
pd.set_option('max_colwidth', 400)

In [2]:
# Load the regulations table from CSV.
# NOTE(review): df_reg is not referenced again in the cells visible here — confirm it is still needed.
df_reg = pd.read_csv('csv/regulations.csv')
# df_reg.head()

In [3]:
# Load the per-file table; it is the left-hand side of the later merge on 'title'.
df = pd.read_csv('csv/data_file.csv')
# df.head()
# print(len(df))

In [4]:
# Load the detail table; it is joined onto df by 'title' further down.
df2 = pd.read_csv('csv/data_detail.csv')
# df2.head()

In [5]:
# Collect every df2 row whose 'title' occurs more than once (keep=False flags all copies, not just the later ones).
# Worth inspecting because the merge on 'title' emits one output row per matching df2 row.
duplicate_titles = df2[df2.duplicated(subset='title', keep=False)]
# duplicate_titles

In [5]:
# Left-join the detail columns onto df by 'title': every df row is kept, unmatched rows get NaN in the df2 columns.
# NOTE(review): repeated titles in df2 would duplicate df rows here — consider validate='many_to_one' once confirmed safe.
df3 = df.merge(df2, how='left', on='title')

In [6]:
# Number of merged rows per regulation type (displayed as the cell output).
df3['type_of_regulation'].value_counts()

type_of_regulation
Bank Indonesia Circular Letters                391
Bank Indonesia Regulation                      339
Member Of The Board Of Governors Regulation     27
Name: count, dtype: int64

In [166]:
# df3.loc[df3['type_of_regulation'] == 'Bank Indonesia Circular Letters'].tail()

In [57]:
# df3.loc[df3['file_name'] == "Appendix"]

# Standardizing File Names

In [16]:
# Translate a regulation type into the short code used in standardized file names
def format_type_of_regulation(regulation_type):
    """Return the abbreviation for ``regulation_type``.

    Known types map to "pbi", "padg" or "sebi"; anything else yields
    the placeholder "unknown".
    """
    abbreviations = dict([
        ("Bank Indonesia Regulation", "pbi"),
        ("Member Of The Board Of Governors Regulation", "padg"),
        ("Bank Indonesia Circular Letters", "sebi"),
    ])
    if regulation_type in abbreviations:
        return abbreviations[regulation_type]
    return "unknown"

# Pull the regulation number out of a title and normalise it for use in a file name
def format_number(title, type_of_regulation):
    """Return the regulation number found in ``title`` in file-name form.

    Patterns are tried in a fixed order and the first hit wins:

    1. a slash-separated identifier with three slashes (``a/b/c/d``);
    2. a slash-separated identifier with two slashes (``x/y/z``);
    3. a number introduced by NUMBER / NO / NOMOR / NR (case-insensitive);
    4. for 'Bank Indonesia Regulation' only, ``NUMBER <n> [OF] <year>``,
       where the number and the year digits are concatenated.

    The matched text is passed through ``format_title``. If nothing matches,
    the placeholder "unknown" is returned.

    NOTE(review): step 4 looks unreachable — any title it would match is
    already caught by the keyword pattern in step 3, which returns the number
    without the year. Confirm the intended output before reordering.
    """
    # Slash-separated identifiers, most specific first.
    for slash_pattern in (r'\d+/\d+/\w+/\d+', r'\d+/\d+/\w+'):
        slash_hit = re.search(slash_pattern, title)
        if slash_hit:
            return format_title(slash_hit.group(0))

    # Keyword-introduced numbers; group 2 holds the number itself.
    keyword_hit = re.search(r'(NUMBER[:.]?|NO\.?|NOMOR|NR\.?) ?(\d+(?:/\d+)*(/\w+)*/\d+|\d+)', title, re.IGNORECASE)
    if keyword_hit:
        return format_title(keyword_hit.group(2))

    # Only 'Bank Indonesia Regulation' gets the extra number-plus-year attempt.
    if type_of_regulation != 'Bank Indonesia Regulation':
        return "unknown"

    with_year = re.search(r'NUMBER (\d+(?:/\d+)*(/\w+)*/\d+|\d+) (?:OF )?(\d+)?', title, re.IGNORECASE)
    if with_year is None:
        return "unknown"

    base, year = with_year.group(1), with_year.group(3)
    return format_title(base + year if year else base)

# Turn a "<day> <Month> <year>" string into a compact DDMMYYYY-style token
def format_date(date_str):
    """Convert a date such as "5 January 2020" into "05012020".

    ``date_str`` must split on whitespace into exactly three parts: a day
    number, a full English month name, and a year. The day is zero-padded to
    two digits, the month becomes its two-digit number, and the year is
    appended unchanged.

    Raises:
        ValueError: if the string does not have three parts or the day is
            not an integer.
        KeyError: if the month name is not a capitalised full English name.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    month_codes = {
        name: f"{position:02d}"
        for position, name in enumerate(month_names, start=1)
    }

    day_part, month_part, year_part = date_str.split()
    day_code = f"{int(day_part):02d}"
    month_code = month_codes[month_part]
    return day_code + month_code + year_part

# Reduce free text to a lowercase, underscore-separated token safe for file names
def format_title(title):
    """Return ``title`` as a file-name-friendly slug of at most 250 characters.

    The text is lower-cased; spaces, slashes and hyphens become underscores;
    every remaining character outside ``a-z``, ``0-9`` and ``_`` is removed;
    the result is truncated to its first 250 characters.
    """
    separators_to_underscore = str.maketrans({' ': '_', '/': '_', '-': '_'})
    slug = title.lower().translate(separators_to_underscore)
    slug = re.sub(r'[^a-z0-9_]', '', slug)
    return slug[:250]

# Assemble the standardized file name for one merged row
def generate_standardized_file_name(row):
    """Build ``<type>-<number>-<date>-<title>.<extension>`` for ``row``.

    ``row`` must provide 'type_of_regulation', 'title', 'date' and
    'file_link'. The extension is whatever follows the last '.' in
    'file_link' (the whole link if it contains no dot).

    NOTE(review): the title part alone may be up to 250 characters, so the
    full name can exceed common file-name length limits — confirm before
    using it on disk.
    """
    name_parts = [
        format_type_of_regulation(row['type_of_regulation']),
        format_number(row['title'], row['type_of_regulation']),
        format_date(row['date']),
        format_title(row['title']),
    ]
    extension = row['file_link'].rsplit('.', 1)[-1]
    return '-'.join(name_parts) + '.' + extension

# Compute one standardized name per row (row-wise apply) and store it as a new column on df3 in place.
df3['standardized_file_name'] = df3.apply(generate_standardized_file_name, axis=1)

In [19]:
# Rows whose generated name contains the "unknown" placeholder (unmapped type or no number found).
# A title that itself contains the word "unknown" would also be selected by this substring test.
unknown_rows = df3[df3['standardized_file_name'].str.contains("unknown", na=False)]
# Show the affected titles for 'Member Of The Board Of Governors Regulation' only.
unknown_rows.loc[unknown_rows['type_of_regulation'] == 'Member Of The Board Of Governors Regulation']['title']

Series([], Name: title, dtype: object)

In [21]:
# Export the merged frame twice: as a JSON list of row records and as a CSV without the index column.
# NOTE(review): NaN cells left by the left merge would be serialised by json.dump as bare NaN tokens,
# which strict JSON parsers reject — confirm whether downstream readers tolerate that.
df3_dict = df3.to_dict('records')
with open('metadata.json', 'w') as file:
    json.dump(df3_dict, file)
df3.to_csv('csv/data_final.csv', index=False)

# File Counts and Folder Size

In [10]:
# Count the files sitting directly in files/: the first tuple from os.walk describes the top
# directory only, so files in subfolders are not included.
_, _, files = next(os.walk("files/"))
file_count = len(files)
print(file_count)

757


In [11]:
# Same count for extracted_files/ (top level only). Rebinds `files` and `file_count` from the previous cell.
_, _, files = next(os.walk("extracted_files/"))
file_count = len(files)
print(file_count)

238


In [12]:
def get_folder_size(folder):
    """Return the combined size of all files under ``folder`` as a ``ByteSize``.

    The tree is walked recursively. Only entries that resolve to regular
    files are counted: directory entries carry their own ``st_size`` and
    would inflate the total, and a dangling symlink would make ``stat()``
    raise. An empty or file-less tree yields ``ByteSize(0)``.

    Args:
        folder: Path (str or ``Path``) of the directory to measure; an empty
            string means the current working directory.
    """
    root = Path(folder)
    return ByteSize(sum(
        entry.stat().st_size
        for entry in root.rglob('*')
        if entry.is_file()  # skips directories and broken symlinks
    ))


class ByteSize(int):
    """Integer byte count that also exposes itself in larger binary units.

    The instance is a plain ``int`` (number of bytes) with extra attributes:

    * ``bytes`` / ``B``                  - the integer value itself
    * ``kilobytes`` / ``KB`` ... ``petabytes`` / ``PB`` - the value divided by
      successive powers of 1024 (``KB`` = 1024**1 ... ``TB`` = 1024**4,
      ``PB`` = 1024**5)
    * ``readable``                       - ``(suffix, value)`` using the largest
      unit in which the magnitude is at least 1 (``'B'`` for zero)

    ``str()`` renders ``readable`` with two decimals, e.g. ``"1.50 KB"``;
    ``format()`` applies the given format spec to the numeric part.

    Addition, subtraction and multiplication with integers return a new
    ``ByteSize``. For operand types that ``int`` does not handle (such as
    ``float``) the methods return ``NotImplemented`` so Python can fall back
    to the other operand's implementation.
    """

    _KB = 1024
    # Ordered smallest to largest; position i corresponds to 1024**i bytes.
    _suffixes = 'B', 'KB', 'MB', 'GB', 'TB', 'PB'

    def __new__(cls, *args, **kwargs):
        return super().__new__(cls, *args, **kwargs)

    def __init__(self, *args, **kwargs):
        self.bytes = self.B = int(self)
        self.kilobytes = self.KB = self.bytes / self._KB**1
        self.megabytes = self.MB = self.bytes / self._KB**2
        self.gigabytes = self.GB = self.bytes / self._KB**3
        self.terabytes = self.TB = self.bytes / self._KB**4
        self.petabytes = self.PB = self.bytes / self._KB**5

        # Walk up the units while the magnitude is still >= 1 in that unit.
        # Using abs() and an inclusive bound keeps 0, 1, exact powers of 1024
        # and negative differences from falling through to the largest unit.
        suffix = self._suffixes[0]
        for candidate in self._suffixes[1:]:
            if abs(getattr(self, candidate)) < 1:
                break
            suffix = candidate
        self.readable = suffix, getattr(self, suffix)

        super().__init__()

    def __str__(self):
        return self.__format__('.2f')

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, super().__repr__())

    def __format__(self, format_spec):
        suffix, val = self.readable
        return '{val:{fmt}} {suf}'.format(val=val, fmt=format_spec, suf=suffix)

    def _wrap(self, result):
        """Re-wrap an int result as this class, passing NotImplemented through."""
        if result is NotImplemented:
            return result
        return self.__class__(result)

    def __sub__(self, other):
        return self._wrap(super().__sub__(other))

    def __add__(self, other):
        return self._wrap(super().__add__(other))

    def __mul__(self, other):
        return self._wrap(super().__mul__(other))

    def __rsub__(self, other):
        # Reflected subtraction is ``other - self``; delegating to __sub__
        # would compute the operands the wrong way round.
        return self._wrap(super().__rsub__(other))

    def __radd__(self, other):
        return self._wrap(super().__radd__(other))

    def __rmul__(self, other):
        return self._wrap(super().__rmul__(other))
    
# Measure the notebook's working directory tree (an empty path resolves to the current directory)
# and print it in human-readable form.
size = get_folder_size("")
print(size)

707.31 MB
