
The notebook cells below are meant to be run on a local machine.

Data cleansing process:
1. Pandas will read the excel file (xlsx)
2. Drop null data (usually the first row for each object)
3. Loop through each column, find values that contain a newline ('\n'), and replace each newline with ',' (comma)
4. Concatenate the list of DataFrames into one DataFrame per object
5. Get the column names (from the concatenated DF) and rename them (using the function `rename_column`)
6. Save the final DF as csv and set the renamed columns as the new header

In [0]:
# Reporting year; it appears in the input folder path and in every output file name.
year = "2019"
# Folder that is scanned for source files ("path_here" is presumably a
# placeholder to be replaced with the real local path).
directory = f"path_here\\{year}"
# Output path template. "<obj>" is substituted with the object name when a CSV
# is saved. Built from `directory` so the two paths cannot drift apart.
csv_path_base = f"{directory}\\<obj>-bordereaux-{year}.csv"

In [0]:
import pandas as pd

# Exact source header -> standardized name. Headers that are not listed here
# fall through to the generic normalization in _normalize_column_name.
_COLUMN_ALIASES = {
    "TransactionType": "transaction_type",
    "Transaction Type": "transaction_type",
    "Policy #": "policy_number",
    "Policy Number": "policy_number",
    "PolicyNumber": "policy_number",
    "Policy": "policy_number",
    "Previous Policy": "prior_policy_number",
    "PriorPolicyNumber": "prior_policy_number",
    "Policy Effective Date": "policy_effective_date",
    "InceptionDate": "policy_effective_date",
    "Expiration Date": "policy_expiration_date",
    "ExpirationDate": "policy_expiration_date",
    "Transaction Effective Date": "transaction_effective_date",
    "TransactionEffectiveDate": "transaction_effective_date",
    "EndorsementNumber": "endorsement_number",
    "RateMethod": "rate_method",
    "RateMod": "rate_mod",
    "StatusTypeDescription": "status_type_description",
    "PolicyTypeDescription": "policy_type_description",
    "RatingCompany": "rating_company",
    "InsuredName": "insured_name",
    "AddressLine1": "address_line_1",
    "AddressLine2": "address_line_2",
    "ZipCode": "zip_code",
    "MGUCode": "mgu_code",
    "AuditFreq": "audit_freq",
    "CommissionPerc": "commission_percentage",
    "CommissionPrec": "commission_percentage",  # misspelling seen in source data
    "GrossPremium": "gross_premium",
    "OccLmt": "occ_lmt",
    "AggLmt": "agg_lmt",
    "ClassCode": "class_code",
    "PolType": "pol_type",
    "ProRate": "pro_rate",
    "#RatEmp": "num_rat_emp",
    "TotEmp": "tot_emp",
    "CoverageDescription": "coverage_description",
    "ChangeLongDesc": "change_long_desc",
    "RetainedLimit": "retained_limit",
    "WrittenTax": "written_tax",
    "NIPCode": "nip_code",
    "CoveragePremium": "coverage_premium",
    # commercial auto
    "MAWaiver": "ma_waiver",
    "RCIndicator": "rc_indicator",
}

# Ordered substitutions applied to the stripped header before lower-casing.
_PRE_LOWER_REPLACEMENTS = (
    ("#", "num_"),
    (" (see tab)", ""),
    (" (SEE TAB)", ""),
    (" - ", "_"),
    (" -", "_"),
    ("- ", "_"),
    ("-", "_"),
    ("(", ""),
    (")", ""),
    ("%", "perc"),
)

# Ordered substitutions applied after lower-casing. The "_AND_" / "_OR_"
# markers are intentionally upper-case so they stand out in the final name.
_POST_LOWER_REPLACEMENTS = (
    (" &", "_AND_"),
    (" / ", "_OR_"),
    ("/", "_OR_"),
    ("'", ""),
    (".", "_"),
    ("_or_", "_OR_"),
    (" ", "_"),
    ("__", "_"),
)


def _normalize_column_name(name):
    """Apply the generic header clean-up to a header with no explicit alias."""
    text = name.strip()
    for old, new in _PRE_LOWER_REPLACEMENTS:
        text = text.replace(old, new)
    text = text.strip().lower()
    for old, new in _POST_LOWER_REPLACEMENTS:
        text = text.replace(old, new)
    return text


def rename_column(columns):
    """
    Renames the column names to a standardized format.

    Known headers are translated through an explicit alias table; every other
    header is normalized generically (trimmed, lower-cased, punctuation and
    spaces replaced with underscores, "&" -> "_AND_", "/" -> "_OR_").

    When several input columns end up with the same name, underscores are
    appended until the name is unique ("x", "x_", "x__", ...), so the returned
    names never contain duplicates.

    Args:
        columns (iterable): Column names to be renamed. Non-string labels
            (e.g. numbers) are converted with ``str()`` before normalization.

    Returns:
        list: Transformed column names, in input order. A column whose
        normalized name is empty is omitted, so the result can be shorter
        than the input.
    """
    seen = set()
    renamed_column = []

    for col in columns:
        # Only real strings can match an alias; anything else is stringified
        # and sent through the generic normalization.
        new_col = _COLUMN_ALIASES.get(col) if isinstance(col, str) else None
        if new_col is None:
            new_col = _normalize_column_name(str(col))

        if not new_col:
            continue

        # Keep suffixing until unique so a third (or later) duplicate does not
        # reuse the name already given to the second one.
        while new_col in seen:
            new_col += "_"

        seen.add(new_col)
        renamed_column.append(new_col)

    return renamed_column

def format_save_to_csv(dfs, csv_path_base, object_name):
    """
    Concatenate the given DataFrames and write the result to one CSV file.

    The CSV header is the output of `rename_column` applied to the columns of
    the concatenated frame; the row index is not written. The destination is
    `csv_path_base` with the "<obj>" placeholder replaced by `object_name`.

    Args:
        dfs (list): DataFrames to stack on top of each other.
        csv_path_base (str): Output path template containing "<obj>".
        object_name (str): Value substituted for "<obj>" in the file path.
    """
    combined = pd.concat(dfs, ignore_index=True)
    header = rename_column(combined.columns.values)

    destination = csv_path_base.replace("<obj>", object_name)
    combined.to_csv(destination, header=header, index=False)

    print(f"Saved data to {destination}")
    print("\n")

In [0]:
import pandas as pd
import os

# Only Excel workbooks are read. Anything else found in the directory is
# skipped, because pd.read_excel would fail on it; this matters when
# csv_path_base writes the output CSVs into the same directory.
excel_extensions = (".xlsx", ".xlsm", ".xls", ".xlsb")

files = [
    os.path.join(directory, file)
    for file in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, file))
    and file.lower().endswith(excel_extensions)
    # "~$name.xlsx" is the lock file Excel creates while a workbook is open.
    and not file.startswith("~$")
]

# One accumulator per object; each collects that object's sheets across all files.
property_dfs = []
cyber_dfs = []
crime_dfs = []
equipment_breakdown_dfs = []
inland_marine_dfs = []
epli_dfs = []
agricultural_dfs = []
liability_dfs = []
commercial_auto_dfs = []
umbrella_dfs = []
fee_dfs = []

# Lower-cased sheet name -> accumulator. Sheets with any other name are ignored.
sheet_targets = {
    "property": property_dfs,
    "cyber": cyber_dfs,
    "crime": crime_dfs,
    "equipment breakdown": equipment_breakdown_dfs,
    "inland marine": inland_marine_dfs,
    "epli": epli_dfs,
    "agricultural": agricultural_dfs,
    "liability": liability_dfs,
    "commercial auto": commercial_auto_dfs,
    "umbrella": umbrella_dfs,
    "fees": fee_dfs,
    "fee": fee_dfs,
}

# (object name used in the CSV file name, accumulator), in save order.
outputs = [
    ("property", property_dfs),
    ("cyber", cyber_dfs),
    ("crime", crime_dfs),
    ("equipment_breakdown", equipment_breakdown_dfs),
    ("inland_marine", inland_marine_dfs),
    ("epli", epli_dfs),
    ("agricultural", agricultural_dfs),
    ("liability", liability_dfs),
    ("commercial_auto", commercial_auto_dfs),
    ("umbrella", umbrella_dfs),
    ("fee", fee_dfs),
]

for f in files:
    # Skip workbooks whose *file name* mentions "epli". Testing the full path
    # would skip every file if a parent folder happened to contain "epli".
    if "epli" in os.path.basename(f).lower():
        continue

    # sheet_name=None loads every sheet into a dict; header=6 takes the
    # column names from the 7th row of each sheet.
    data = pd.read_excel(f, sheet_name=None, header=6)
    sheets = list(data)
    print(f"Processing file: {f}")
    print(f"Present sheets in file: {sheets}")

    for sheet_name, raw in data.items():
        row_count = len(raw)

        if row_count == 0:
            continue

        print(f"[{sheet_name}] row count: {row_count}")

        target = sheet_targets.get(sheet_name.lower())
        if target is not None:
            target.append(raw)

    print("\n")

# Write one CSV per object that received at least one non-empty sheet.
for object_name, dfs in outputs:
    if dfs:
        format_save_to_csv(dfs, csv_path_base, object_name)