
The notebook cells below are meant to be run on a local machine.

Data cleansing process:
1. Read each Excel file (.xlsx) with pandas
2. Drop rows whose first column is null (usually the first row for each object)
3. Loop through each column and replace embedded newlines ('\n') with ', ' (comma followed by a space)
4. Concatenate the list of DataFrames into one DataFrame per object
5. Get the column names (from the concatenated DataFrame) and rename them (using the function `rename_column`)
6. Save the final DataFrame as CSV, using the renamed columns as the new header

In [0]:
def rename_column(columns):
    """
    Renames the column names to a standardized snake_case format.

    Known headers are translated through an explicit lookup table. Any other
    header is normalized generically: surrounding whitespace is trimmed,
    "Y/N" and " &" are dropped, slashes and dashes (with or without adjacent
    spaces) become a single underscore, parentheses are removed, "%" becomes
    "perc", remaining spaces become underscores and the result is lower-cased.

    Args:
        columns (iterable): Column labels to be renamed. Non-string labels
            are converted with ``str()`` before being normalized.

    Returns:
        list: Transformed column names, in the same order as the input.
    """

    # Headers whose target name cannot be derived by the generic rules below
    # (CamelCase names, pandas ".1" duplicate suffixes, "#" prefixes, ...).
    explicit_names = {
        "Policy #": "policy_number",
        "Policy": "policy_number",
        "RateMethod": "rate_method",
        "State.1": "state_1",
        "StatusTypeDescription": "status_type_description",
        "PolicyTypeDescription": "policy_type_description",
        "TransactionType": "transaction_type",
        "PriorPolicyNumber": "prior_policy_number",
        "InceptionDate": "inception_date",
        "ExpirationDate": "expiration_date",
        "EndorsementNumber": "endorsement_number",
        "TransactionEffectiveDate": "transaction_effective_date",
        "RatingCompany": "rating_company",
        "InsuredName": "insured_name",
        "AddressLine1": "address_line_1",
        "AddressLine2": "address_line_2",
        "ZipCode": "zip_code",
        "MGUCode": "mgu_code",
        "AuditFreq": "audit_freq",
        # NOTE(review): "CommissionPrec" looks like a source-side spelling of
        # "CommissionPerc"; both are kept as separate output names on purpose.
        "CommissionPrec": "commission_prec",
        "GrossPremium": "gross_premium",
        "OccLmt": "occ_lmt",
        "AggLmt": "agg_lmt",
        "ClassCode": "class_code",
        "PolType": "pol_type",
        "RateMod": "rate_mod",
        "ProRate": "pro_rate",
        "#Prem": "num_prem",
        "#RatEmp": "num_rat_emp",
        "TotEmp": "tot_emp",
        "CoverageDescription": "coverage_description",
        "PolicyNumber": "policy_number",
        "CommissionPerc": "commission_perc",
        "City.1": "city_1",
        "ChangeLongDesc": "change_long_desc",
        "Type of Coverage.1": "type_of_coverage_1",
        "Wind Coverage.1": "wind_coverage_1",
    }

    # Ordered substitutions for the generic fallback. Order matters: the
    # space-padded variants of "/" and "-" must run BEFORE the bare character,
    # otherwise the bare replacement consumes the separator first and the
    # leftover space later turns into a second underscore ("A /B" -> "a__b").
    substitutions = (
        ("Y/N", ""),
        (" &", ""),
        (" / ", "_"),
        (" /", "_"),
        ("/ ", "_"),
        ("/", "_"),
        (" - ", "_"),
        (" -", "_"),
        ("- ", "_"),
        ("-", "_"),
        ("(", ""),
        (")", ""),
        ("%", "perc"),
    )

    renamed_column = []

    for col in columns:
        if isinstance(col, str):
            if col in explicit_names:
                renamed_column.append(explicit_names[col])
                continue
            name = col
        else:
            # Labels read from a spreadsheet are not guaranteed to be strings;
            # fall back to their text form instead of failing on .strip().
            name = str(col)

        name = name.strip()
        for old, new in substitutions:
            name = name.replace(old, new)

        # Removing "Y/N" / " &" can leave trailing blanks, so trim again
        # before converting the remaining spaces to underscores.
        renamed_column.append(name.strip().replace(" ", "_").lower())

    return renamed_column

In [0]:
import pandas as pd
import os

# Year tag used only in the output file names.
year = '2019'
# Folder holding the raw Excel workbooks; every regular file in it is read.
directory = 'C:\\Bordereaux\\raw'
files = [os.path.join(directory, file) for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]

# Sheet names to extract from every workbook; one CSV is written per object.
objects = ["property", "crime", "inland marine", "agricultural", "liability"]

# Cleaned DataFrames collected per object across all files.
cleaned_by_object = {obj: [] for obj in objects}

for f in files:

    # Read every sheet of the workbook ONCE (sheet_name=None returns a dict of
    # DataFrames keyed by sheet name) instead of re-parsing the whole file for
    # each object.
    # header=6 is used to skip the first 6 rows which are not part of the data
    data = pd.read_excel(f, sheet_name=None, header=6)

    for obj in objects:
        raw = data.get(obj)
        if raw is None:
            # A workbook without this sheet is skipped rather than aborting
            # the whole run with a KeyError.
            print(f"Sheet [{obj}] not found in {f}")
            continue

        row_count = len(raw)
        if row_count == 0:
            print(f"No data found for [{obj}] in {f}")
            continue

        print(f"count: {row_count} - [{obj}] - {f}")
        cols = raw.columns
        # remove the rows whose first column is NaN
        cleaned = raw.drop(raw[raw[cols[0]].isna()].index)

        # transform the multi-line values to single line (comma separated)
        # NOTE(review): astype(str) also turns missing values in the other
        # columns into the literal text "nan" in the CSV - confirm that the
        # downstream consumer expects this.
        for col in cols:
            cleaned[col] = cleaned[col].astype(str).str.replace("\n", ", ", regex=False)

        # add the cleaned data to the list of dataframes for this object
        cleaned_by_object[obj].append(cleaned)

for obj in objects:
    dfs = cleaned_by_object[obj]
    if not dfs:
        # pd.concat raises ValueError on an empty list, so skip the export
        # when no file contributed rows for this object.
        print(f"No data collected for [{obj}]; skipping CSV export")
        print("\n\n")
        continue

    # Stack the per-file frames into a single frame with a fresh 0..n-1 index.
    df = pd.concat(dfs, ignore_index=True)

    columns = df.columns.values
    renamed_columns = rename_column(columns)

    csv_path = f"C:\\Bordereaux\\bronze\\{obj.replace(' ', '_')}-bordereaux-{year}.csv"

    # header=<list> writes the renamed names as the CSV header row without
    # renaming the columns of df itself.
    df.to_csv(csv_path, header=renamed_columns, index=False)
    print(f"Saved {obj} data to {csv_path}")
    print(f"Total rows for {obj}: {len(df)}")
    print("\n\n")