In [1]:
from ydata_profiling import ProfileReport
from ydata_profiling.config import Settings
import argparse
import pandas as pd
import hashlib
import json

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

def hash_column(column):
    """Hash the values in a column using SHA-256."""
    return column.apply(lambda x: hashlib.sha256(str(x).encode()).hexdigest() if pd.notnull(x) else x)

def generate_profiling_report(path_to_csv, sensitive_columns=None) -> pd.DataFrame:
    """
    Generate a profiling report (in html) using ydata-profiling
    
    Args:
        path_to_csv (str): path to csv 
        sensitive_columns (list): List of column names to mark as sensitive
        
    Returns:
        DataFrame: DataFrame with profiling information including completeness metrics
    """
    try:    
        file_name = str.split(path_to_csv, "/")[-1].split(".")[0]
        data = pd.read_csv(path_to_csv)
        
        # Store the total record count
        total_records = len(data)
        
        # Automatically detect sensitive columns
        sensitive_keywords = ["bvn", "id number", "nin", "passport", "driver", "identificationnumber", "chn"]
        sensitive_columns = [
            col for col in data.columns 
            if any(keyword in col.lower() for keyword in sensitive_keywords)
        ]
        
        # Hash sensitive columns
        if sensitive_columns:
            for col in sensitive_columns:
                print(f"Hashing sensitive column: {col}")
                data[col] = hash_column(data[col])
        
        # Configure settings to mark sensitive columns
        config = Settings()
        if sensitive_columns:
            config.variables.descriptions = {col: "Sensitive Data (Hashed)" for col in sensitive_columns}
        
        # Generate profiling report    
        profile = ProfileReport(
            data,
            title=f"{file_name} Profiling Report",
            explorative=True,
            config=config)
        
        # Generate HTML report
        #profile.to_file(f'{file_name}.html')
        
        # Get JSON data and convert to CSV without saving the JSON file
        json_data = profile.to_json()
        
        # Extract variables data directly from the JSON and save to CSV
        variables_df = pd.DataFrame(json.loads(json_data)['variables'])
        
        # Add total record count to each row
        for col in variables_df.columns:
            variables_df[col]['total_records'] = total_records
            
            # Calculate completeness percentage if the necessary data exists
            if 'count' in variables_df[col] and 'n_missing' in variables_df[col]:
                non_missing = variables_df[col]['count'] - variables_df[col]['n_missing']
                variables_df[col]['p_completeness'] = round((non_missing / total_records) * 100, 2)
            else:
                variables_df[col]['p_completeness'] = None
        
        #variables_df.transpose().to_csv(f'{file_name}_variables.csv', index=True)
        
        print(f"Variables statistics have successfully been copied into {file_name}_variables.csv file")
        
        return variables_df
        
    except FileNotFoundError:
        print(f"Error: File not found at the path '{path_to_csv}'. Please check the path and try again.")
    except pd.errors.EmptyDataError:
        print(f"Error: The file at '{path_to_csv}' is empty.")
    except pd.errors.ParserError:
        print(f"Error: There was an issue parsing the file at '{path_to_csv}'. Is it a valid CSV?")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [23]:
dummy_data = "../data/dummy/dummy-data/asset_custdata.csv"

In [24]:
# Get file name from the argument passed
file_name = str.split(dummy_data, "/")[-1].split(".")[0]

In [25]:
#Generate report
var_df = generate_profiling_report(dummy_data)

Hashing sensitive column: CustomerBVN
Hashing sensitive column: ID Number


Summarize dataset: 100%|██████████| 15/15 [00:00<00:00, 19.67it/s, Completed]                       
Render JSON: 100%|██████████| 1/1 [00:00<00:00, 21.46it/s]

Variables statistics have successfully been copied into asset_custdata_variables.csv file





In [29]:
var_df = var_df.T

In [31]:
# Add this line to reset the index and create a new column with the column names
var_df = var_df.reset_index().rename(columns={'index': 'column_name'})

In [32]:
var_df

Unnamed: 0,column_name,n_distinct,p_distinct,is_unique,n_unique,p_unique,type,hashable,value_counts_without_nan,value_counts_index_sorted,...,block_alias_counts,n_block_alias,block_alias_char_counts,script_counts,n_scripts,script_char_counts,category_alias_counts,n_category,category_alias_char_counts,word_counts
0,CustAID,216,1.0,True,216,1.0,Categorical,True,"{'bdd640fb-0667-4ad1-9c80-317fa3b1799d': 1, '1...","{'01d74256-3860-4ab6-96a4-02f23ae8cc93': 1, '0...",...,{'ASCII': 7776},1,"{'ASCII': {'-': 864, '4': 604, 'a': 487, '8': ...","{'Common': 5184, 'Latin': 2592}",2,"{'Common': {'-': 864, '4': 604, '8': 469, '9':...","{'Decimal Number': 4320, 'Lowercase Letter': 2...",3,"{'Dash_Punctuation': {'-': 864}, 'Decimal_Numb...","{'bdd640fb-0667-4ad1-9c80-317fa3b1799d': 1, '1..."
1,CustomerBVN,213,0.986111,False,212,0.981481,Categorical,True,{'3973e022e93220f9212c18d0d0c543ae7c309e46640d...,{'02003e343641bc47361af297a6adcf04bb9a0d36f75c...,...,{'ASCII': 13824},1,"{'ASCII': {'4': 899, '8': 891, 'd': 890, '5': ...","{'Common': 8620, 'Latin': 5204}",2,"{'Common': {'4': 899, '8': 891, '5': 887, '3':...","{'Decimal Number': 8620, 'Lowercase Letter': 5...",2,"{'Decimal_Number': {'4': 899, '8': 891, '5': 8...",{'3973e022e93220f9212c18d0d0c543ae7c309e46640d...
2,ID Number,189,0.875,False,164,0.759259,Categorical,True,{'a7937b64b8caa58f03721bb6bacf5c78cb235febe0e7...,{'0282d9b79f42c74c1550b20ff2dd16aafc3fe5d8ae9a...,...,{'ASCII': 13824},1,"{'ASCII': {'0': 967, '3': 910, 'c': 905, 'e': ...","{'Common': 8666, 'Latin': 5158}",2,"{'Common': {'0': 967, '3': 910, '7': 882, '8':...","{'Decimal Number': 8666, 'Lowercase Letter': 5...",2,"{'Decimal_Number': {'0': 967, '3': 910, '7': 8...",{'a7937b64b8caa58f03721bb6bacf5c78cb235febe0e7...
3,Gender,2,0.009259,False,0,0.0,Categorical,True,"{'Female': 123, 'Male': 93}","{'Female': 123, 'Male': 93}",...,{'ASCII': 1110},1,"{'ASCII': {'e': 339, 'a': 216, 'l': 216, 'F': ...",{'Latin': 1110},1,"{'Latin': {'e': 339, 'a': 216, 'l': 216, 'F': ...","{'Lowercase Letter': 894, 'Uppercase Letter': ...",2,"{'Lowercase_Letter': {'e': 339, 'a': 216, 'l':...","{'female': 123, 'male': 93}"
4,Occupation,187,0.865741,False,161,0.74537,Categorical,True,"{'Consulting civil engineer': 4, 'Pilot, airli...","{'Academic librarian': 1, 'Accommodation manag...",...,{'ASCII': 4366},1,"{'ASCII': {'e': 459, 'r': 384, 'i': 375, 'a': ...","{'Latin': 4000, 'Common': 366}",2,"{'Latin': {'e': 459, 'r': 384, 'i': 375, 'a': ...","{'Lowercase Letter': 3774, 'Space Separator': ...",6,"{'Lowercase_Letter': {'e': 459, 'r': 384, 'i':...","{'engineer': 25, 'manager': 18, 'officer': 15,..."
5,Account Officer,216,1.0,True,216,1.0,Categorical,True,"{'Carol Colon': 1, 'Michael Sloan': 1, 'Kristi...","{'Aaron Carlson': 1, 'Alison Williams': 1, 'Am...",...,{'ASCII': 2865},1,"{'ASCII': {'e': 260, 'a': 257, ' ': 228, 'n': ...","{'Latin': 2631, 'Common': 234}",2,"{'Latin': {'e': 260, 'a': 257, 'n': 219, 'r': ...","{'Lowercase Letter': 2182, 'Uppercase Letter':...",4,"{'Lowercase_Letter': {'e': 260, 'a': 257, 'n':...","{'michael': 8, 'matthew': 6, 'smith': 6, 'mr':..."


In [None]:
    
print(f"Generated profiling report and saved to {file_name}.html.")