In [10]:
import os
import shutil
import pandas as pd


In [11]:
# --- Folder layout -------------------------------------------------------
# The notebook is expected to run from the scripts folder, which sits one
# level below the project folder, so the project root is simply the parent
# of the current working directory.
current_directory = os.getcwd()
project_path = os.path.dirname(current_directory)

# Folder names, relative to the project root
raw_data_folder = "rawdata"  # where all the raw csv files live
output_folder = "profiling_reports"  # where the generated profile reports go

# Full paths derived from the folder names above
raw_path, profile_report_path = (
    os.path.join(project_path, folder)
    for folder in (raw_data_folder, output_folder)
)

# Ensure the report folder is present; an existing folder is not an error
os.makedirs(profile_report_path, exist_ok=True)

In [12]:
# Name of the primary file. The profiling loop matches this exact filename
# in addition to the NH*.csv files, so it is profiled too (not skipped).
primary_file = "PBJ_Daily_Nurse_Staffing_Q2_2024.csv"

In [13]:
# Profile every NH*.csv file (plus the primary file) found in raw_path and
# write one plain-text report per file into profile_report_path.
# Reads: raw_path, primary_file, profile_report_path.
# A failure on one file is printed and the loop moves on to the next file.
for filename in os.listdir(raw_path):
    if (filename.startswith('NH') and filename.endswith(".csv")) or filename == primary_file:
        filepath = os.path.join(raw_path, filename)
        print(f"Processing: {filename}")

        try:
            df = pd.read_csv(filepath, encoding='ISO-8859-1', low_memory=False)  # Use encoding ISO-8859-1 since it's more broad

            # Check for duplicate column names.
            # read_csv renames repeated header names while parsing
            # ('X', 'X.1', ...), so df.columns never contains duplicates.
            # Read the header line as a plain data row to see the names
            # exactly as they appear in the file.
            raw_header = pd.read_csv(
                filepath,
                encoding='ISO-8859-1',
                header=None,
                nrows=1,
                dtype=str,
                keep_default_na=False,  # keep names such as "NA" as text
            ).iloc[0]
            # Blank header cells are not real names; read_csv labels each of
            # them uniquely ("Unnamed: N"), so don't report them as duplicates.
            raw_header = raw_header[raw_header != ""]
            duplicate_cols = raw_header[raw_header.duplicated()].tolist()

            # Check for duplicate rows (every repeat after the first occurrence)
            duplicated_rows = df[df.duplicated()]

            # Build the report as a list of sections, joined when written
            report = []

            report.append(f"=== File: {filename} ===")
            report.append(f"Shape: {df.shape}")
            report.append("\n--- First 5 Rows ---")
            report.append(df.head(5).to_string(index=False))
            report.append("\n--- Column Info ---")
            report.append(df.dtypes.to_string())

            report.append("\n--- Duplicate Columns ---")
            if duplicate_cols:
                report.append(f"{duplicate_cols}")
            else:
                report.append("None")

            report.append("\n--- Duplicate Rows ---")
            if duplicated_rows.shape[0] > 0:
                report.append(f"{duplicated_rows.shape[0]} duplicate rows\n")
                report.append(f"{duplicated_rows.head(5).to_string(index=False)}")
            else:
                report.append("None")

            report.append("\n--- Missing Values ---")
            report.append(df.isnull().sum().to_string())
            report.append("\n--- Describe (transposed) ---")
            report.append(df.describe(include='all').T.to_string())

            # Save to text file. splitext swaps only the final extension,
            # whereas str.replace would rewrite every ".csv" in the name.
            report_name = os.path.splitext(filename)[0] + "_profile.txt"
            output_path = os.path.join(profile_report_path, report_name)
            with open(output_path, "w", encoding='utf-8') as f:
                f.write("\n\n".join(report))

        except Exception as e:
            # Best-effort batch: report the failure and continue with the
            # remaining files instead of aborting the whole run.
            print(f"Error reading {filename}: {e}")

Processing: NH_Penalties_Oct2024.csv
Processing: NH_SurveySummary_Oct2024.csv
Processing: NH_SurveyDates_Oct2024.csv
Processing: NH_CitationDescriptions_Oct2024.csv
Processing: NH_HlthInspecCutpointsState_Oct2024.csv
Processing: NH_FireSafetyCitations_Oct2024.csv
Processing: NH_ProviderInfo_Oct2024.csv
Processing: NH_Ownership_Oct2024.csv
Processing: PBJ_Daily_Nurse_Staffing_Q2_2024.csv
Processing: NH_StateUSAverages_Oct2024.csv
Processing: NH_CovidVaxProvider_20241027.csv
Processing: NH_QualityMsr_MDS_Oct2024.csv
Processing: NH_HealthCitations_Oct2024.csv
Processing: NH_DataCollectionIntervals_Oct2024.csv
Processing: NH_QualityMsr_Claims_Oct2024.csv
Processing: NH_CovidVaxAverages_20241027.csv
