In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Path to the file
file_path = '/data/chats/igk4wd/workspace/uploads/Teste Power BI - Deveras RH.xlsx'

# List all sheets in the Excel file
excel_file = pd.ExcelFile(file_path)
sheet_names = excel_file.sheet_names
print("Excel sheets:", sheet_names)

# Create a dictionary to store dataframes from each sheet
dfs = {}

# Read each sheet and store in the dictionary
for sheet in sheet_names:
    dfs[sheet] = pd.read_excel(file_path, sheet_name=sheet)
    print(f"\n--- Sheet: {sheet} ---")
    print(f"Shape: {dfs[sheet].shape}")
    print("Column names:", list(dfs[sheet].columns))
    print("Data types:\n", dfs[sheet].dtypes)
    
    # Display first few rows
    print("First 3 rows:\n", dfs[sheet].head(3))
    
    # Check for missing values
    missing_values = dfs[sheet].isnull().sum()
    if missing_values.sum() > 0:
        print("Missing values:\n", missing_values[missing_values > 0])
    else:
        print("No missing values found.")
    
    # Check for duplicates
    duplicates = dfs[sheet].duplicated().sum()
    print(f"Number of duplicate rows: {duplicates}")

Excel sheets: ['BASE DE DADOS', 'Orientações', 'Alphaville', 'BH', 'SJC', 'PAULINIA', 'CAMPINAS']

--- Sheet: BASE DE DADOS ---
Shape: (16, 2)
Column names: ['CÓDIGO DO CLIENTE:', 'Unnamed: 1']
Data types:
 CÓDIGO DO CLIENTE:     object
Unnamed: 1            float64
dtype: object
First 3 rows:
           CÓDIGO DO CLIENTE:  Unnamed: 1
0  NOME COMPLETO DO CLIENTE:         NaN
1                       CPF:         NaN
2                  ENDEREÇO:         NaN
Missing values:
 Unnamed: 1    12
dtype: int64
Number of duplicate rows: 0

--- Sheet: Orientações ---
Shape: (17, 2)
Column names: ['Unnamed: 0', 'Unnamed: 1']
Data types:
 Unnamed: 0    object
Unnamed: 1    object
dtype: object
First 3 rows:
   Unnamed: 0 Unnamed: 1
0        NaN        NaN
1        NaN        NaN
2        NaN        NaN
Missing values:
 Unnamed: 0     9
Unnamed: 1    11
dtype: int64
Number of duplicate rows: 8

--- Sheet: Alphaville ---
Shape: (1651, 45)
Column names: ['Nº CONTRATO (SISTEMA GERA)', 'STATUS', 'CÓD CL

In [2]:
# Now let's continue our analysis by focusing on the remaining sheets
# and convert the VALOR column to numeric for analysis

# Let's look at the remaining sheets in more depth
location_sheets = ['Alphaville', 'BH', 'SJC', 'PAULINIA', 'CAMPINAS']

# Dictionary to store cleaned data frames
cleaned_dfs = {}

for sheet in location_sheets:
    if sheet in dfs:
        print(f"\n--- Detailed analysis of {sheet} sheet ---")
        
        # Convert VALOR column to numeric (removing any currency symbols, etc.)
        if 'VALOR' in dfs[sheet].columns:
            # First, check the current format
            print(f"VALOR column sample values: {dfs[sheet]['VALOR'].head(3).tolist()}")
            
            # Try to convert to numeric, coercing errors to NaN
            dfs[sheet]['VALOR_NUMERIC'] = pd.to_numeric(
                dfs[sheet]['VALOR'].str.replace('R\$', '').str.replace('.', '').str.replace(',', '.'),
                errors='coerce'
            )
            
            # Check if the conversion worked
            print(f"Converted VALOR sample: {dfs[sheet]['VALOR_NUMERIC'].head(3).tolist()}")
            print(f"NaN values after conversion: {dfs[sheet]['VALOR_NUMERIC'].isna().sum()}")
        
        # Check for data consistency between sheets
        print(f"Column count: {dfs[sheet].shape[1]}")
        
        # Analyze key statistics
        numeric_columns = dfs[sheet].select_dtypes(include=[np.number])
        if not numeric_columns.empty:
            print("\nNumeric columns statistics:")
            print(numeric_columns.describe().T)
        
        # Analyze categorical columns
        categorical_columns = dfs[sheet].select_dtypes(include=['object'])
        if not categorical_columns.empty:
            print("\nCategorical columns unique values count:")
            for col in categorical_columns.columns[:5]:  # Limiting to first 5 columns to avoid too much output
                print(f"{col}: {categorical_columns[col].nunique()} unique values")
                print(f"Top 3 values: {categorical_columns[col].value_counts().nlargest(3)}")
        
        # Check date columns for temporal range
        date_columns = dfs[sheet].select_dtypes(include=['datetime64'])
        if not date_columns.empty:
            print("\nDate ranges:")
            for col in date_columns.columns:
                min_date = date_columns[col].min()
                max_date = date_columns[col].max()
                print(f"{col}: {min_date} to {max_date}")
        
        # Store the dataframe with converted values
        cleaned_dfs[sheet] = dfs[sheet]

# Check for inconsistencies between location sheets
print("\n--- Checking inconsistencies between sheets ---")
if len(location_sheets) > 1 and all(sheet in dfs for sheet in location_sheets):
    # Check if all sheets have the same columns
    all_columns = [set(dfs[sheet].columns) for sheet in location_sheets]
    common_columns = set.intersection(*all_columns)
    
    print(f"Common columns across all location sheets: {len(common_columns)}")
    
    # Check for differences in columns
    for sheet in location_sheets:
        diff_columns = set(dfs[sheet].columns) - common_columns
        if diff_columns:
            print(f"{sheet} has {len(diff_columns)} unique columns: {diff_columns}")


--- Detailed analysis of Alphaville sheet ---
VALOR column sample values: [41545.78, 54270.99, 43074.45]
Converted VALOR sample: [nan, nan, nan]
NaN values after conversion: 1651
Column count: 46

Numeric columns statistics:
                             count          mean           std      min  \
Nº CONTRATO (SISTEMA GERA)  1651.0    826.000000    476.746963      1.0   
CÓD CLIENTE                 1651.0   8260.000000   4767.469629     10.0   
 NF                         1646.0   4019.770960    596.754145   2965.0   
QUINZENA                    1647.0      1.485124      0.499930      1.0   
Nº SESSÕES                  1651.0     14.933374      3.530188      9.0   
VALOR FINANCIADO            1651.0  47518.230999  21057.175113   3000.0   
VALOR PAGAMENTO FACTORING      9.0  37654.580000  14857.632211  10000.0   
VALOR_NUMERIC                  0.0           NaN           NaN      NaN   

                                 25%       50%        75%        max  
Nº CONTRATO (SISTEMA GERA) 

In [3]:
# Fix the VALOR conversion issue and complete the analysis of all sheets
import re

# Function to properly convert VALOR to numeric
def convert_to_numeric(value):
    if pd.isna(value):
        return np.nan
    if isinstance(value, (int, float)):
        return value
    
    # Remove non-numeric characters except for decimal separator
    # First, standardize the value format
    value_str = str(value)
    # Remove currency symbols, spaces, etc.
    cleaned = re.sub(r'[^\d,.]', '', value_str)
    # Convert Brazilian/European format (1.234,56) to US format (1234.56)
    if ',' in cleaned and '.' in cleaned:
        cleaned = cleaned.replace('.', '').replace(',', '.')
    elif ',' in cleaned:
        cleaned = cleaned.replace(',', '.')
    
    try:
        return float(cleaned)
    except:
        return np.nan

# Create a comprehensive comparison of all location sheets
location_sheets = ['Alphaville', 'BH', 'SJC', 'PAULINIA', 'CAMPINAS']
summary_data = {}

# Process each sheet
for sheet in location_sheets:
    if sheet in dfs:
        df = dfs[sheet].copy()
        
        print(f"\n--- Processing {sheet} sheet ---")
        
        # Convert VALOR to numeric
        if 'VALOR' in df.columns:
            # Apply the conversion function
            df['VALOR_NUMERIC'] = df['VALOR'].apply(convert_to_numeric)
            print(f"Converted VALOR sample: {df['VALOR_NUMERIC'].head(3).tolist()}")
            print(f"NaN values after conversion: {df['VALOR_NUMERIC'].isna().sum()}")
            
            # Basic statistics on VALOR
            if df['VALOR_NUMERIC'].notna().any():
                print(f"VALOR statistics: Min={df['VALOR_NUMERIC'].min()}, Max={df['VALOR_NUMERIC'].max()}, Mean={df['VALOR_NUMERIC'].mean()}")
        
        # Check for inconsistent column names (case, spacing, etc.)
        column_names = df.columns.tolist()
        cleaned_column_names = [col.strip().upper() for col in column_names]
        if len(set(cleaned_column_names)) != len(column_names):
            print("Warning: Potential inconsistent column naming (case or spacing)")
            duplicates = [col for col in cleaned_column_names if cleaned_column_names.count(col) > 1]
            if duplicates:
                print(f"Duplicate columns after normalization: {set(duplicates)}")
        
        # Check for inconsistencies in categorical data
        if 'UNIDADE' in df.columns:
            print(f"UNIDADE values in {sheet}: {df['UNIDADE'].unique()}")
        
        # Store sheet-specific data for summary
        summary_data[sheet] = {
            'row_count': df.shape[0],
            'column_count': df.shape[1],
            'missing_values': df.isnull().sum().sum(),
            'date_range': None
        }
        
        # Get date range if available
        date_cols = df.select_dtypes(include=['datetime64']).columns
        if len(date_cols) > 0:
            min_date = df[date_cols].min().min()
            max_date = df[date_cols].max().max()
            summary_data[sheet]['date_range'] = (min_date, max_date)
        
        # Store the cleaned dataframe
        cleaned_dfs[sheet] = df

# Summarize the findings
print("\n--- Summary of Data Analysis ---")
print(f"Total sheets analyzed: {len(location_sheets)}")
for sheet, data in summary_data.items():
    print(f"\n{sheet}:")
    print(f"  - Records: {data['row_count']}")
    print(f"  - Fields: {data['column_count']}")
    print(f"  - Missing values: {data['missing_values']}")
    if data['date_range']:
        print(f"  - Date range: {data['date_range'][0]} to {data['date_range'][1]}")

# Identify structure inconsistencies across sheets
print("\n--- Structural Inconsistencies ---")
all_columns = [set(dfs[sheet].columns) for sheet in location_sheets if sheet in dfs]
if all_columns:
    common_columns = set.intersection(*all_columns)
    all_columns_flat = set().union(*all_columns)
    
    print(f"Total unique columns across all sheets: {len(all_columns_flat)}")
    print(f"Common columns across all sheets: {len(common_columns)}")
    
    # Find columns unique to specific sheets
    for i, sheet in enumerate(location_sheets):
        if sheet in dfs:
            unique_cols = set(dfs[sheet].columns) - common_columns
            if unique_cols:
                print(f"{sheet} has {len(unique_cols)} unique columns: {unique_cols}")
    
    # Check type inconsistencies in common columns
    type_inconsistencies = []
    for col in common_columns:
        col_types = {}
        for sheet in location_sheets:
            if sheet in dfs and col in dfs[sheet].columns:
                col_type = str(dfs[sheet][col].dtype)
                if col_type not in col_types:
                    col_types[col_type] = []
                col_types[col_type].append(sheet)
        
        if len(col_types) > 1:
            type_inconsistencies.append((col, col_types))
    
    if type_inconsistencies:
        print("\nType inconsistencies in common columns:")
        for col, types in type_inconsistencies:
            print(f"Column '{col}' has different types:")
            for type_name, sheets in types.items():
                print(f"  - {type_name}: in sheets {', '.join(sheets)}")


--- Processing Alphaville sheet ---
Converted VALOR sample: [41545.78, 54270.99, 43074.45]
NaN values after conversion: 21
VALOR statistics: Min=3000.0, Max=124526.12, Mean=47585.67782822086
UNIDADE values in Alphaville: ['ALPHAVILLE' 'alpha ']

--- Processing BH sheet ---
Converted VALOR sample: [51231.24, 47328.33, 57888.45]
NaN values after conversion: 52
VALOR statistics: Min=8.0, Max=84354.34, Mean=28020.540530971128
UNIDADE values in BH: ['BELO HORIZONTE' 'OSASCO' 'ALPHAVILLE' 'BSB']

--- Processing SJC sheet ---
Converted VALOR sample: [20140120092136.0, 20160529185736.0, 20160829092136.0]
NaN values after conversion: 18
VALOR statistics: Min=2600.0, Max=26030602074048.0, Mean=11384180980684.389
UNIDADE values in SJC: ['DOMICILIAR' 'SJC' 'MORUMBI' 'LINS']

--- Processing PAULINIA sheet ---
Converted VALOR sample: [78661.77, 55947.52, 54432.92]
NaN values after conversion: 10
VALOR statistics: Min=6167.61, Max=21600212072624.0, Mean=1943374697564.2173
UNIDADE values in PAULINIA: