In [83]:
# Import required libraries and sensitive user data (connection credentials and SQL queries)
import pandas as pd
import sweetviz as sv
import warnings
from sqlalchemy import create_engine
from sql.postgres_connection import (dbname, password, host, port, database)
from sql.sql_queries import (sql_query)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

# Ignore every FutureWarning raised from this point on
warnings.simplefilter("ignore", FutureWarning)

# Database connection function
def read_data_from_postgres(query):
    """Run ``query`` against the configured PostgreSQL database.

    The connection URL is assembled from the ``dbname``, ``password``,
    ``host``, ``port`` and ``database`` values imported from
    ``sql.postgres_connection``.

    NOTE(review): ``dbname`` is placed in the *username* slot of the URL
    (``postgresql://<user>:<password>@<host>:<port>/<database>``) -- confirm
    that the imported value really is the login role.

    Parameters
    ----------
    query
        Query handed unchanged to ``pandas.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pandas.read_sql_query`` returns for ``query``.

    Raises
    ------
    Exception
        Any error raised while creating the engine or running the query is
        printed and then re-raised unchanged. (Previously the error was
        swallowed and the function failed afterwards with an unrelated
        ``UnboundLocalError`` on ``return df``.)
    """
    engine = None
    try:
        engine = create_engine(f'postgresql://{dbname}:{password}@{host}:{port}/{database}')
        return pd.read_sql_query(query, engine)
    except Exception as e:
        print("An error occurred:", e)
        # Propagate the real failure instead of falling through to an unbound result.
        raise
    finally:
        # Release the pooled connections whether or not the query succeeded.
        if engine is not None:
            engine.dispose()

# Fetch the query result once; `df_original` stays untouched so that
# later cells can rebuild a fresh working frame from it.
df_original = read_data_from_postgres(sql_query)
df = df_original.copy()

# Summary statistics of the loaded frame
df.describe()

Unnamed: 0,codigo_grupo,codigo_classe,codigo_pdm,codigo_item
count,314843.0,314843.0,314843.0,314843.0
mean,65.068377,6535.329698,9001.508183,339902.489546
std,14.42089,1441.112417,6700.744336,111031.313182
min,10.0,1005.0,1.0,19.0
25%,61.0,6145.0,2122.0,262789.5
50%,65.0,6515.0,9119.0,341501.0
75%,71.0,7110.0,14269.0,420211.5
max,99.0,9999.0,30138.0,611192.0


In [None]:
# SweetViz setup
# FeatureConfig takes skip, force_cat, force_num and force_text;
# here only an empty `skip` tuple is passed.
feature_config = sv.FeatureConfig(skip=())
my_report = sv.analyze(df, feat_cfg=feature_config)

# Export the report to an HTML file and open it in a browser
html_report_options = {
    'filepath': 'sweetviz_report.html',
    'open_browser': True,
    'layout': 'vertical',
    'scale': None,
}
my_report.show_html(**html_report_options)

# # Alternative: render the report inside the notebook
# my_report.show_notebook(w="100%",
#                         h=None,
#                         scale=None,
#                         layout='vertical',
#                         filepath=None)

In [100]:
# Start from the pristine copy so this cell can be re-run safely
df = df_original.copy()

# Build one hierarchical string key per level: each key is the parent's key
# followed by the level's own code, left-padded with zeros to a fixed width
# (codigo_classe: 4 digits, codigo_pdm: 5 digits, codigo_item: 6 digits).
# codigo_grupo is the top level, so its values are copied as they are.
df['codigo_grupo_norm'] = df['codigo_grupo'].copy()
parent_key = df['codigo_grupo'].astype(str)
for level, width in (('codigo_classe', 4), ('codigo_pdm', 5), ('codigo_item', 6)):
    parent_key = parent_key + df[level].astype(str).str.rjust(width, '0')
    df[f'{level}_norm'] = parent_key

# Replace every '_norm' column with its ordinal code, using one
# independently fitted OrdinalEncoder per column
norm_columns = ['codigo_grupo_norm', 'codigo_classe_norm', 'codigo_pdm_norm', 'codigo_item_norm']
for column in norm_columns:
    df[column] = OrdinalEncoder().fit_transform(df[[column]])

# Other quick checks that can be swapped in for the preview below:
# df.select_dtypes(include=['int']).head(10)
# df.loc[3].head(10)
# df.describe()

# Preview only the four encoded columns
df[norm_columns].head(10)

Unnamed: 0,codigo_grupo_norm,codigo_classe_norm,codigo_pdm_norm,codigo_item_norm
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,2.0
3,0.0,0.0,0.0,3.0
4,0.0,0.0,0.0,4.0
5,0.0,0.0,0.0,5.0
6,0.0,0.0,0.0,6.0
7,0.0,0.0,0.0,7.0
8,0.0,0.0,0.0,8.0
9,0.0,0.0,0.0,9.0
