In [1]:
import pandas as pd
import os

In [2]:
# Locate the project root (the parent of the current working directory) and
# derive the raw / preprocessed / processed locations of the BO2CS table.
current_path = os.getcwd()
base_path = os.path.abspath(os.path.join(current_path, os.pardir))
bo2cs_file_path = os.path.join(base_path, 'data', 'raw', 'tbl_bo2cs.txt')
bo2cs_preprocessed_path = os.path.join(base_path, 'data', 'preprocessed', 'tbl_bo2cs.csv')
bo2cs_processed_path = os.path.join(base_path, 'data', 'processed', 'tbl_bo2cs.csv')

# Echo every resolved path, one per line, in the order they were derived.
print(
    current_path,
    base_path,
    bo2cs_file_path,
    bo2cs_preprocessed_path,
    bo2cs_processed_path,
    sep='\n',
)


c:\Users\sepujas\Dev\mat\notebooks
c:\Users\sepujas\Dev\mat
c:\Users\sepujas\Dev\mat\data\raw\tbl_bo2cs.txt
c:\Users\sepujas\Dev\mat\data\preprocessed\tbl_bo2cs.csv
c:\Users\sepujas\Dev\mat\data\processed\tbl_bo2cs.csv


In [3]:
# Load the processed BO2CS table (pipe-delimited, Latin-1 encoded).
# NOTE(review): no dtype is passed, so pandas infers column types again on
# load; the dtypes listing further down shows SOrg / Sales_Doc / Material as
# int64 and the date columns as object — confirm that is what is wanted.
df_bo2cs = pd.read_csv(
    bo2cs_processed_path,
    sep='|',
    skiprows=0,
    encoding='latin1',
)


Unnamed: 0,SOrg,SaTy,Sales_Doc,Sold_to,Name_1,Ship_to,Name_11,CustLoy,FocCust,Territory,...,BOstatus,BO,BO_status_changed,GM,unc,Description,SC,Route,Reqdlvdt,key_material
0,8650,ZRO,7210671323,15919708,HUELQUEN SA,15919708,HUELQUEN SA,,,TCL5100010,...,Unproc,X,,A,,REPAIR PARTS,0F,ZCL913,2017-12-20,8650/301998
1,8650,ZRO,7211112567,15917697,EMPRESA CONSTRUCTORA BRAVO E,24583539,242 EDIFICIO CONTEMPORA,,X,TCL3400012,...,Unproc,X,,A,,REPAIR PARTS,0E,ZCL915,2018-02-16,8650/203086
2,8650,ZRO,7211177692,23023419,CONSORCIO VALLE HERMOSO S.A,23023498,EMBALSE VALLE HERMOSO,,,TCL2600018,...,Unproc,X,,A,,REPAIR PARTS,0E,ZCL121,2018-02-28,8650/203086
3,8650,ZRO,7211203648,23023419,CONSORCIO VALLE HERMOSO S.A,23023498,EMBALSE VALLE HERMOSO,,,TCL2600018,...,Unproc,X,,A,,REPAIR PARTS,0E,ZCL121,2018-02-28,8650/203086
4,8650,ZRO,7211194698,23106139,AMPUERO Y COFRE CONSTRUCCION,24958240,CONDOMINIO,,,TCL3100010,...,Unproc,X,,A,,REPAIR PARTS,0E,ZCL915,2018-03-01,8650/203086


In [5]:
def transform_columns(df):
    """Normalise the column types of a BO2CS DataFrame in place.

    The frame is modified column by column; nothing is returned.

    * Text columns are converted to stripped strings. Missing values stay
      missing instead of becoming the literal text 'nan' / 'None'.
    * 'SOrg', 'Sales_Doc' and 'Material' are parsed as numbers (unparseable
      or missing values become 0), cast to 64-bit integers and stored back as
      strings, which removes leading zeros and stray decimals.
    * Date columns are parsed with the 'DD.MM.YYYY' format; values that do
      not match become NaT.
    * Quantity columns written as text with '.' as thousands separator and
      ',' as decimal separator are converted to numbers (unparseable or
      missing values become 0.0). Columns that already hold numbers only get
      their missing values filled with 0.0.

    Args:
        df: DataFrame containing every column named below.

    Raises:
        KeyError: if one of the expected columns is absent from ``df``.
    """
    str_columns = [
        'SOrg', 'SaTy', 'Sales_Doc', 'Sold_to', 'Name_1', 'Name_11', 'CustLoy', 'FocCust', 'Territory',
        'Typ', 'Material', 'Item_Description', 'SU', 'BO_value', 'Curr', 'DS', 'DB', 'BOstatus',
        'BO', 'GM', 'unc', 'Description', 'SC', 'Route'
    ]
    for col in str_columns:
        raw = df[col]
        # astype(str) alone would turn NaN/None into the strings 'nan'/'None';
        # mask those positions back to missing after the conversion.
        df[col] = raw.astype(str).str.strip().where(raw.notna())

    numeric_columns = ['SOrg', 'Sales_Doc', 'Material']
    for col in numeric_columns:
        # Explicit int64: the platform default integer can be 32-bit (e.g. on
        # Windows with older NumPy), too small for 10-digit document numbers.
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype('int64').astype(str)

    date_columns = [
        'Created_on', 'InitReqDt', 'promised', 'MatAvDt', 'DlvDate', 'BO_status_changed', 'Reqdlvdt'
    ]
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce', format='%d.%m.%Y')

    float_columns = ['Order_Qty', 'Corrqty', 'ConfirmQty']
    for col in float_columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            # Already numeric: a round trip through text would delete the
            # decimal point and turn 5.0 into 50.
            df[col] = df[col].fillna(0.0)
            continue
        # regex=False makes '.' a literal dot on every pandas version.
        text = df[col].astype(str)
        text = text.str.replace('.', '', regex=False).str.replace(',', '.', regex=False)
        # Anything that still is not a number is coerced to NaN, then to 0.0.
        df[col] = pd.to_numeric(text, errors='coerce').fillna(0.0)

def transform_bo2cs(bo2cs_file_path, bo2cs_processed_path, bo2cs_exported_path, bo2cs_uploaded_path):
    """Read the raw BO2CS export, clean it and write the transformed copies.

    The raw file is read as tab-separated Latin-1 text, skipping the first
    three lines. Columns whose name contains 'Unnamed:' are dropped, the
    column helpers are applied, a 'key_material' column ('<SOrg>/<Material>')
    is added, and the result is written pipe-delimited to the processed and
    exported paths.

    Args:
        bo2cs_file_path: path of the raw tab-separated export.
        bo2cs_processed_path: destination of the processed CSV.
        bo2cs_exported_path: destination of the exported CSV (same content).
        bo2cs_uploaded_path: accepted for interface compatibility; nothing is
            written to it.

    Returns:
        The transformed DataFrame, or None when reading, transforming or
        writing failed (the error is printed, not raised).
    """
    try:
        # Read every column as text so pandas does not guess types first:
        # inference would drop leading zeros from identifiers, append '.0' to
        # identifier columns with gaps, and misread '.'-grouped quantities
        # before transform_columns gets to clean them.
        df_bo2cs = pd.read_csv(bo2cs_file_path, sep='\t', skiprows=3, encoding='latin1', dtype=str)

        # Drop the columns that have no name.
        unnamed_columns = [col for col in df_bo2cs.columns if 'Unnamed:' in col]
        df_bo2cs = df_bo2cs.drop(columns=unnamed_columns)

        # These helpers are called for their effect on the frame itself.
        clean_column_names(df_bo2cs)
        fix_duplicate_columns(df_bo2cs)
        transform_columns(df_bo2cs)

        # Create the new 'key_material' column.
        df_bo2cs['key_material'] = (df_bo2cs['SOrg'] + '/' + df_bo2cs['Material']).astype(str).str.strip()

        # Save the transformed files.
        df_bo2cs.to_csv(bo2cs_processed_path, index=False, encoding='latin1', sep='|')
        df_bo2cs.to_csv(bo2cs_exported_path, index=False, encoding='latin1', sep='|')

        return df_bo2cs

    except FileNotFoundError as e:
        print(f"File not found: {e.filename}")
    except pd.errors.ParserError as e:
        print(f"Error parsing file: {e}")
    except Exception as e:
        print(f"Error processing file: {e}")

    return None

# Resolve the input/output locations from the current working directory and
# run the full BO2CS transformation.
# NOTE(review): this rebinds base_path to the working directory itself, while
# the earlier cell derived it as the parent directory — confirm which one
# get_file_paths expects.
base_path = os.getcwd()
(
    bo2cs_file_path,
    bo2cs_processed_path,
    bo2cs_exported_path,
    bo2cs_uploaded_path,
) = get_file_paths(base_path)
transform_bo2cs(
    bo2cs_file_path=bo2cs_file_path,
    bo2cs_processed_path=bo2cs_processed_path,
    bo2cs_exported_path=bo2cs_exported_path,
    bo2cs_uploaded_path=bo2cs_uploaded_path,
)

In [7]:
# Show the dtype pandas currently holds for every column of df_bo2cs.
df_bo2cs.dtypes

SOrg                   int64
SaTy                  object
Sales_Doc              int64
Sold_to               object
Name_1                object
Ship_to               object
Name_11               object
CustLoy               object
FocCust               object
Territory             object
Created_on            object
Typ                   object
Material               int64
Item_Description      object
Order_Qty            float64
Corrqty              float64
ConfirmQty           float64
SU                    object
BO_value              object
Curr                  object
InitReqDt             object
promised              object
MatAvDt               object
DlvDate               object
DS                    object
DB                    object
BOstatus              object
BO                    object
BO_status_changed    float64
GM                    object
unc                   object
Description           object
SC                    object
Route                 object
Reqdlvdt      