In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

print(os.getcwd())
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath("ds_proj1")))
os.chdir(ROOT_DIR)
print(os.getcwd())


d:\00_DEVELOPER\globant\ds_proj1\notebooks
d:\00_DEVELOPER\globant\ds_proj1


# Data Cleaning and Feature En

In [2]:
# Reading raw data
raw = pd.read_csv('data/interim/raw_aggregate.csv')

# Creating raw data copy
df_cleaned = raw.copy()

print(df_cleaned.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22810 entries, 0 to 22809
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   TRANS DATE              22810 non-null  object 
 1   TRANS TAX DESC          22573 non-null  object 
 2   ORIGINAL GROSS AMT      22810 non-null  float64
 3   ORIGINAL CUR            22810 non-null  object 
 4   BILLING CUR CODE        14534 non-null  object 
 5   BILLING GROSS AMT       22810 non-null  float64
 6   TRANS TAX AMT           22810 non-null  float64
 7   TRANS ORIGINAL NET AMT  18762 non-null  float64
 8   MERCHANT NAME           22810 non-null  object 
 9   CARD NUMBER             22810 non-null  object 
 10  TRANS CAC CODE 1        22807 non-null  object 
 11  TRANS CAC CODE 2        22801 non-null  object 
 12  TRANS CAC DESC 2        7583 non-null   object 
 13  TRANS CAC CODE 3        19598 non-null  object 
 14  TRANS CAC CODE 4        22804 non-null

Merging Similar Columns: For the columns "DIRECTORATE" and "Directorate," it's essential to combine them into a single column if they represent the same feature. 

In [3]:
def merge_similar_columns(df, col1, col2):
    """
    Merge two similar columns in a DataFrame by filling the missing values 
    of one with the values from the other.
    """
    # Fill missing values in col1 with values from col2
    df[col1] = df[col1].fillna(df[col2])

    # Drop the redundant column
    df.drop(columns=[col2], inplace=True)

    return df

# Merging 'DIRECTORATE' and 'Directorate'
df_cleaned = merge_similar_columns(df_cleaned, 'DIRECTORATE', 'Directorate')

# Checking the first few rows after merging
df_cleaned.head()

Unnamed: 0,TRANS DATE,TRANS TAX DESC,ORIGINAL GROSS AMT,ORIGINAL CUR,BILLING CUR CODE,BILLING GROSS AMT,TRANS TAX AMT,TRANS ORIGINAL NET AMT,MERCHANT NAME,CARD NUMBER,...,TRANS CAC DESC 2,TRANS CAC CODE 3,TRANS CAC CODE 4,TRANS CAC CODE 5,TRANS CAC CODE 6,TRANS CAC CODE 7,TRANS CAC CODE 8,DIRECTORATE,BILLING CUR CODE.1,TRANS TAX RATE
0,2023-10-10,VR,99.67,GBP,GBP,99.67,16.61,83.06,weoley castle,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,,
1,2023-10-13,VR,99.5,GBP,GBP,99.5,16.58,82.92,nts esso weoley petrol fi,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,,
2,2023-10-13,VR,77.47,GBP,GBP,77.47,12.91,64.56,nts esso weoley petrol fi,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,,
3,2023-10-09,VR,117.6,GBP,GBP,117.6,19.6,98.0,mfg waterlinks,************8738,...,AV0AR,3A01,E00,JZZZZZ,TV01V,JZZZ,JXXX,ADULT SOCIAL CARE,,
4,2023-10-11,VR,106.81,GBP,GBP,106.81,17.8,89.01,mfg waterlinks,************8738,...,AV0AR,3A01,E00,JZZZZZ,TV01V,JZZZ,JXXX,ADULT SOCIAL CARE,,


## Handling Missing Values:

In [4]:
import pandas as pd

# Re-defining the necessary functions for the pipeline

def missing_values_handling(df, threshold_row_nan=30):
    """
    Función para manejar valores faltantes en un DataFrame.
    - Imprime el porcentaje de valores faltantes por columna.
    - Elimina filas con un alto porcentaje de valores faltantes.
    - Imputa valores faltantes en columnas numéricas y categóricas.
    """
    # Informe inicial de valores faltantes
    missing_values = df.isnull().sum()
    percent_missing = df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': df.columns,
                                     'missing_values': missing_values,
                                     'percent_missing': percent_missing})

    print("Informe inicial de valores faltantes por columna:\n", missing_value_df)

    # Eliminar filas con alto porcentaje de valores faltantes
    threshold = len(df.columns) * (100 - threshold_row_nan) / 100
    df.dropna(axis="index", thresh=threshold, inplace=True)

    # Imputación de valores faltantes
    for col in df.columns:
        if df[col].dtype == "float64" or df[col].dtype == "int64":
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna(df[col].mode()[0], inplace=True)

    return df

# Aplicando la función de tratamiento de valores faltantes al DataFrame
df_cleaned = missing_values_handling(df_cleaned)

df_cleaned.head()

Informe inicial de valores faltantes por columna:
                                    column_name  missing_values  \
TRANS DATE                          TRANS DATE               0   
TRANS TAX DESC                  TRANS TAX DESC             237   
ORIGINAL GROSS AMT          ORIGINAL GROSS AMT               0   
ORIGINAL CUR                      ORIGINAL CUR               0   
BILLING CUR CODE              BILLING CUR CODE            8276   
BILLING GROSS AMT            BILLING GROSS AMT               0   
TRANS TAX AMT                    TRANS TAX AMT               0   
TRANS ORIGINAL NET AMT  TRANS ORIGINAL NET AMT            4048   
MERCHANT NAME                    MERCHANT NAME               0   
CARD NUMBER                        CARD NUMBER               0   
TRANS CAC CODE 1              TRANS CAC CODE 1               3   
TRANS CAC CODE 2              TRANS CAC CODE 2               9   
TRANS CAC DESC 2              TRANS CAC DESC 2           15227   
TRANS CAC CODE 3         

Unnamed: 0,TRANS DATE,TRANS TAX DESC,ORIGINAL GROSS AMT,ORIGINAL CUR,BILLING CUR CODE,BILLING GROSS AMT,TRANS TAX AMT,TRANS ORIGINAL NET AMT,MERCHANT NAME,CARD NUMBER,...,TRANS CAC DESC 2,TRANS CAC CODE 3,TRANS CAC CODE 4,TRANS CAC CODE 5,TRANS CAC CODE 6,TRANS CAC CODE 7,TRANS CAC CODE 8,DIRECTORATE,BILLING CUR CODE.1,TRANS TAX RATE
0,2023-10-10,VR,99.67,GBP,GBP,99.67,16.61,83.06,weoley castle,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
1,2023-10-13,VR,99.5,GBP,GBP,99.5,16.58,82.92,nts esso weoley petrol fi,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
2,2023-10-13,VR,77.47,GBP,GBP,77.47,12.91,64.56,nts esso weoley petrol fi,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
3,2023-10-09,VR,117.6,GBP,GBP,117.6,19.6,98.0,mfg waterlinks,************8738,...,AV0AR,3A01,E00,JZZZZZ,TV01V,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
4,2023-10-11,VR,106.81,GBP,GBP,106.81,17.8,89.01,mfg waterlinks,************8738,...,AV0AR,3A01,E00,JZZZZZ,TV01V,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0


## Outlier Detection and Handling: 

In [5]:
def detect_and_handle_outliers(df, threshold_outliers=100):
    """
    Función para detectar y manejar outliers en un DataFrame.
    Utiliza el método del rango intercuartílico (IQR).
    """
    # Seleccionando solo columnas numéricas
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

    for col in numerical_cols:
        # Calculando Q1, Q3 y IQR
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        # Definiendo límites para detectar outliers
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Contando outliers
        outliers_count = df[(df[col] < lower_bound) | (df[col] > upper_bound)].shape[0]

        # Opción de manejo de outliers
        if outliers_count > threshold_outliers:
            # Aquí puedes decidir si eliminar o ajustar los outliers
            # Por ejemplo, podríamos limitar los outliers a los límites superior e inferior
            df[col] = df[col].clip(lower_bound, upper_bound)

        print(f"Column '{col}': {outliers_count} outliers detected")

    return df

# Aplicando la función de detección y manejo de outliers al DataFrame
df_cleaned = detect_and_handle_outliers(df_cleaned)

df_cleaned.head()

Column 'ORIGINAL GROSS AMT': 2539 outliers detected
Column 'BILLING GROSS AMT': 2536 outliers detected
Column 'TRANS TAX AMT': 1907 outliers detected
Column 'TRANS ORIGINAL NET AMT': 2671 outliers detected
Column 'TRANS TAX RATE': 1234 outliers detected


Unnamed: 0,TRANS DATE,TRANS TAX DESC,ORIGINAL GROSS AMT,ORIGINAL CUR,BILLING CUR CODE,BILLING GROSS AMT,TRANS TAX AMT,TRANS ORIGINAL NET AMT,MERCHANT NAME,CARD NUMBER,...,TRANS CAC DESC 2,TRANS CAC CODE 3,TRANS CAC CODE 4,TRANS CAC CODE 5,TRANS CAC CODE 6,TRANS CAC CODE 7,TRANS CAC CODE 8,DIRECTORATE,BILLING CUR CODE.1,TRANS TAX RATE
0,2023-10-10,VR,99.67,GBP,GBP,99.67,16.61,83.06,weoley castle,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
1,2023-10-13,VR,99.5,GBP,GBP,99.5,16.58,82.92,nts esso weoley petrol fi,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
2,2023-10-13,VR,77.47,GBP,GBP,77.47,12.91,64.56,nts esso weoley petrol fi,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
3,2023-10-09,VR,117.6,GBP,GBP,117.6,19.6,98.0,mfg waterlinks,************8738,...,AV0AR,3A01,E00,JZZZZZ,TV01V,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
4,2023-10-11,VR,106.81,GBP,GBP,106.81,17.8,89.01,mfg waterlinks,************8738,...,AV0AR,3A01,E00,JZZZZZ,TV01V,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0


## Remove Duplicates

In [6]:
def remove_duplicates(df, id_column=None):
    """
    Función para eliminar duplicados en un DataFrame.
    - Primero elimina duplicados completos.
    - Luego, si se proporciona, elimina duplicados basados en una columna específica.
    """
    initial_shape = df.shape

    # Eliminar duplicados completos
    df.drop_duplicates(inplace=True)

    # Eliminar duplicados basados en una columna específica (si se proporciona)
    #if id_column and id_column in df.columns:
    #    df.drop_duplicates(subset=id_column, inplace=True)

    final_shape = df.shape
    print(f"Duplicados eliminados. Tamaño inicial: {initial_shape}, Tamaño final: {final_shape}")

    return df

# Aplicando la función para eliminar duplicados
df_cleaned = remove_duplicates(df_cleaned, id_column='CARD NUMBER')

df_cleaned.head()

Duplicados eliminados. Tamaño inicial: (22804, 22), Tamaño final: (21532, 22)


Unnamed: 0,TRANS DATE,TRANS TAX DESC,ORIGINAL GROSS AMT,ORIGINAL CUR,BILLING CUR CODE,BILLING GROSS AMT,TRANS TAX AMT,TRANS ORIGINAL NET AMT,MERCHANT NAME,CARD NUMBER,...,TRANS CAC DESC 2,TRANS CAC CODE 3,TRANS CAC CODE 4,TRANS CAC CODE 5,TRANS CAC CODE 6,TRANS CAC CODE 7,TRANS CAC CODE 8,DIRECTORATE,BILLING CUR CODE.1,TRANS TAX RATE
0,2023-10-10,VR,99.67,GBP,GBP,99.67,16.61,83.06,weoley castle,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
1,2023-10-13,VR,99.5,GBP,GBP,99.5,16.58,82.92,nts esso weoley petrol fi,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
2,2023-10-13,VR,77.47,GBP,GBP,77.47,12.91,64.56,nts esso weoley petrol fi,************0140,...,AV0AR,3A01,E00,JZZZZZ,TV01R,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
3,2023-10-09,VR,117.6,GBP,GBP,117.6,19.6,98.0,mfg waterlinks,************8738,...,AV0AR,3A01,E00,JZZZZZ,TV01V,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0
4,2023-10-11,VR,106.81,GBP,GBP,106.81,17.8,89.01,mfg waterlinks,************8738,...,AV0AR,3A01,E00,JZZZZZ,TV01V,JZZZ,JXXX,ADULT SOCIAL CARE,GBP,20.0


## Column Renaming and Data Exporting

In [7]:
df_cleaned.info()

    # Renombrando columnas: minúsculas y reemplazando espacios con guiones bajos
df_cleaned.columns = df_cleaned.columns.str.lower().str.replace(' ', '_')

    # Exportando a CSV
df_cleaned.to_csv('data/interim/df_cleaned.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 21532 entries, 0 to 22809
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   TRANS DATE              21532 non-null  object 
 1   TRANS TAX DESC          21532 non-null  object 
 2   ORIGINAL GROSS AMT      21532 non-null  float64
 3   ORIGINAL CUR            21532 non-null  object 
 4   BILLING CUR CODE        21532 non-null  object 
 5   BILLING GROSS AMT       21532 non-null  float64
 6   TRANS TAX AMT           21532 non-null  float64
 7   TRANS ORIGINAL NET AMT  21532 non-null  float64
 8   MERCHANT NAME           21532 non-null  object 
 9   CARD NUMBER             21532 non-null  object 
 10  TRANS CAC CODE 1        21532 non-null  object 
 11  TRANS CAC CODE 2        21532 non-null  object 
 12  TRANS CAC DESC 2        21532 non-null  object 
 13  TRANS CAC CODE 3        21532 non-null  object 
 14  TRANS CAC CODE 4        21532 non-null