In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt

# 1. Carga de Datos
def load_data(file_path):
    return pd.read_excel("datos/Export.xlsx", skiprows=7)

# 2. Selección Inicial de Variables (Basada en EDA)
def initial_variable_selection(df):
    # Variables seleccionadas según el EDA
    selected_variables = [
        "Company",
        "Cash and Cash Equivalents",    
        "Export",
        "Import",
        "Industry (NAICS)",
        "Long term Debt",
        "Net Sales Revenue Trend (%)",
        "Number of Employees",
        "Operating Profit Trend (%)",
        "Property, plant and equipment",
        "Return on Assets (ROA) (%)",
        "Return on Equity (ROE) (%)",
        "Quick Ratio (x)",
        "Short Term Debt",      
        "Total operating revenue",
    ]
    return df[selected_variables]

# 3. Limpieza de Datos
def clean_data(df):
    # Asignar dtypes correctos
    for col in df.columns:
        try:
            df[col] = df[col].astype(float)
        except ValueError:
            df[col] = df[col].astype(str)
    
    # Identificar columnas categóricas y numéricas
    numeric_columns = df.select_dtypes(include=['float64']).columns
    categorical_columns = df.select_dtypes(include=['object']).columns
    
    # Imputación
    numeric_imputer = SimpleImputer(strategy='median')
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    
    df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])
    df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])
    return df

# 4. Transformación de Datos
def transform_data(df):
    # Evitar divisiones por cero
    df['Property, plant and equipment'] = df['Property, plant and equipment'].replace(0, np.nan)
    df['Debt_to_Assets'] = (df['Long term Debt'] + df['Short Term Debt']) / df['Property, plant and equipment']
    df['Relative_Growth'] = df['Net Sales Revenue Trend (%)'] - df['Operating Profit Trend (%)']
    
    # Reemplazar infinitos y NaN generados
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)  # Rellenar valores faltantes con 0 después de las transformaciones
    return df

# 5. Preprocesamiento y Normalización
def prepare_pipeline(df):
    # Identificar columnas categóricas y numéricas
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()
    numeric_features = df.select_dtypes(include=['float64']).columns.tolist()
    
    # Preprocesamiento para variables numéricas
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])
    
    # Preprocesamiento para variables categóricas
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Combinar preprocesamiento
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='drop'  # Eliminar columnas no especificadas
    )
    return preprocessor

# 6. Reducción de Dimensionalidad con PCA
def apply_pca(data, n_components=14):
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(data)
    explained_variance = pca.explained_variance_ratio_
    loadings = pca.components_
    return principal_components, explained_variance, loadings, pca

# Pipeline Completo
def run_pipeline(file_path):
    # Carga de datos
    data = load_data(file_path)
    
    # Selección inicial de variables
    selected_data = initial_variable_selection(data)
    
    # Limpieza de datos
    clean_data_df = clean_data(selected_data)
    
    # Transformación de datos
    transformed_data = transform_data(clean_data_df)
    
    # Preprocesamiento
    preprocessor = prepare_pipeline(transformed_data)
    processed_data = preprocessor.fit_transform(transformed_data)
    
    # Reducción de dimensionalidad
    pca_data, explained_variance, loadings, pca_model = apply_pca(processed_data)
        
    return pca_data, explained_variance, loadings, pca_model

# Uso del pipeline
file_path = 'preprocessed_data.csv'  # Ruta al archivo CSV
pca_data, explained_variance, loadings, pca_model = run_pipeline(file_path)

# Resultados:
# - pca_data: Datos transformados y listos para clustering.
# - explained_variance: Varianza explicada por cada componente.
# - loadings: Cargas de las variables en cada componente.
# - pca_model: Modelo PCA para análisis adicional.

# Mostrar las primeras filas del DataFrame final
print(pca_data[:5])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

[[ 1.48175855e+02 -2.40642200e-01  1.30448717e-01 -4.88744582e+01
  -1.15112575e+00  1.63563742e+00  1.57636861e+00  1.75293957e-01
  -4.88672805e+00  2.35925424e+00  9.12255638e-01 -2.54234765e+00
   2.34767964e+00 -3.38617040e-01]
 [ 1.54939213e+01  2.49676120e-02 -1.18656569e-02 -9.86444922e+00
  -1.65049352e-01  2.25401975e-01  2.31771327e-01  2.76182299e-02
  -9.12027611e-01 -1.06271501e+00 -2.37816358e+00  1.43552615e+01
   2.39087725e+00 -3.41660362e-01]
 [ 7.53873514e+00  8.16611012e-03 -1.38100684e-02 -5.55620686e+00
  -1.23311299e-01  1.13253383e-01  1.04098897e-01 -4.10903303e-02
   6.39423615e-01 -1.03183713e+00 -1.39310702e+00  6.69419283e+00
  -4.83979730e-01  1.23290281e-01]
 [ 2.10215917e+01 -5.01878847e-02 -1.64141623e-02 -1.00926213e+01
  -2.98456428e-01  3.00579527e-01  2.73798823e-01 -2.61329314e-04
  -3.17936135e-01 -3.27771773e-01  1.87025103e-01 -3.63928859e+00
  -8.02608442e+00  1.42289955e+00]
 [ 7.40631817e+00  4.41147822e-03 -1.92335132e-02 -6.39573565e+00
  

In [3]:
pca_data

array([[ 1.48175855e+02, -2.40642200e-01,  1.30448717e-01, ...,
        -2.54234765e+00,  2.34767964e+00, -3.38617040e-01],
       [ 1.54939213e+01,  2.49676120e-02, -1.18656569e-02, ...,
         1.43552615e+01,  2.39087725e+00, -3.41660362e-01],
       [ 7.53873514e+00,  8.16611012e-03, -1.38100684e-02, ...,
         6.69419283e+00, -4.83979730e-01,  1.23290281e-01],
       ...,
       [-7.16696456e-02, -7.06374692e-03, -3.60749225e-02, ...,
        -1.04872595e-01, -2.05632638e-01, -6.16037848e-01],
       [-7.16696456e-02, -7.06374692e-03, -3.60749225e-02, ...,
        -1.04872595e-01, -2.05632638e-01, -6.16037848e-01],
       [-7.16696456e-02, -7.06374692e-03, -3.60749225e-02, ...,
        -1.04872595e-01, -2.05632638e-01, -6.16037848e-01]])

In [4]:
explained_variance

array([0.20204585, 0.12389603, 0.09361155, 0.07132571, 0.0627128 ,
       0.06228652, 0.06217654, 0.0308118 , 0.03035011, 0.02092784,
       0.00616885, 0.00506666, 0.00228089, 0.00209759])

In [5]:
loadings

array([[ 3.92404941e-01,  5.37242899e-01,  1.63194234e-03, ...,
         4.74256389e-06, -2.69365685e-06,  1.23051068e-05],
       [-2.55035950e-03, -2.51650184e-03,  3.39027878e-01, ...,
        -1.59706768e-07,  7.36245402e-08,  2.83650144e-03],
       [-7.95596087e-05,  5.70998909e-04,  9.42327013e-03, ...,
        -2.76272970e-06, -1.20484456e-06,  1.52877035e-03],
       ...,
       [ 7.62507537e-02, -4.28775851e-01, -6.70800330e-03, ...,
         1.33891480e-04, -1.68398335e-05,  2.39153471e-03],
       [-8.21190371e-02,  6.63000783e-01, -2.24918414e-03, ...,
        -4.23454000e-05,  4.19693671e-05, -2.06840900e-01],
       [ 1.51033262e-02, -1.21882089e-01, -6.44498888e-03, ...,
         8.24995181e-06,  2.53790783e-05, -5.69057557e-01]])

In [6]:
pca_model