In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt


load_dotenv()

True

In [6]:
from pyspark.sql.functions import col, isnan, when, count
from pyspark.sql import functions as F


In [3]:
def get_env_vars() -> dict:
    """
    Obtiene las variables de entorno al momento de la ejecución,
    para soportar cambios dinámicos (ej. en tests).
    """
    return {
        'aws_access_key_id':     os.getenv('AWS_ACCESS_KEY_ID'),
        'aws_secret_access_key': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'aws_region':            os.getenv('AWS_REGION', 'us-east-1'),
        'bucket_name':           os.getenv('BUCKET_NAME'),
        'prefix_ts':             os.getenv('S3_PATH_PROCESS_TIMESERIES'),
        'prefix_output':         os.getenv('S3_PATH_INTERIM'),
        'raw_ts_col':            os.getenv('TS_COL_NAME', 'timestamp'),
        'ts_format':             os.getenv('TS_FORMAT', 'ddMMyyyy HH:mm:ss'),
        'no_sec_regex':          r'^\d{8} \d{2}:\d{2}$',
        'endpoint':              os.getenv('S3_ENDPOINT', 's3.amazonaws.com')
    }

def create_spark_session() -> SparkSession:
    """
    Crea y retorna una SparkSession configurada para S3A.
    """
    env = get_env_vars()
    spark = (
        SparkSession.builder
        .appName('EDA_Process')
        .config('spark.hadoop.fs.s3a.access.key', env['aws_access_key_id'])
        .config('spark.hadoop.fs.s3a.secret.key', env['aws_secret_access_key'])
        .config('spark.hadoop.fs.s3a.endpoint', f"http://{env['endpoint']}")
        .config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
        .config('spark.hadoop.fs.s3a.path.style.access', 'true')
        .getOrCreate()
    )
    return spark

env_aux=get_env_vars()

In [4]:
spark = create_spark_session()
spark.sparkContext.setLogLevel('ERROR')

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/23 18:08:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/23 18:08:11 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [11]:
process_path = f's3a://{env_aux["bucket_name"]}/{env_aux["prefix_output"]}/process_labeled.parquet'
labels_path = f's3a://{env_aux["bucket_name"]}/{env_aux["prefix_output"]}/laboratory_labeled.parquet'

df_process = spark.read.parquet(process_path)
df_labels = spark.read.parquet(labels_path)

                                                                                

In [19]:


def null_nan_summary(df, name):
    print(f"\nResumen de nulos y NaNs en {name}:")

    numeric_cols = [c for c, t in df.dtypes if t in ("double", "float", "int", "bigint")]
    other_cols = [c for c in df.columns if c not in numeric_cols]

    # Conteo de nulos/NaNs por columna
    summary = df.select(
        *[count(when(col(c).isNull() | isnan(col(c)), c)).alias(c) for c in numeric_cols],
        *[count(when(col(c).isNull(), c)).alias(c) for c in other_cols]
    )

    # Mostrar solo columnas con más de 0 nulos/NaNs
    result = summary.collect()[0].asDict()
    result = {k: v for k, v in result.items() if v > 0}

    if result:
        for col_name, cnt in result.items():
            print(f" - {col_name}: {cnt}")
    else:
        print("✅ No hay nulos ni NaNs")

# Uso
null_nan_summary(df_process, "Process")
null_nan_summary(df_labels, "Labels")



Resumen de nulos y NaNs en Process:
✅ No hay nulos ni NaNs

Resumen de nulos y NaNs en Labels:
 - api_content: 2
 - tbl_min_weight: 10
 - tbl_max_weight: 10
 - api_total_impurities: 5
 - api_l_impurity: 9


In [14]:
df_process

DataFrame[batch: bigint, code: bigint, tbl_speed_mean: double, tbl_speed_change: double, tbl_speed_0_duration: double, total_waste: double, startup_waste: bigint, weekend: string, fom_mean: double, fom_change: bigint, SREL_startup_mean: double, SREL_production_mean: double, SREL_production_max: double, main_CompForce mean: double, main_CompForce_sd: double, main_CompForce_median: double, pre_CompForce_mean: double, tbl_fill_mean: double, tbl_fill_sd: double, cyl_height_mean: double, stiffness_mean: double, stiffness_max: bigint, stiffness_min: bigint, ejection_mean: double, ejection_max: bigint, ejection_min: bigint, Startup_tbl_fill_maxDifference: double, Startup_main_CompForce_mean: double, Startup_tbl_fill_mean: double, Drug release average (%): string, Drug release min (%): string, Residual solvent: string, Total impurities: string, Impurity O: string, Impurity L: string, batch_size: bigint, startup_waste_norm: double, is_anomalous: int]

In [18]:
df_process.toPandas()

Unnamed: 0,batch,code,tbl_speed_mean,tbl_speed_change,tbl_speed_0_duration,total_waste,startup_waste,weekend,fom_mean,fom_change,...,Startup_tbl_fill_mean,Drug release average (%),Drug release min (%),Residual solvent,Total impurities,Impurity O,Impurity L,batch_size,startup_waste_norm,is_anomalous
0,914,13,119.970711,0.937500,16.562500,174.843750,2967,no,99.988875,13,...,5.301538,92.5,89,0.02,0.06,0.05,0.05,1920000,0.001545,0
1,915,13,119.946829,0.833333,15.052083,331.614583,5985,no,99.968560,14,...,5.255238,94.83,88,0.02,0.06,0.05,0.05,1920000,0.003117,0
2,916,13,119.997715,0.520833,20.468750,517.864583,9594,no,99.991280,9,...,5.454242,93,81,0.02,0.05,0.05,0.05,1920000,0.004997,0
3,917,13,119.959698,2.395833,46.718750,772.864583,9480,no,99.987691,19,...,5.622121,89.33,82,0.02,0.11,0.05,0.05,1920000,0.004938,0
4,918,17,99.987616,2.083333,39.166667,724.270833,6884,no,79.981691,15,...,6.191538,91.17,87,0.05,0.06,0.05,0.05,960000,0.007171,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,328,4,99.948145,5.488851,68.267581,3772.898799,21953,no,58.833254,36,...,8.321220,85.5,82,0.08,0.06,0.05,0.06,583000,0.037655,1
1001,329,4,99.966110,5.317324,82.504288,3090.566038,17951,no,59.704198,10,...,7.946190,91.5,89,0.1,0.05,0.05,0.05,583000,0.030791,0
1002,330,4,99.980744,3.430532,46.826758,759.348199,3862,no,59.979514,9,...,7.930588,88,78,0.1,0.06,0.05,0.06,583000,0.006624,0
1003,331,4,99.998904,1.715266,30.703259,560.720412,3008,no,59.939106,9,...,7.858571,89.17,84,0.14,0.07,0.05,0.07,583000,0.005160,0


In [30]:
numeric_cols = [name for name, dtype in df_process.dtypes if dtype in ("double", "float", "int", "bigint")]
categorical_cols = [name for name, dtype in df_process.dtypes if name not in numeric_cols]

print("\n📌 Resumen de columnas categóricas (top 10 si son muchas):")
for c in categorical_cols:
    distinct_count = df_process.select(c).distinct().count()
    print(f"\n{c} → {distinct_count} valores únicos")
    if distinct_count <= 15:
        values = [row[c] for row in df_process.select(c).distinct().collect()]
        print("Valores:", values)
    else:
        top_values = df_process.groupBy(c).count().orderBy(F.desc("count")).limit(10).toPandas()
        print("Top 10 más frecuentes:\n", top_values)




📌 Resumen de columnas categóricas (top 10 si son muchas):

weekend → 2 valores únicos
Valores: ['no', 'yes']

Drug release average (%) → 128 valores únicos
Top 10 más frecuentes:
   Drug release average (%)  count
0                    90.83     23
1                     89.5     23
2                    89.17     23
3                    92.33     22
4                    88.17     22
5                    88.83     22
6                    89.67     21
7                    90.33     21
8                    90.67     20
9                     88.5     20

Drug release min (%) → 28 valores únicos
Top 10 más frecuentes:
   Drug release min (%)  count
0                   85    110
1                   88     88
2                   82     86
3                   84     83
4                   86     82
5                   83     79
6                   87     68
7                   81     58
8                   89     53
9                   80     51

Residual solvent → 24 valores únicos
Top 10 más 

In [21]:
print("\n📌 Resumen de columnas numéricas (num valores únicos + ejemplos):")
summary_unique = []
for c in numeric_cols:
    distinct_count = df_process.select(c).distinct().count()
    sample_values = [row[c] for row in df_process.select(c).distinct().orderBy(c).limit(5).collect()]
    summary_unique.append((c, distinct_count, sample_values))


📌 Resumen de columnas numéricas (num valores únicos + ejemplos):


                                                                                

In [24]:
print("\nTipos de columnas en df_process:")
for col_name, dtype in df_process.dtypes:
    print(f" - {col_name}: {dtype}")

print("\nTipos de columnas en df_labels:")
for col_name, dtype in df_labels.dtypes:
    print(f" - {col_name}: {dtype}")




Tipos de columnas en df_process:
 - batch: bigint
 - code: bigint
 - tbl_speed_mean: double
 - tbl_speed_change: double
 - tbl_speed_0_duration: double
 - total_waste: double
 - startup_waste: bigint
 - weekend: string
 - fom_mean: double
 - fom_change: bigint
 - SREL_startup_mean: double
 - SREL_production_mean: double
 - SREL_production_max: double
 - main_CompForce mean: double
 - main_CompForce_sd: double
 - main_CompForce_median: double
 - pre_CompForce_mean: double
 - tbl_fill_mean: double
 - tbl_fill_sd: double
 - cyl_height_mean: double
 - stiffness_mean: double
 - stiffness_max: bigint
 - stiffness_min: bigint
 - ejection_mean: double
 - ejection_max: bigint
 - ejection_min: bigint
 - Startup_tbl_fill_maxDifference: double
 - Startup_main_CompForce_mean: double
 - Startup_tbl_fill_mean: double
 - Drug release average (%): string
 - Drug release min (%): string
 - Residual solvent: string
 - Total impurities: string
 - Impurity O: string
 - Impurity L: string
 - batch_size: bi

In [28]:
df_process.dtypes

[('batch', 'bigint'),
 ('code', 'bigint'),
 ('tbl_speed_mean', 'double'),
 ('tbl_speed_change', 'double'),
 ('tbl_speed_0_duration', 'double'),
 ('total_waste', 'double'),
 ('startup_waste', 'bigint'),
 ('weekend', 'string'),
 ('fom_mean', 'double'),
 ('fom_change', 'bigint'),
 ('SREL_startup_mean', 'double'),
 ('SREL_production_mean', 'double'),
 ('SREL_production_max', 'double'),
 ('main_CompForce mean', 'double'),
 ('main_CompForce_sd', 'double'),
 ('main_CompForce_median', 'double'),
 ('pre_CompForce_mean', 'double'),
 ('tbl_fill_mean', 'double'),
 ('tbl_fill_sd', 'double'),
 ('cyl_height_mean', 'double'),
 ('stiffness_mean', 'double'),
 ('stiffness_max', 'bigint'),
 ('stiffness_min', 'bigint'),
 ('ejection_mean', 'double'),
 ('ejection_max', 'bigint'),
 ('ejection_min', 'bigint'),
 ('Startup_tbl_fill_maxDifference', 'double'),
 ('Startup_main_CompForce_mean', 'double'),
 ('Startup_tbl_fill_mean', 'double'),
 ('Drug release average (%)', 'string'),
 ('Drug release min (%)', 'string