In [1]:
import pandas as pd

In [2]:
from dotenv import load_dotenv
import os
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt


# Load variables from a .env file into the process environment so that the
# os.getenv() lookups in get_env_vars() can see them.
load_dotenv()

True

In [3]:
def get_env_vars() -> dict:
    """
    Build the runtime configuration from environment variables.

    The environment is queried on every call instead of being cached, so
    changes made to it after import (e.g. in tests) are picked up.

    Returns:
        dict: AWS credentials and region, bucket name, S3 prefixes, timestamp
        column name and format, the fixed ``no_sec_regex`` pattern and the S3
        endpoint. Entries whose variable is unset and has no default are
        ``None``.
    """
    # (result key, environment variable, fallback) in output order.
    sources = (
        ('aws_access_key_id', 'AWS_ACCESS_KEY_ID', None),
        ('aws_secret_access_key', 'AWS_SECRET_ACCESS_KEY', None),
        ('aws_region', 'AWS_REGION', 'us-east-1'),
        ('bucket_name', 'BUCKET_NAME', None),
        ('prefix_ts', 'S3_PATH_PROCESS_TIMESERIES', None),
        ('prefix_output', 'S3_PATH_INTERIM', None),
        ('raw_ts_col', 'TS_COL_NAME', 'timestamp'),
        ('ts_format', 'TS_FORMAT', 'ddMMyyyy HH:mm:ss'),
    )
    settings = {}
    for key, variable, fallback in sources:
        settings[key] = os.getenv(variable, fallback)

    # Constant pattern, not read from the environment.
    settings['no_sec_regex'] = r'^\d{8} \d{2}:\d{2}$'
    settings['endpoint'] = os.getenv('S3_ENDPOINT', 's3.amazonaws.com')
    return settings

def create_spark_session() -> SparkSession:
    """
    Create (or reuse) a SparkSession configured to access S3 through S3A.

    Credentials and the endpoint are taken from ``get_env_vars()`` at call
    time. Path-style access is enabled and the S3A filesystem implementation
    is set explicitly.

    Returns:
        SparkSession: the session returned by ``builder.getOrCreate()``.
    """
    env = get_env_vars()

    # S3_ENDPOINT may be a bare host ("s3.amazonaws.com") or a full URL
    # ("https://minio.local:9000"). Only a bare host receives the historical
    # "http://" prefix; prefixing a full URL would produce an invalid
    # "http://https://..." endpoint.
    endpoint = env['endpoint']
    if '://' not in endpoint:
        endpoint = f"http://{endpoint}"

    spark = (
        SparkSession.builder
        .appName('EDA_Process')
        .config('spark.hadoop.fs.s3a.access.key', env['aws_access_key_id'])
        .config('spark.hadoop.fs.s3a.secret.key', env['aws_secret_access_key'])
        .config('spark.hadoop.fs.s3a.endpoint', endpoint)
        .config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
        .config('spark.hadoop.fs.s3a.path.style.access', 'true')
        .getOrCreate()
    )
    return spark

In [4]:
# Snapshot of the environment-derived settings, used below to build S3 paths.
env_aux=get_env_vars()

In [5]:
# Start (or reuse) the S3A-enabled Spark session built by create_spark_session().
spark = create_spark_session()
# Restrict Spark logging to errors for the rest of the notebook.
spark.sparkContext.setLogLevel('ERROR')

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/23 17:29:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# S3A path of the labelled laboratory dataset under the configured output prefix.
s3_path_lab = f's3a://{env_aux["bucket_name"]}/{env_aux["prefix_output"]}/laboratory_labeled.parquet'
s3_path_lab

's3a://anomaly-pharma-bucket/data/interim/laboratory_labeled.parquet'

In [7]:
# S3A path of the process dataset under the configured output prefix.
s3_path_proc = f's3a://{env_aux["bucket_name"]}/{env_aux["prefix_output"]}/process.parquet'
s3_path_proc

's3a://anomaly-pharma-bucket/data/interim/process.parquet'

In [8]:
# S3A path of the normalisation dataset under the configured output prefix.
s3_path_norm = f's3a://{env_aux["bucket_name"]}/{env_aux["prefix_output"]}/normalization.parquet'
s3_path_norm

's3a://anomaly-pharma-bucket/data/interim/normalization.parquet'

In [9]:
# Read the three parquet inputs as Spark DataFrames
# (laboratory labels, process features, normalisation lookup), in that order.
df_lab, df_proc, df_norm = [
    spark.read.parquet(location)
    for location in (s3_path_lab, s3_path_proc, s3_path_norm)
]

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

In [10]:
# Show the column names and types of the process dataset.
print("Esquema de Process:")
df_proc.printSchema()


Esquema de Process:
root
 |-- batch: integer (nullable = true)
 |-- code: integer (nullable = true)
 |-- tbl_speed_mean: double (nullable = true)
 |-- tbl_speed_change: double (nullable = true)
 |-- tbl_speed_0_duration: double (nullable = true)
 |-- total_waste: double (nullable = true)
 |-- startup_waste: integer (nullable = true)
 |-- weekend: string (nullable = true)
 |-- fom_mean: double (nullable = true)
 |-- fom_change: integer (nullable = true)
 |-- SREL_startup_mean: double (nullable = true)
 |-- SREL_production_mean: double (nullable = true)
 |-- SREL_production_max: double (nullable = true)
 |-- main_CompForce mean: double (nullable = true)
 |-- main_CompForce_sd: double (nullable = true)
 |-- main_CompForce_median: double (nullable = true)
 |-- pre_CompForce_mean: double (nullable = true)
 |-- tbl_fill_mean: double (nullable = true)
 |-- tbl_fill_sd: double (nullable = true)
 |-- cyl_height_mean: double (nullable = true)
 |-- stiffness_mean: double (nullable = true)
 |-- st

In [11]:
# Show the column names and types of the normalisation dataset.
print("Esquema de Normalization:")
df_norm.printSchema()


Esquema de Normalization:
root
 |-- code: integer (nullable = true)
 |-- batch: integer (nullable = true)
 |-- normalisation_factor: double (nullable = true)



In [12]:
# Show the column names and types of the labelled laboratory dataset.
print("Esquema de Laboratory Labeled:")
df_lab.printSchema()

Esquema de Laboratory Labeled:
root
 |-- batch: integer (nullable = true)
 |-- code: integer (nullable = true)
 |-- strength: string (nullable = true)
 |-- size: integer (nullable = true)
 |-- start: string (nullable = true)
 |-- api_code: integer (nullable = true)
 |-- api_batch: integer (nullable = true)
 |-- smcc_batch: integer (nullable = true)
 |-- lactose_batch: integer (nullable = true)
 |-- starch_batch: integer (nullable = true)
 |-- api_water: string (nullable = true)
 |-- api_total_impurities: string (nullable = true)
 |-- api_l_impurity: string (nullable = true)
 |-- api_content: double (nullable = true)
 |-- api_ps01: string (nullable = true)
 |-- api_ps05: string (nullable = true)
 |-- api_ps09: string (nullable = true)
 |-- lactose_water: double (nullable = true)
 |-- lactose_sieve0045: integer (nullable = true)
 |-- lactose_sieve015: integer (nullable = true)
 |-- lactose_sieve025: integer (nullable = true)
 |-- smcc_water: double (nullable = true)
 |-- smcc_td: double 

In [13]:
# Collect the three Spark DataFrames to the driver as pandas DataFrames.
df_lab_pd, df_proc_pd, df_norm_pd = [
    spark_df.toPandas() for spark_df in (df_lab, df_proc, df_norm)
]

                                                                                

In [14]:
# Display the normalisation lookup table.
df_norm_pd

Unnamed: 0,code,batch,normalisation_factor
0,1,240000,2.4
1,2,1920000,19.2
2,3,960000,9.6
3,4,583000,5.83
4,5,2400000,24.0
5,6,2400000,24.0
6,7,1200000,12.0
7,8,1100000,11.0
8,9,240000,2.4
9,10,960000,9.6


In [15]:

# Attach the batch size to every process record via the product code.
# NOTE(review): in the normalisation table the "batch" column appears to hold
# the batch size per code (not a batch id), hence the rename — confirm.
df_norm_pd = df_norm_pd.rename(columns={"batch": "batch_size"})
df_aux = df_proc_pd.merge(
    df_norm_pd[["code", "batch_size"]],
    on="code",
    how="left",
    # Raise instead of silently duplicating process rows if a code occurs
    # more than once in the lookup table.
    validate="many_to_one",
)

# Correlation (pandas default method) between startup waste and batch size.
corr = df_aux["startup_waste"].corr(df_aux["batch_size"])
print(f"Correlación startup_waste vs. batch size: {corr:.2f}")

Correlación startup_waste vs. batch size: 0.28


In [16]:
# List the column names of the process dataset.
df_proc_pd.columns

Index(['batch', 'code', 'tbl_speed_mean', 'tbl_speed_change',
       'tbl_speed_0_duration', 'total_waste', 'startup_waste', 'weekend',
       'fom_mean', 'fom_change', 'SREL_startup_mean', 'SREL_production_mean',
       'SREL_production_max', 'main_CompForce mean', 'main_CompForce_sd',
       'main_CompForce_median', 'pre_CompForce_mean', 'tbl_fill_mean',
       'tbl_fill_sd', 'cyl_height_mean', 'stiffness_mean', 'stiffness_max',
       'stiffness_min', 'ejection_mean', 'ejection_max', 'ejection_min',
       'Startup_tbl_fill_maxDifference', 'Startup_main_CompForce_mean',
       'Startup_tbl_fill_mean', 'Drug release average (%)',
       'Drug release min (%)', 'Residual solvent', 'Total impurities',
       'Impurity O', 'Impurity L'],
      dtype='object')

In [17]:
# Display the process records after the batch_size column has been merged in.
df_aux

Unnamed: 0,batch,code,tbl_speed_mean,tbl_speed_change,tbl_speed_0_duration,total_waste,startup_waste,weekend,fom_mean,fom_change,...,Startup_tbl_fill_maxDifference,Startup_main_CompForce_mean,Startup_tbl_fill_mean,Drug release average (%),Drug release min (%),Residual solvent,Total impurities,Impurity O,Impurity L,batch_size
0,1,25,99.864656,5.416667,149.583333,2125.416667,5085,no,49.961446,12,...,0.38,4.587500,5.466667,93.83,86,0.06,0.33,0.05,0.16,240000
1,2,25,99.936342,2.500000,128.333333,887.500000,2115,no,49.962040,5,...,0.18,4.390909,5.315455,99.67,92,0.04,0.34,0.06,0.16,240000
2,3,25,99.985984,2.500000,83.333333,796.250000,1895,no,49.961176,6,...,0.12,4.430000,5.242000,97.33,92,0.03,0.28,0.05,0.16,240000
3,4,25,99.976868,2.916667,76.250000,695.833333,1645,no,49.960900,9,...,0.24,4.500000,5.221250,94.5,89,0.03,0.3,0.05,0.18,240000
4,5,25,99.968284,2.500000,121.250000,829.166667,1971,no,50.000000,5,...,0.19,3.960000,5.233000,92,88,0.04,0.31,0.05,0.18,240000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1001,17,119.973595,1.458333,43.125000,972.395833,3564,no,79.955846,8,...,0.20,6.333333,6.523333,,,,,,,960000
1001,1002,17,119.941321,2.604167,43.125000,1055.625000,4090,no,80.000000,8,...,0.13,5.987500,6.438125,,,,,,,960000
1002,1003,17,119.893113,3.958333,42.812500,1456.875000,6950,no,79.978230,9,...,0.38,6.134783,6.381739,,,,,,,960000
1003,1004,17,120.000000,1.145833,36.041667,404.687500,1793,no,79.964059,9,...,0.22,5.887500,6.532500,,,,,,,960000


In [18]:
# Express startup waste as a fraction of the batch size so that batches of
# different sizes are comparable.
df_aux = df_aux.assign(
    startup_waste_norm=df_aux["startup_waste"] / df_aux["batch_size"]
)
df_aux

Unnamed: 0,batch,code,tbl_speed_mean,tbl_speed_change,tbl_speed_0_duration,total_waste,startup_waste,weekend,fom_mean,fom_change,...,Startup_main_CompForce_mean,Startup_tbl_fill_mean,Drug release average (%),Drug release min (%),Residual solvent,Total impurities,Impurity O,Impurity L,batch_size,startup_waste_norm
0,1,25,99.864656,5.416667,149.583333,2125.416667,5085,no,49.961446,12,...,4.587500,5.466667,93.83,86,0.06,0.33,0.05,0.16,240000,0.021188
1,2,25,99.936342,2.500000,128.333333,887.500000,2115,no,49.962040,5,...,4.390909,5.315455,99.67,92,0.04,0.34,0.06,0.16,240000,0.008812
2,3,25,99.985984,2.500000,83.333333,796.250000,1895,no,49.961176,6,...,4.430000,5.242000,97.33,92,0.03,0.28,0.05,0.16,240000,0.007896
3,4,25,99.976868,2.916667,76.250000,695.833333,1645,no,49.960900,9,...,4.500000,5.221250,94.5,89,0.03,0.3,0.05,0.18,240000,0.006854
4,5,25,99.968284,2.500000,121.250000,829.166667,1971,no,50.000000,5,...,3.960000,5.233000,92,88,0.04,0.31,0.05,0.18,240000,0.008212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1001,17,119.973595,1.458333,43.125000,972.395833,3564,no,79.955846,8,...,6.333333,6.523333,,,,,,,960000,0.003713
1001,1002,17,119.941321,2.604167,43.125000,1055.625000,4090,no,80.000000,8,...,5.987500,6.438125,,,,,,,960000,0.004260
1002,1003,17,119.893113,3.958333,42.812500,1456.875000,6950,no,79.978230,9,...,6.134783,6.381739,,,,,,,960000,0.007240
1003,1004,17,120.000000,1.145833,36.041667,404.687500,1793,no,79.964059,9,...,5.887500,6.532500,,,,,,,960000,0.001868


In [49]:
# Show every column of the record at positional index 986.
# NOTE(review): the position is hard-coded; presumably chosen to inspect the
# unusually high startup_waste visible in the output — confirm.
df_aux.iloc[986,:]

batch                                    987
code                                      23
tbl_speed_mean                    119.944857
tbl_speed_change                    2.395833
tbl_speed_0_duration               38.645833
total_waste                          2158.75
startup_waste                          20702
weekend                                   no
fom_mean                                80.0
fom_change                                16
SREL_startup_mean                    3.93125
SREL_production_mean                3.642098
SREL_production_max                      6.1
main_CompForce mean                 5.460651
main_CompForce_sd                   0.097951
main_CompForce_median                    5.5
pre_CompForce_mean                       0.0
tbl_fill_mean                       6.109309
tbl_fill_sd                         0.085822
cyl_height_mean                     2.025863
stiffness_mean                     86.596383
stiffness_max                             95
stiffness_

In [21]:
# Display the labelled laboratory dataset.
df_lab_pd

Unnamed: 0,batch,code,strength,size,start,api_code,api_batch,smcc_batch,lactose_batch,starch_batch,...,dissolution_min,resodual_solvent,impurities_total,impurity_o,impurity_l,dissolution_av_below,api_content_below,api_content_above,impurities_total_above,is_anomalous
0,1,25,5MG,240000,nov.18,5,2,1,2,1,...,86,0.06,0.33,0.05,0.16,0,0,0,0,0
1,2,25,5MG,240000,nov.18,5,2,1,2,1,...,92,0.04,0.34,0.06,0.16,0,0,0,0,0
2,3,25,5MG,240000,nov.18,5,2,1,2,1,...,92,0.03,0.28,0.05,0.16,0,0,0,0,0
3,4,25,5MG,240000,nov.18,5,2,1,2,1,...,89,0.03,0.30,0.05,0.18,0,0,0,0,0
4,5,25,5MG,240000,nov.18,5,2,1,2,1,...,88,0.04,0.31,0.05,0.18,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1001,17,20M,960000,apr.21,3,254,18,22,17,...,85,0.02,0.05,0.05,0.05,0,0,0,0,0
1001,1002,17,20M,960000,apr.21,3,254,18,22,17,...,84,0.02,0.05,0.05,0.05,0,0,0,0,0
1002,1003,17,20M,960000,apr.21,3,254,18,22,17,...,87,0.02,0.05,0.05,0.05,0,0,0,0,0
1003,1004,17,20M,960000,apr.21,3,254,18,22,17,...,90,0.02,0.05,0.05,0.05,0,0,0,0,0


In [30]:
# Convert the enriched pandas frame back into a Spark DataFrame for the join below.
df_aux_spk =spark.createDataFrame(df_aux)

In [32]:
# Add the is_anomalous label to each process record by batch; the inner join
# keeps only batches present in both datasets.
df_proc_final = df_aux_spk.join(
    df_lab.select('batch', 'is_anomalous'),
    on='batch',
    how='inner',
)

In [34]:
# Preview the first five labelled process records.
df_proc_final.show(5)

[Stage 13:>                                                         (0 + 1) / 1]

+-----+----+--------------+----------------+--------------------+-----------+-------------+-------+-----------+----------+-----------------+--------------------+-------------------+-------------------+-----------------+---------------------+------------------+-------------+-----------+---------------+--------------+-------------+-------------+-------------+------------+------------+------------------------------+---------------------------+---------------------+------------------------+--------------------+----------------+----------------+----------+----------+----------+--------------------+------------+
|batch|code|tbl_speed_mean|tbl_speed_change|tbl_speed_0_duration|total_waste|startup_waste|weekend|   fom_mean|fom_change|SREL_startup_mean|SREL_production_mean|SREL_production_max|main_CompForce mean|main_CompForce_sd|main_CompForce_median|pre_CompForce_mean|tbl_fill_mean|tbl_fill_sd|cyl_height_mean|stiffness_mean|stiffness_max|stiffness_min|ejection_mean|ejection_max|ejection_min|S

                                                                                

In [35]:
# Row counts before and after the join, to see whether the inner join
# dropped any process records.
print(f"Total registros en Process: {df_proc.count()}")
print(f"Total registros tras unir etiquetas: {df_proc_final.count()}")

# Show a sample of the joined result
df_proc_final.show(5)


Total registros en Process: 1005


                                                                                

Total registros tras unir etiquetas: 1005
+-----+----+--------------+----------------+--------------------+-----------+-------------+-------+-----------+----------+-----------------+--------------------+-------------------+-------------------+-----------------+---------------------+------------------+-------------+-----------+---------------+--------------+-------------+-------------+-------------+------------+------------+------------------------------+---------------------------+---------------------+------------------------+--------------------+----------------+----------------+----------+----------+----------+--------------------+------------+
|batch|code|tbl_speed_mean|tbl_speed_change|tbl_speed_0_duration|total_waste|startup_waste|weekend|   fom_mean|fom_change|SREL_startup_mean|SREL_production_mean|SREL_production_max|main_CompForce mean|main_CompForce_sd|main_CompForce_median|pre_CompForce_mean|tbl_fill_mean|tbl_fill_sd|cyl_height_mean|stiffness_mean|stiffness_max|stiffness_min

In [36]:
# S3A destination of the labelled process dataset under the configured output prefix.
output_path=f's3a://{env_aux["bucket_name"]}/{env_aux["prefix_output"]}/process_labeled.parquet'
output_path

's3a://anomaly-pharma-bucket/data/interim/process_labeled.parquet'

In [37]:
# Persist the labelled process dataset as parquet, replacing any existing
# data at output_path.
(
    df_proc_final.write
    .mode("overwrite")
    .parquet(output_path)
)

                                                                                

In [50]:
# Collect the labelled result to pandas to display it as a table.
df_proc_final.toPandas()

Unnamed: 0,batch,code,tbl_speed_mean,tbl_speed_change,tbl_speed_0_duration,total_waste,startup_waste,weekend,fom_mean,fom_change,...,Startup_tbl_fill_mean,Drug release average (%),Drug release min (%),Residual solvent,Total impurities,Impurity O,Impurity L,batch_size,startup_waste_norm,is_anomalous
0,1,25,99.864656,5.416667,149.583333,2125.416667,5085,no,49.961446,12,...,5.466667,93.83,86,0.06,0.33,0.05,0.16,240000,0.021188,0
1,2,25,99.936342,2.500000,128.333333,887.500000,2115,no,49.962040,5,...,5.315455,99.67,92,0.04,0.34,0.06,0.16,240000,0.008812,0
2,3,25,99.985984,2.500000,83.333333,796.250000,1895,no,49.961176,6,...,5.242000,97.33,92,0.03,0.28,0.05,0.16,240000,0.007896,0
3,4,25,99.976868,2.916667,76.250000,695.833333,1645,no,49.960900,9,...,5.221250,94.5,89,0.03,0.3,0.05,0.18,240000,0.006854,0
4,5,25,99.968284,2.500000,121.250000,829.166667,1971,no,50.000000,5,...,5.233000,92,88,0.04,0.31,0.05,0.18,240000,0.008212,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,1001,17,119.973595,1.458333,43.125000,972.395833,3564,no,79.955846,8,...,6.523333,,,,,,,960000,0.003713,0
1001,1002,17,119.941321,2.604167,43.125000,1055.625000,4090,no,80.000000,8,...,6.438125,,,,,,,960000,0.004260,0
1002,1003,17,119.893113,3.958333,42.812500,1456.875000,6950,no,79.978230,9,...,6.381739,,,,,,,960000,0.007240,0
1003,1004,17,120.000000,1.145833,36.041667,404.687500,1793,no,79.964059,9,...,6.532500,,,,,,,960000,0.001868,0
