In [None]:
import findspark
findspark.init('/spark-3.5.1-bin-hadoop3')
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, col
from pyspark.sql.types import StructType, StructField, FloatType
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
import time



# Build the Spark session for this notebook (application name "analytics",
# driver memory configured to 2g); getOrCreate() hands back an existing
# session when one is already active.
spark = (
    SparkSession.builder
    .appName("analytics")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)


In [None]:




# Schema of the simulated dataset: each listed column is a nullable float field.
schema = StructType([
    StructField(column_name, FloatType(), True)
    for column_name in (
        "age",
        "creatinine_phosphokinase",
        "ejection_fraction",
        "platelets",
        "serum_creatinine",
        "serum_sodium",
        "time",
    )
])

# Simulate a data stream by building a DataFrame filled with random values
def generate_stream_data(num_rows=100, seed=None):
    """Build a Spark DataFrame of random values, one float column per feature.

    Each column is produced as ``rand() * bound`` cast to float, so its values
    fall in the range [0, bound).

    Args:
        num_rows: Number of rows to generate. Defaults to 100, the size the
            notebook originally hard-coded.
        seed: Optional integer seed. When given, column ``i`` uses
            ``seed + i`` so the columns do not all share one random sequence.
            When ``None`` (default) every column uses an unseeded ``rand()``,
            exactly as before.
            NOTE(review): Spark documents ``rand`` as non-deterministic in the
            general case; a seed is expected to reproduce results only for an
            unchanged partitioning -- confirm if strict reproducibility matters.

    Returns:
        A DataFrame with ``num_rows`` rows and the columns age,
        creatinine_phosphokinase, ejection_fraction, platelets,
        serum_creatinine, serum_sodium and time.

    Raises:
        ValueError: If ``num_rows`` is negative.
    """
    if num_rows < 0:
        raise ValueError(f"num_rows must be non-negative, got {num_rows}")

    # (column name, exclusive upper bound of the generated values)
    column_bounds = [
        ("age", 100),
        ("creatinine_phosphokinase", 8000),
        ("ejection_fraction", 80),
        ("platelets", 450000),
        ("serum_creatinine", 5),
        ("serum_sodium", 150),
        ("time", 300),
    ]

    def _uniform(offset):
        # Offsetting the seed per column avoids generating seven identical
        # (perfectly correlated) columns when a seed is supplied.
        return rand() if seed is None else rand(seed + offset)

    return spark.range(0, num_rows).select(
        *[
            (_uniform(offset) * bound).cast("float").alias(name)
            for offset, (name, bound) in enumerate(column_bounds)
        ]
    )

# Generate the initial training data
initial_data = generate_stream_data()




In [None]:
# Columns that feed the clustering step
features = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]

# Pack the selected columns into a single vector column
vector_assembler = VectorAssembler(
    inputCols=features,
    outputCol="unscaled_features",
)
initial_data_vector = vector_assembler.transform(initial_data)

# Fit a scaler on the assembled vectors, then apply it to produce "features"
scaler = StandardScaler(
    inputCol="unscaled_features",
    outputCol="features",
)
scaler_model = scaler.fit(initial_data_vector)
initial_data_scaled = scaler_model.transform(initial_data_vector)



# Generar datos para simular streaming para el otro notebook

In [None]:


# Simulate stream generation: write several batches of random data to a folder
NUM_STREAM_BATCHES = 5                        # number of batches to emit
STREAM_BATCH_INTERVAL_SECONDS = 30            # pause between consecutive batches
STREAM_OUTPUT_PATH = "data/simulated_stream"  # location the parquet batches are appended to

for batch_index in range(NUM_STREAM_BATCHES):
    stream_data = generate_stream_data()
    stream_data.write.mode("append").parquet(STREAM_OUTPUT_PATH)
    print("Wrote data to the parquet file")
    # Pause only between batches: sleeping after the last write would just
    # keep the cell busy without producing any further data.
    if batch_index < NUM_STREAM_BATCHES - 1:
        time.sleep(STREAM_BATCH_INTERVAL_SECONDS)

