In [15]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, CountVectorizer, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import logging
import os

In [16]:
HDFS_PATH = "hdfs://localhost:8020"
HDFS_FORMAT = "parquet"

PG_URL = "jdbc:postgresql://localhost:5432/database"
PG_USER = "username"
PG_PASS = "password"

KAFKA_BOOTSTRAP = "localhost:9092"
KAFKA_TOPIC = "chartevents"

ROW_PER_SECOND = 5

SUBJECT_ID = 10017531

In [17]:
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def setup_spark():
    os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-17-openjdk-amd64'

    """Configure Spark session for better performance"""
    return SparkSession.builder \
        .appName("DrugRecommendationModel") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
        .config("spark.sql.adaptive.skew.enabled", "true") \
        .getOrCreate()

def validate_dataframe(df, name):
    """Validate DataFrame for nulls and empty conditions"""
    count = df.count()
    if count == 0:
        raise ValueError(f"DataFrame {name} is empty!")
    
    null_counts = [(column_name, df.filter(col(column_name).isNull()).count()) 
                   for column_name in df.columns if df.filter(col(column_name).isNull()).count() > 0]
    
    if null_counts:
        logger.warning(f"Null counts in {name}: {dict(null_counts)}")
    
    logger.info(f"✓ {name} loaded with {count:,} records")
    return count

In [18]:
spark = setup_spark()

print("Reading Parquet files from HDFS...")

# Read Parquet files (much more efficient than CSV
chartevents = spark.read.parquet(HDFS_PATH + "/data/chartevents.parquet")
d_items = spark.read.parquet(HDFS_PATH + "/data/d_items.parquet")
prescriptions = spark.read.parquet(HDFS_PATH + "/data/prescriptions.parquet")
icustays = spark.read.parquet(HDFS_PATH + "/data/icustays.parquet")

# Validate all datasets
validate_dataframe(chartevents, "chartevents")
validate_dataframe(d_items, "d_items")
validate_dataframe(prescriptions, "prescriptions")
validate_dataframe(icustays, "icustays")

print("✓ All Parquet files loaded successfully!")


Reading Parquet files from HDFS...


INFO:__main__:✓ chartevents loaded with 3,550,000 records
INFO:__main__:✓ d_items loaded with 4,095 records
INFO:__main__:✓ prescriptions loaded with 7,900,000 records
INFO:__main__:✓ icustays loaded with 94,458 records


✓ All Parquet files loaded successfully!


In [19]:
# Kết hợp chartevents với d_items để lấy tên chỉ số
chartevents_with_names = chartevents.join(
    d_items.hint("broadcast"), "itemid", "left"
).repartition(200, "stay_id")

# Lọc các chỉ số quan tâm
target_items = [
    220045,                # Heart Rate
    220050, 220051,        # Blood Pressure
    220210,                # Respiratory Rate
    220277,                # Oxygen Saturation
    223762,                # Temperature
]

filtered_charts = chartevents_with_names.filter(
    col("itemid").isin(target_items)
).filter(
    col("stay_id").isNotNull() & 
    col("valuenum").isNotNull()
)

# Xử lý giá trị số và lọc outlier cơ bản
filtered_charts = filtered_charts.withColumn(
    "valuenum_double", 
    col("valuenum").cast("double")
).filter(
    (col("valuenum_double") > 0) & 
    (col("valuenum_double") < 1000)  # Filter extreme outliers
)

abnormal_charts = filtered_charts.withColumn(
    "is_abnormal",
    when(
        (col("itemid") == 220045) & ((col("valuenum_double") < 60) | (col("valuenum_double") > 100)), 1
    ).when(
        (col("itemid") == 220050) & ((col("valuenum_double") < 90) | (col("valuenum_double") > 140)), 1  # Systolic
    ).when(
        (col("itemid") == 220051) & ((col("valuenum_double") < 60) | (col("valuenum_double") > 90)), 1   # Diastolic
    ).when(
        (col("itemid") == 220277) & (col("valuenum_double") < 90), 1
    ).when(
        (col("itemid") == 220210) & ((col("valuenum_double") < 12) | (col("valuenum_double") > 20)), 1
    ).when(
        (col("itemid") == 223762) & ((col("valuenum_double") < 36) | (col("valuenum_double") > 37.8)), 1
    ).otherwise(0)
)

validate_dataframe(abnormal_charts, "abnormal_charts")

NameError: name 'abnormal_charts' is not defined

In [None]:
# Kết hợp prescriptions với icustays với xử lý thời gian
icu_prescriptions = prescriptions.join(
    broadcast(icustays), ["subject_id", "hadm_id"], "inner"
).filter(
    col("stay_id").isNotNull() &
    col("drug").isNotNull()
).withColumn(
    "drug_hour", hour(col("starttime"))
).repartition(200, "stay_id")

validate_dataframe(icu_prescriptions, "icu_prescriptions")


25/11/03 11:20:39 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/11/03 11:21:09 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_1807_181 in memory.
25/11/03 11:21:09 WARN MemoryStore: Not enough space to cache rdd_1807_181 in memory! (computed 504.0 B so far)
25/11/03 11:21:09 WARN MemoryStore: Not enough space to cache rdd_1807_180 in memory! (computed 1160.6 KiB so far)
25/11/03 11:21:09 WARN BlockManager: Persisting block rdd_1807_181 to disk instead.
25/11/03 11:21:09 WARN BlockManager: Persisting block rdd_1807_180 to disk instead.
25/11/03 11:21:09 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_1807_180 in memory.
25/11/03 11:21:09 WARN MemoryStore: Not enough space to cache rdd_1807_177 in memory! (computed 2.6 MiB so far)
25/11/03 11:21:09 WARN BlockMa

4208526

In [None]:
# Gom nhóm các chỉ số bất thường theo stay_id
abnormal_summary = abnormal_charts.filter(col("is_abnormal") == 1)\
    .groupBy("stay_id")\
    .agg(
        collect_list(struct("itemid", "valuenum_double", "charttime")).alias("abnormal_signals"),
        count("itemid").alias("total_abnormal_count"),
        countDistinct("itemid").alias("unique_abnormal_types"),
        avg("valuenum_double").alias("avg_abnormal_value"),
        min("valuenum_double").alias("min_abnormal_value"),
        max("valuenum_double").alias("max_abnormal_value")
    )

# Lấy danh sách thuốc cho từng ICU stay với thông tin phong phú hơn
icu_meds = icu_prescriptions.groupBy("stay_id")\
    .agg(
        collect_list("drug").alias("prescribed_drugs"),
        count("drug").alias("total_prescriptions"),
        countDistinct("drug").alias("unique_drugs"),
        collect_set("drug_hour").alias("prescription_hours")
    )

# Tạo dataset huấn luyện với validation
dataset = abnormal_summary.join(icu_meds, "stay_id", "inner")

NameError: name 'countDistinct' is not defined

In [None]:
initial_count = validate_dataframe(dataset, "training_dataset")

# Feature Engineering nâng cao
print("Starting advanced feature engineering...")

# 1. Tạo binary features cho từng loại bất thường cụ thể
feature_columns = []
abnormal_types = [
    (220045, "hr_abnormal"),
    (220050, "bp_sys_abnormal"), 
    (220051, "bp_dia_abnormal"),
    (220277, "spo2_abnormal"),
    (220210, "rr_abnormal"),
    (223762, "temp_abnormal")
]

features = dataset
for item_id, col_name in abnormal_types:
    features = features.withColumn(
        col_name, 
        expr(f"exists(abnormal_signals, x -> x.itemid = {item_id})::int")
    )
    feature_columns.append(col_name)

# 2. Tạo composite features
features = features.withColumn(
    "bp_abnormal",
    expr("(bp_sys_abnormal = 1 OR bp_dia_abnormal = 1)::int")
)
feature_columns.append("bp_abnormal")

# 3. Thêm numerical features
features = features.withColumn(
    "abnormal_count_ratio",
    col("total_abnormal_count") / col("unique_abnormal_types")
)
feature_columns.extend(["total_abnormal_count", "unique_abnormal_types", "abnormal_count_ratio"])

# 4. Xử lý drugs với CountVectorizer thông minh
print("Processing drug prescriptions...")

# Lấy top drugs với filter theo tần suất
drug_stats = prescriptions.filter(col("drug").isNotNull())\
    .groupBy("drug")\
    .agg(
        count("*").alias("drug_count"),
        countDistinct("stay_id").alias("unique_patients")
    ).filter(
        (col("drug_count") >= 10) &  # Minimum frequency
        (col("unique_patients") >= 5)  # Minimum unique patients
    ).orderBy(col("drug_count").desc())

top_drugs = [row.drug for row in drug_stats.limit(50).collect()]
logger.info(f"Selected {len(top_drugs)} drugs for modeling")

# Tạo drug list cho CountVectorizer
drugs_for_modeling = features.select(
    "stay_id", 
    expr("filter(prescribed_drugs, x -> x IS NOT NULL)").alias("drugs_list")
).filter(
    size(col("drugs_list")) > 0
)

# Sử dụng CountVectorizer để tạo drug features
drug_vectorizer = CountVectorizer(
    inputCol="drugs_list", 
    outputCol="drug_features",
    vocabSize=30,  # Top 30 drugs
    minDF=5.0      # Minimum document frequency
)

# 5. Tạo features vector cho vital signs
feature_assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="clinical_features"
)

# 6. Pipeline hoàn chỉnh
final_pipeline = Pipeline(stages=[
    feature_assembler,
    drug_vectorizer
])

# Fit và transform dữ liệu
pipeline_model = final_pipeline.fit(features)
processed_data = pipeline_model.transform(features)

final_count = validate_dataframe(processed_data, "processed_data")

print("✓ Feature engineering completed successfully!")

                                                                                

In [None]:
# Phân tích dữ liệu chi tiết
print("\n" + "="*50)
print("DATA ANALYSIS REPORT")
print("="*50)

# Phân phối các features bất thường
print("\n1. Phân phối các features bất thường:")
for col_name in ["hr_abnormal", "bp_abnormal", "spo2_abnormal", "rr_abnormal", "temp_abnormal"]:
    if col_name in processed_data.columns:
        print(f"\n{col_name}:")
        processed_data.groupBy(col_name).count().orderBy(col_name).show()

# Phân tích drugs
print("\n2. Top drugs được sử dụng:")
drug_vocab = pipeline_model.stages[1].vocabulary
for i, drug in enumerate(drug_vocab[:15]):
    count = processed_data.filter(
        array_contains(col("drugs_list"), drug)
    ).count()
    percentage = (count / final_count) * 100
    print(f"  {i+1:2d}. {drug:<30} {count:>5} patients ({percentage:5.1f}%)")

# Phân tích tương quan features
print("\n3. Tổng quan dataset:")
print(f"  - Tổng số ICU stays: {final_count:,}")
print(f"  - Số lượng features lâm sàng: {len(feature_columns)}")
print(f"  - Số lượng drugs trong model: {len(drug_vocab)}")
print(f"  - Tỷ lệ data retention: {(final_count/initial_count)*100:.1f}%")

# Phân tích đa nhiệm vụ (multi-label)
print("\n4. Phân tích đa nhiệm vụ:")
avg_drugs_per_patient = processed_data.select(
    avg(size(col("drugs_list"))).alias("avg_drugs")
).collect()[0]["avg_drugs"]
print(f"  - Số thuốc trung bình mỗi bệnh nhân: {avg_drugs_per_patient:.1f}")

patients_with_multiple_abnormalities = processed_data.filter(
    col("unique_abnormal_types") >= 2
).count()
print(f"  - Bệnh nhân có ≥2 loại bất thường: {patients_with_multiple_abnormalities} ({patients_with_multiple_abnormalities/final_count*100:.1f}%)")

# Hiển thị schema và sample data
print("\n5. Data Schema:")
processed_data.printSchema()

print("\n6. Sample data (clinical features + drug features):")
sample_data = processed_data.select(
    "stay_id", 
    "clinical_features", 
    "drug_features",
    "total_abnormal_count",
    "unique_abnormal_types",
    slice(col("drugs_list"), 1, 3).alias("sample_drugs")
).limit(10)

sample_data.show(truncate=False)

# Chuẩn bị cho modeling
print("\n7. Chuẩn bị cho Modeling:")
print("   ✓ Features clinical: Vector với các chỉ số bất thường")
print("   ✓ Features drugs: Vector đa nhãn với các thuốc được kê")
print("   ✓ Dataset sẵn sàng cho các mô hình recommendation")

print("\n" + "="*50)
print("PREPROCESSING COMPLETED SUCCESSFULLY!")
print("="*50)

In [None]:
# Stop Spark session
spark.stop()
print("✓ Spark session stopped")
print("=== NOTEBOOK EXECUTION COMPLETED ===")

✓ Spark session stopped
=== NOTEBOOK EXECUTION COMPLETED ===
