In [1]:
import os
import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession used by the rest of the notebook.
# The master URL is taken from the SPARK_MASTER environment variable and
# defaults to spark://spark-master:7077 when that variable is not set.
master_url = os.environ.get("SPARK_MASTER", "spark://spark-master:7077")

spark = (
    SparkSession.builder
    .appName("SparkDataScienceSample")
    .master(master_url)
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

# Report the Spark version and where the Spark UI can be reached
print(f"Spark version: {spark.version}")
print(f"Spark UI available at: {spark.sparkContext.uiWebUrl}")

Spark version: 4.0.1
Spark UI available at: http://d06357a3e3b7:4040


In [2]:
# Create a pandas DataFrame with parametric categorical features
import pandas as pd
import numpy as np

# Dataset-generation knobs: change these to produce a different synthetic dataset
seed = 42  # seed for the random generator, so reruns give the same data
n_rows = 10_000  # how many rows to generate
num_cat_features = 11  # how many categorical feature columns to generate
min_unique = 3  # smallest number of distinct categories a feature may have
max_unique = 7  # largest number of distinct categories a feature may have

# Seeded NumPy generator used for the sampling done later in this cell
rng = np.random.default_rng(seed)

# Helper producing numbered category labels: <prefix>V1, <prefix>V2, ...
def make_labels(prefix, k):
    """Return the list of ``k`` labels ``f"{prefix}V1"`` .. ``f"{prefix}V{k}"``.

    Args:
        prefix: text placed in front of every ``V<n>`` suffix.
        k: number of labels to produce; numbering starts at 1.

    Returns:
        A list of ``k`` strings in ascending numeric order (empty when ``k`` < 1).
    """
    generated = []
    for number in range(1, k + 1):
        generated.append(f"{prefix}V{number}")
    return generated

# Generate one categorical column per feature.
# For each feature the generator is consulted twice, in this order: first to pick how many
# categories the feature has, then to sample a category for every row.
cols = {}
for feature_no in range(1, num_cat_features + 1):
    n_categories = rng.integers(min_unique, max_unique + 1)
    feature_labels = make_labels(f"c{feature_no}_", n_categories)
    sampled_values = rng.choice(feature_labels, size=n_rows)
    cols[f"cat_{feature_no}"] = pd.Categorical(sampled_values, categories=feature_labels)

# Assemble the generated columns into a single DataFrame
df = pd.DataFrame(cols)

# Preview: shape, first rows, then the per-category counts of every feature
print(df.shape)
display(df.head())
print('\nValue counts for each feature:')
for feature_name, feature_values in df.items():
    print(f"\n{feature_name}:")
    print(feature_values.value_counts())


(10000, 11)


Unnamed: 0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,cat_11
0,c1_V3,c2_V3,c3_V3,c4_V1,c5_V1,c6_V3,c7_V1,c8_V6,c9_V4,c10_V1,c11_V3
1,c1_V2,c2_V3,c3_V3,c4_V1,c5_V1,c6_V4,c7_V2,c8_V3,c9_V1,c10_V2,c11_V1
2,c1_V2,c2_V2,c3_V1,c4_V2,c5_V2,c6_V3,c7_V1,c8_V4,c9_V3,c10_V2,c11_V3
3,c1_V2,c2_V4,c3_V2,c4_V3,c5_V6,c6_V4,c7_V2,c8_V6,c9_V2,c10_V3,c11_V3
4,c1_V3,c2_V3,c3_V1,c4_V2,c5_V5,c6_V2,c7_V2,c8_V5,c9_V2,c10_V1,c11_V1



Value counts for each feature:

cat_1:
cat_1
c1_V1    3398
c1_V2    3334
c1_V3    3268
Name: count, dtype: int64

cat_2:
cat_2
c2_V3    2567
c2_V1    2522
c2_V2    2488
c2_V4    2423
Name: count, dtype: int64

cat_3:
cat_3
c3_V4    2553
c3_V3    2500
c3_V1    2485
c3_V2    2462
Name: count, dtype: int64

cat_4:
cat_4
c4_V1    3388
c4_V3    3353
c4_V2    3259
Name: count, dtype: int64

cat_5:
cat_5
c5_V6    1473
c5_V2    1448
c5_V4    1446
c5_V1    1432
c5_V5    1409
c5_V7    1409
c5_V3    1383
Name: count, dtype: int64

cat_6:
cat_6
c6_V3    1483
c6_V4    1450
c6_V2    1430
c6_V6    1428
c6_V5    1427
c6_V7    1415
c6_V1    1367
Name: count, dtype: int64

cat_7:
cat_7
c7_V1    3378
c7_V3    3317
c7_V2    3305
Name: count, dtype: int64

cat_8:
cat_8
c8_V6    1719
c8_V4    1696
c8_V1    1660
c8_V5    1659
c8_V3    1658
c8_V2    1608
Name: count, dtype: int64

cat_9:
cat_9
c9_V3    2519
c9_V4    2513
c9_V2    2494
c9_V1    2474
Name: count, dtype: int64

cat_10:
cat_10
c10_V3    2527
c10

In [3]:
%%time
# Convert pandas categorical columns to string before creating Spark DataFrame
# This avoids Spark inferring pandas Categorical dtype incorrectly.
# Categorical columns are detected with an isinstance check against pd.CategoricalDtype:
# pd.api.types.is_categorical_dtype is deprecated since pandas 2.1 and emits a warning.
cat_cols = [c for c in df.columns if isinstance(df[c].dtype, pd.CategoricalDtype)]
if cat_cols:
    # NOTE: this rewrites `df` in place, so these columns are no longer categorical afterwards
    df[cat_cols] = df[cat_cols].astype(str)

# Create Spark DataFrame from pandas DataFrame
# `spark` is expected to be defined in a previous cell (SparkSession)
spark_df = spark.createDataFrame(df)

# Show schema and first few rows
print("Spark DataFrame schema:")
spark_df.printSchema()
print("\nPreview:")
spark_df.show(10, truncate=False)

# Optional: cache and count to materialize
spark_df = spark_df.cache()
print(f"Row count: {spark_df.count()}")

# Expose for downstream cells
spark_df




Spark DataFrame schema:
root
 |-- cat_1: string (nullable = true)
 |-- cat_2: string (nullable = true)
 |-- cat_3: string (nullable = true)
 |-- cat_4: string (nullable = true)
 |-- cat_5: string (nullable = true)
 |-- cat_6: string (nullable = true)
 |-- cat_7: string (nullable = true)
 |-- cat_8: string (nullable = true)
 |-- cat_9: string (nullable = true)
 |-- cat_10: string (nullable = true)
 |-- cat_11: string (nullable = true)


Preview:
+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+
|cat_1|cat_2|cat_3|cat_4|cat_5|cat_6|cat_7|cat_8|cat_9|cat_10|cat_11|
+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+
|c1_V3|c2_V3|c3_V3|c4_V1|c5_V1|c6_V3|c7_V1|c8_V6|c9_V4|c10_V1|c11_V3|
|c1_V2|c2_V3|c3_V3|c4_V1|c5_V1|c6_V4|c7_V2|c8_V3|c9_V1|c10_V2|c11_V1|
|c1_V2|c2_V2|c3_V1|c4_V2|c5_V2|c6_V3|c7_V1|c8_V4|c9_V3|c10_V2|c11_V3|
|c1_V2|c2_V4|c3_V2|c4_V3|c5_V6|c6_V4|c7_V2|c8_V6|c9_V2|c10_V3|c11_V3|
|c1_V3|c2_V3|c3_V1|c4_V2|c5_V5|c6_V2|c7_V2|c8_V5|c9_V2|c10_V1

DataFrame[cat_1: string, cat_2: string, cat_3: string, cat_4: string, cat_5: string, cat_6: string, cat_7: string, cat_8: string, cat_9: string, cat_10: string, cat_11: string]

In [4]:
%%time
# One-hot encode categorical features using Spark ML and expand OHE vectors into named columns
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Every string-typed column of spark_df is treated as a categorical feature
cat_cols = []
for col_name, col_type in spark_df.dtypes:
    if col_type == 'string':
        cat_cols.append(col_name)
print('Categorical columns detected:', cat_cols)

# For each categorical column build a StringIndexer (<col> -> <col>_idx) and a
# OneHotEncoder (<col>_idx -> <col>_ohe). dropLast=False keeps a vector slot for every category.
indexers = []
encoders = []
ohe_cols = []
for col_name in cat_cols:
    idx_name = f"{col_name}_idx"
    ohe_name = f"{col_name}_ohe"
    indexers.append(StringIndexer(inputCol=col_name, outputCol=idx_name, handleInvalid='keep'))
    encoders.append(OneHotEncoder(inputCol=idx_name, outputCol=ohe_name, dropLast=False))
    ohe_cols.append(ohe_name)

# Concatenate all OHE vectors into a single 'features' vector (optional)
assembler = VectorAssembler(inputCols=ohe_cols, outputCol='features')

# Stage order matters: all indexers first, then all encoders, then the assembler
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])
model = pipeline.fit(spark_df)

# Apply the fitted pipeline to the same data
spark_df_ohe = model.transform(spark_df)

# Expand each OHE vector into individual named columns using the original category labels.
# Use Spark-native vector_to_array when available to avoid Python pickling issues.
# Only a failed import triggers the fallback; any other error is allowed to surface.
try:
    from pyspark.ml.functions import vector_to_array
    native_arr = True
except ImportError:
    native_arr = False

if native_arr:
    to_array = vector_to_array
else:
    # fallback UDF that returns Python floats (safe to pickle); defined once, not per column
    from pyspark.sql.functions import udf
    from pyspark.sql.types import ArrayType, DoubleType

    def vec_to_pylist(v):
        """Convert an ML vector to a list of Python floats; None is passed through."""
        if v is None:
            return None
        return [float(x) for x in v.toArray()]

    vec_to_pylist_udf = udf(vec_to_pylist, ArrayType(DoubleType()))
    to_array = vec_to_pylist_udf

# Indexer models are the first len(indexers) stages in the fitted pipeline
indexer_models = model.stages[: len(indexers)]

# Plan every output column up front. Names look like '<feature>_<category>', with spaces in
# the category replaced by underscores. `expanded_cols` records the names actually created,
# so the preview below selects exactly those columns.
arr_cols = [f"{c}_ohe_arr" for c in cat_cols]
expanded_cols = []
item_specs = []  # (temporary array column, position in the array, output column name)
for c, arr_col, indexer_model in zip(cat_cols, arr_cols, indexer_models):
    for j, label in enumerate(indexer_model.labels):
        safe_label = str(label).replace(' ', '_')
        new_col = f"{c}_{safe_label}"
        expanded_cols.append(new_col)
        item_specs.append((arr_col, j, new_col))

# Refuse to continue when two outputs (or an output and an existing column) share a name;
# otherwise one column would silently shadow another.
seen_names = set(spark_df_ohe.columns)
clashing = []
for name in expanded_cols:
    if name in seen_names:
        clashing.append(name)
    seen_names.add(name)
if clashing:
    raise ValueError(f"Expanded OHE column names are not unique: {sorted(set(clashing))}")

# Two projections instead of one withColumn call per category: calling withColumn repeatedly
# in a loop grows the query plan with every call.
with_arrays = spark_df_ohe.select(
    "*",
    *[to_array(f"{c}_ohe").alias(arr_col) for c, arr_col in zip(cat_cols, arr_cols)],
)
spark_df_ohe = with_arrays.select(
    "*",
    *[
        with_arrays[arr_col].getItem(j).cast('double').alias(new_col)
        for arr_col, j, new_col in item_specs
    ],
).drop(*arr_cols)  # the temporary array columns are not part of the result

print('Schema after expanding OHE columns:')
spark_df_ohe.printSchema()
print('\nPreview of expanded OHE columns:')
# show the original categorical cols and the newly created expanded cols (limit 10)
spark_df_ohe.select(*cat_cols, *expanded_cols).show(10, truncate=False)

# Cache and count to materialize
spark_df_ohe = spark_df_ohe.cache()
print(f"Row count after transform and expansion: {spark_df_ohe.count()}")

# Expose for downstream cells
spark_df_ohe

Categorical columns detected: ['cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7', 'cat_8', 'cat_9', 'cat_10', 'cat_11']
Schema after expanding OHE columns:
root
 |-- cat_1: string (nullable = true)
 |-- cat_2: string (nullable = true)
 |-- cat_3: string (nullable = true)
 |-- cat_4: string (nullable = true)
 |-- cat_5: string (nullable = true)
 |-- cat_6: string (nullable = true)
 |-- cat_7: string (nullable = true)
 |-- cat_8: string (nullable = true)
 |-- cat_9: string (nullable = true)
 |-- cat_10: string (nullable = true)
 |-- cat_11: string (nullable = true)
 |-- cat_1_idx: double (nullable = false)
 |-- cat_2_idx: double (nullable = false)
 |-- cat_3_idx: double (nullable = false)
 |-- cat_4_idx: double (nullable = false)
 |-- cat_5_idx: double (nullable = false)
 |-- cat_6_idx: double (nullable = false)
 |-- cat_7_idx: double (nullable = false)
 |-- cat_8_idx: double (nullable = false)
 |-- cat_9_idx: double (nullable = false)
 |-- cat_10_idx: double (nullable = fals

DataFrame[cat_1: string, cat_2: string, cat_3: string, cat_4: string, cat_5: string, cat_6: string, cat_7: string, cat_8: string, cat_9: string, cat_10: string, cat_11: string, cat_1_idx: double, cat_2_idx: double, cat_3_idx: double, cat_4_idx: double, cat_5_idx: double, cat_6_idx: double, cat_7_idx: double, cat_8_idx: double, cat_9_idx: double, cat_10_idx: double, cat_11_idx: double, cat_1_ohe: vector, cat_2_ohe: vector, cat_3_ohe: vector, cat_4_ohe: vector, cat_5_ohe: vector, cat_6_ohe: vector, cat_7_ohe: vector, cat_8_ohe: vector, cat_9_ohe: vector, cat_10_ohe: vector, cat_11_ohe: vector, features: vector, cat_1_c1_V1: double, cat_1_c1_V2: double, cat_1_c1_V3: double, cat_2_c2_V3: double, cat_2_c2_V1: double, cat_2_c2_V2: double, cat_2_c2_V4: double, cat_3_c3_V4: double, cat_3_c3_V3: double, cat_3_c3_V1: double, cat_3_c3_V2: double, cat_4_c4_V1: double, cat_4_c4_V3: double, cat_4_c4_V2: double, cat_5_c5_V6: double, cat_5_c5_V2: double, cat_5_c5_V4: double, cat_5_c5_V1: double, cat_5

In [5]:
%%time
# Convert the Spark DataFrame with OHE features back to pandas
# WARNING: toPandas() collects data to the driver. Make sure the dataset fits in driver memory.

# Prefer Spark-native conversion if available (avoids Python pickling issues).
# Only a failed import selects the fallback; any other error is allowed to surface.
try:
    from pyspark.ml.functions import vector_to_array
    use_native = True
except ImportError:
    use_native = False

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType

# Pick the vector -> array converter once; the conversion steps below are shared by both paths.
if use_native:
    print('Using pyspark.ml.functions.vector_to_array (native)')
    features_to_array = vector_to_array
else:
    print('vector_to_array not available — falling back to safe UDF')

    def vector_to_pylist(v):
        """Convert an ML vector to a list of Python floats; None is passed through."""
        if v is None:
            return None
        arr = v.toArray()
        return [float(x) for x in arr]

    vector_to_pylist_udf = udf(vector_to_pylist, ArrayType(DoubleType()))
    features_to_array = vector_to_pylist_udf

# Keep every column except the intermediate '*_idx' / '*_ohe' ones. 'features' already passes
# this filter, so it is not listed a second time.
keep_cols = [c for c in spark_df_ohe.columns if not c.endswith(('_idx', '_ohe'))]
df_for_pandas = spark_df_ohe.select(*keep_cols)
df_for_pandas = df_for_pandas.withColumn('features_array', features_to_array('features'))
# The vector column is dropped; only its array form is collected to the driver
pandas_df = df_for_pandas.drop('features').toPandas()

# Expand features_array into separate DataFrame columns (f_0, f_1, ...) if desired
if 'features_array' in pandas_df.columns:
    features_df = pd.DataFrame(pandas_df['features_array'].tolist(), index=pandas_df.index).add_prefix('f_')
    pandas_df = pd.concat([pandas_df.drop(columns=['features_array']), features_df], axis=1)

print(f"Converted to pandas DataFrame with shape: {pandas_df.shape}")

# Expose for downstream use
pandas_df


Using pyspark.ml.functions.vector_to_array (native)
Converted to pandas DataFrame with shape: (10000, 122)
CPU times: user 100 ms, sys: 6.93 ms, total: 107 ms
Wall time: 353 ms
Converted to pandas DataFrame with shape: (10000, 122)
CPU times: user 100 ms, sys: 6.93 ms, total: 107 ms
Wall time: 353 ms


Unnamed: 0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,...,f_51,f_52,f_53,f_54,f_55,f_56,f_57,f_58,f_59,f_60
0,c1_V3,c2_V3,c3_V3,c4_V1,c5_V1,c6_V3,c7_V1,c8_V6,c9_V4,c10_V1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,c1_V2,c2_V3,c3_V3,c4_V1,c5_V1,c6_V4,c7_V2,c8_V3,c9_V1,c10_V2,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,c1_V2,c2_V2,c3_V1,c4_V2,c5_V2,c6_V3,c7_V1,c8_V4,c9_V3,c10_V2,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,c1_V2,c2_V4,c3_V2,c4_V3,c5_V6,c6_V4,c7_V2,c8_V6,c9_V2,c10_V3,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,c1_V3,c2_V3,c3_V1,c4_V2,c5_V5,c6_V2,c7_V2,c8_V5,c9_V2,c10_V1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,c1_V3,c2_V2,c3_V3,c4_V1,c5_V1,c6_V3,c7_V1,c8_V1,c9_V1,c10_V3,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9996,c1_V2,c2_V2,c3_V4,c4_V3,c5_V7,c6_V6,c7_V1,c8_V2,c9_V2,c10_V4,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9997,c1_V2,c2_V2,c3_V1,c4_V3,c5_V4,c6_V3,c7_V2,c8_V1,c9_V2,c10_V1,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9998,c1_V2,c2_V1,c3_V2,c4_V1,c5_V7,c6_V2,c7_V1,c8_V6,c9_V1,c10_V1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Stop the Spark session held in `spark`.
# NOTE(review): cells that use `spark` or DataFrames derived from it will presumably need the
# session-creation cell re-run after this point — confirm before re-executing earlier cells.
spark.stop()