In [1]:
import os
import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession used by every later cell.
# The master URL comes from the SPARK_MASTER environment variable and
# defaults to spark://spark-master:7077 when that variable is not set.
master_url = os.environ.get("SPARK_MASTER", "spark://spark-master:7077")
session_builder = SparkSession.builder.appName("SparkDataScienceSample").master(master_url)
session_builder = session_builder.config("spark.executor.memory", "1g")
spark = session_builder.getOrCreate()

# Report the running Spark version and where its web UI can be reached
for label, value in (
    ("Spark version", spark.version),
    ("Spark UI available at", spark.sparkContext.uiWebUrl),
):
    print(f"{label}: {value}")

Spark version: 4.0.1
Spark UI available at: http://d06357a3e3b7:4040


In [2]:
# Create a pandas DataFrame with categorical features
import pandas as pd
import numpy as np

# Number of rows, and a seeded generator so the random draws are reproducible
n = 1000
rng = np.random.default_rng(42)

# Label pools for each feature: A1..A3, B1..B5, C1..C7
cats1 = ["A" + str(k) for k in range(1, 4)]   # 3 categories
cats2 = ["B" + str(k) for k in range(1, 6)]   # 5 categories
cats3 = ["C" + str(k) for k in range(1, 8)]   # 7 categories

# Draw one label per row for every pool, in pool order (cats1, cats2, cats3)
col1, col2, col3 = (rng.choice(pool, size=n) for pool in (cats1, cats2, cats3))

# Wrap the draws as pandas Categoricals with their full category lists,
# then assemble the DataFrame from them
categorical_columns = {
    "feature_3_cat": pd.Categorical(col1, categories=cats1),
    "feature_5_cat": pd.Categorical(col2, categories=cats2),
    # "feature_7_cat": pd.Categorical(col3, categories=cats3),
}
df = pd.DataFrame(categorical_columns)

# Shape, first rows, then per-column label frequencies
print(df.shape)
display(df.head())
print('\nValue counts for each feature:')
for column_name, column_values in df.items():
    print(f"\n{column_name}:")
    print(column_values.value_counts())


(1000, 2)


Unnamed: 0,feature_3_cat,feature_5_cat
0,A1,B5
1,A3,B4
2,A2,B3
3,A2,B5
4,A2,B1



Value counts for each feature:

feature_3_cat:
feature_3_cat
A3    336
A1    334
A2    330
Name: count, dtype: int64

feature_5_cat:
feature_5_cat
B1    206
B5    204
B3    203
B2    201
B4    186
Name: count, dtype: int64


In [3]:
# Convert pandas categorical columns to plain strings before creating the Spark
# DataFrame, so Spark receives string values rather than pandas Categorical data.
#
# The dtype check uses isinstance(..., pd.CategoricalDtype) because
# pd.api.types.is_categorical_dtype is deprecated in pandas and emits a
# warning each time it is called.
cat_cols = [c for c in df.columns if isinstance(df[c].dtype, pd.CategoricalDtype)]

# Build a new frame instead of assigning back into `df`: the frame shown by the
# earlier cell keeps its categorical dtypes, and re-running this cell always
# starts from the same input. With no categorical columns this is a plain copy.
df_str = df.astype({c: str for c in cat_cols})

# Create Spark DataFrame from the string-typed pandas DataFrame
# `spark` is expected to be defined in a previous cell (SparkSession)
spark_df = spark.createDataFrame(df_str)

# Show schema and first few rows
print("Spark DataFrame schema:")
spark_df.printSchema()
print("\nPreview:")
spark_df.show(10, truncate=False)

# Optional: cache and count to materialize
spark_df = spark_df.cache()
print(f"Row count: {spark_df.count()}")

# Expose for downstream cells
spark_df


  cat_cols = [c for c in df.columns if pd.api.types.is_categorical_dtype(df[c])]


Spark DataFrame schema:
root
 |-- feature_3_cat: string (nullable = true)
 |-- feature_5_cat: string (nullable = true)


Preview:
+-------------+-------------+
|feature_3_cat|feature_5_cat|
+-------------+-------------+
|A1           |B5           |
|A3           |B4           |
|A2           |B3           |
|A2           |B5           |
|A2           |B1           |
|A3           |B5           |
|A1           |B5           |
|A3           |B3           |
|A1           |B4           |
|A1           |B3           |
+-------------+-------------+
only showing top 10 rows
+-------------+-------------+
|feature_3_cat|feature_5_cat|
+-------------+-------------+
|A1           |B5           |
|A3           |B4           |
|A2           |B3           |
|A2           |B5           |
|A2           |B1           |
|A3           |B5           |
|A1           |B5           |
|A3           |B3           |
|A1           |B4           |
|A1           |B3           |
+-------------+-------------+
only 

DataFrame[feature_3_cat: string, feature_5_cat: string]

In [4]:
# One-hot encode categorical features using Spark ML
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Every string-typed column of spark_df is treated as a categorical feature
cat_cols = [name for name, dtype in spark_df.dtypes if dtype == 'string']
print('Categorical columns detected:', cat_cols)

# For each categorical column build a StringIndexer (<col> -> <col>_idx) and a
# OneHotEncoder (<col>_idx -> <col>_ohe), and remember the encoded column name
indexers, encoders, ohe_cols = [], [], []
for source_col in cat_cols:
    index_col = f"{source_col}_idx"
    encoded_col = f"{source_col}_ohe"
    indexers.append(StringIndexer(inputCol=source_col, outputCol=index_col, handleInvalid='keep'))
    encoders.append(OneHotEncoder(inputCol=index_col, outputCol=encoded_col))
    ohe_cols.append(encoded_col)

# Combine the per-column encoded vectors into one 'features' vector
assembler = VectorAssembler(inputCols=ohe_cols, outputCol='features')

# Stage order: all indexers, then all encoders, then the assembler
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])
model = pipeline.fit(spark_df)
spark_df_ohe = model.transform(spark_df)

print('Schema after OHE pipeline:')
spark_df_ohe.printSchema()
print('\nPreview of OHE features:')
preview_cols = [*cat_cols, *ohe_cols, 'features']
spark_df_ohe.select(*preview_cols).show(10, truncate=False)

# Cache, then count so the cached data is actually materialized
spark_df_ohe = spark_df_ohe.cache()
print(f"Row count after transform: {spark_df_ohe.count()}")

# Leave the transformed frame as the cell's displayed value
spark_df_ohe

Categorical columns detected: ['feature_3_cat', 'feature_5_cat']
Schema after OHE pipeline:
root
 |-- feature_3_cat: string (nullable = true)
 |-- feature_5_cat: string (nullable = true)
 |-- feature_3_cat_idx: double (nullable = false)
 |-- feature_5_cat_idx: double (nullable = false)
 |-- feature_3_cat_ohe: vector (nullable = true)
 |-- feature_5_cat_ohe: vector (nullable = true)
 |-- features: vector (nullable = true)


Preview of OHE features:
Schema after OHE pipeline:
root
 |-- feature_3_cat: string (nullable = true)
 |-- feature_5_cat: string (nullable = true)
 |-- feature_3_cat_idx: double (nullable = false)
 |-- feature_5_cat_idx: double (nullable = false)
 |-- feature_3_cat_ohe: vector (nullable = true)
 |-- feature_5_cat_ohe: vector (nullable = true)
 |-- features: vector (nullable = true)


Preview of OHE features:
+-------------+-------------+-----------------+-----------------+-------------------+
|feature_3_cat|feature_5_cat|feature_3_cat_ohe|feature_5_cat_ohe|features  

DataFrame[feature_3_cat: string, feature_5_cat: string, feature_3_cat_idx: double, feature_5_cat_idx: double, feature_3_cat_ohe: vector, feature_5_cat_ohe: vector, features: vector]

In [5]:
# Convert the Spark DataFrame with OHE features back to pandas
# WARNING: toPandas() collects data to the driver. Make sure the dataset fits in driver memory.

# Prefer Spark-native conversion if available (avoids Python pickling issues).
# Only an ImportError means "not available"; any other failure should surface.
try:
    from pyspark.ml.functions import vector_to_array
    use_native = True
except ImportError:
    use_native = False

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType

# Keep the columns that are neither index (*_idx) nor per-column encoded (*_ohe)
# columns, plus 'features' exactly once. 'features' itself passes the suffix
# filter, so it is excluded here and appended explicitly; otherwise the same
# column would be selected twice.
passthrough_cols = [
    c for c in spark_df_ohe.columns
    if c != 'features' and not c.endswith(('_idx', '_ohe'))
]
df_for_pandas = spark_df_ohe.select(*passthrough_cols, 'features')

if use_native:
    print('Using pyspark.ml.functions.vector_to_array (native)')
    to_array = vector_to_array
else:
    print('vector_to_array not available — falling back to safe UDF')

    def vector_to_pylist(v):
        """Return vector `v` as a list of Python floats, or None when `v` is None."""
        if v is None:
            return None
        return [float(x) for x in v.toArray()]

    vector_to_pylist_udf = udf(vector_to_pylist, ArrayType(DoubleType()))
    to_array = vector_to_pylist_udf

# Add the array form of 'features', drop the vector column, and collect to pandas
df_for_pandas = df_for_pandas.withColumn('features_array', to_array('features'))
pandas_df = df_for_pandas.drop('features').toPandas()

# Expand features_array into separate DataFrame columns (f_0, f_1, ...)
if 'features_array' in pandas_df.columns:
    features_df = pd.DataFrame(pandas_df['features_array'].tolist(), index=pandas_df.index).add_prefix('f_')
    pandas_df = pd.concat([pandas_df.drop(columns=['features_array']), features_df], axis=1)

print(f"Converted to pandas DataFrame with shape: {pandas_df.shape}")

# Expose for downstream use
pandas_df


Using pyspark.ml.functions.vector_to_array (native)
Converted to pandas DataFrame with shape: (1000, 10)
Converted to pandas DataFrame with shape: (1000, 10)


Unnamed: 0,feature_3_cat,feature_5_cat,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7
0,A1,B5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,A3,B4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,A2,B3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,A2,B5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,A2,B1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
995,A2,B1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
996,A2,B3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
997,A2,B5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
998,A3,B2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
# Stop the Spark session created in the first cell. Run this last: cells that
# use `spark` are not expected to work once the session has been stopped.
spark.stop()