In [1]:
import os
import sys
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

# Build the Spark session (or reuse one that already exists in this kernel).
# The master URL comes from the SPARK_MASTER environment variable when set,
# otherwise it falls back to the "spark-master" standalone endpoint.
spark = (
    SparkSession.builder
    .appName("SparkDataScienceSample")
    .master(os.environ.get("SPARK_MASTER", "spark://spark-master:7077"))
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

# Report which Spark build we are talking to and where its web UI lives.
print(f"Spark version: {spark.version}")
print(f"Spark UI available at: {spark.sparkContext.uiWebUrl}")

Spark version: 4.0.1
Spark UI available at: http://d45005c77c21:4040


In [2]:
# Sample data: three rows with an integer id and four integer feature columns.
columns = ["id", "col1", "col2", "col3", "col4"]
data = [
    (1, 10, 20, 30, 40),
    (2, 15, 25, 35, 45),
    (3, 5, 30, 50, 60),
]

# Add a constant "intercept" column of 1 so an intercept weight can be applied
# the same way as any other column weight.
df = spark.createDataFrame(data, columns).withColumn("intercept", lit(1))

df.show()

+---+----+----+----+----+---------+
| id|col1|col2|col3|col4|intercept|
+---+----+----+----+----+---------+
|  1|  10|  20|  30|  40|        1|
|  2|  15|  25|  35|  45|        1|
|  3|   5|  30|  50|  60|        1|
+---+----+----+----+----+---------+



In [3]:
# Coefficient sets: each dictionary maps a column name to the weight that
# column receives in the linear combination.
coeff1 = dict(col1=2, col2=3, intercept=5)
coeff2 = dict(col2=1, col3=2, intercept=5)
coeff3 = dict(col1=1, col3=1, col4=1, intercept=5)
# this should error out: "col5" is not among the sample column names
coeff4 = dict(col1=1, col5=10, intercept=5)

# All coefficient sets, in the order they will be applied.
coefficients_list = [coeff1, coeff2, coeff3, coeff4]

In [4]:
for i, coeff in enumerate(coefficients_list):
    # Derive the expectation from the DataFrame's columns rather than from the
    # set's position in the list, so the message stays correct if coefficient
    # sets are reordered or new ones are appended.
    resolvable = set(coeff) <= set(df.columns)
    print(f"\nApplying coefficients set {i}: {coeff}, should work: {'YES' if resolvable else 'NO'}")

    # Weighted sum of the listed columns. The generator uses its own names
    # (column_name / weight) so it no longer shadows the outer `coeff` dict.
    # sum() starts from the integer 0, so the expression carries a leading "+ 0".
    linear_combination_expr = sum(col(column_name) * weight for column_name, weight in coeff.items())

    # Deliberately not wrapped in try/except: a set that names a column missing
    # from df raises here, and that failure is what this cell demonstrates.
    df.withColumn(f"score_{i}", linear_combination_expr).show()
    # ensure any buffered stdout is written immediately
    sys.stdout.flush()



Applying coefficients set 0: {'col1': 2, 'col2': 3, 'intercept': 5}, should work: YES
+---+----+----+----+----+---------+-------+
| id|col1|col2|col3|col4|intercept|score_0|
+---+----+----+----+----+---------+-------+
|  1|  10|  20|  30|  40|        1|     85|
|  2|  15|  25|  35|  45|        1|    110|
|  3|   5|  30|  50|  60|        1|    105|
+---+----+----+----+----+---------+-------+


Applying coefficients set 1: {'col2': 1, 'col3': 2, 'intercept': 5}, should work: YES
+---+----+----+----+----+---------+-------+
| id|col1|col2|col3|col4|intercept|score_1|
+---+----+----+----+----+---------+-------+
|  1|  10|  20|  30|  40|        1|     85|
|  2|  15|  25|  35|  45|        1|    100|
|  3|   5|  30|  50|  60|        1|    135|
+---+----+----+----+----+---------+-------+


Applying coefficients set 2: {'col1': 1, 'col3': 1, 'col4': 1, 'intercept': 5}, should work: YES
+---+----+----+----+----+---------+-------+
| id|col1|col2|col3|col4|intercept|score_2|
+---+----+----+----+--

{"ts": "2025-10-23 09:17:15.150", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `col5` cannot be resolved. Did you mean one of the following? [`col1`, `col2`, `col3`, `col4`, `id`]. SQLSTATE: 42703", "context": {"file": "line 3 in cell [4]", "line": "", "fragment": "col", "errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o54.withColumn.\n: org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `col5` cannot be resolved. Did you mean one of the following? [`col1`, `col2`, `col3`, `col4`, `id`]. SQLSTATE: 42703;\n'Project [id#0L, col1#1L, col2#2L, col3#3L, col4#4L, intercept#5, '`+`('`+`(((col1#1L * cast(1 as bigint)) + cast(0 as bigint)), '`*`('col5, 10)), (intercept#5 * 5)) AS score_3#94]\n+- Project [id#0L, col1#1L, col2#2L, 

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `col5` cannot be resolved. Did you mean one of the following? [`col1`, `col2`, `col3`, `col4`, `id`]. SQLSTATE: 42703;
'Project [id#0L, col1#1L, col2#2L, col3#3L, col4#4L, intercept#5, '`+`('`+`(((col1#1L * cast(1 as bigint)) + cast(0 as bigint)), '`*`('col5, 10)), (intercept#5 * 5)) AS score_3#94]
+- Project [id#0L, col1#1L, col2#2L, col3#3L, col4#4L, 1 AS intercept#5]
   +- LogicalRDD [id#0L, col1#1L, col2#2L, col3#3L, col4#4L], false


In [5]:
# Stop the Spark session created in the first cell now that the demo is done.
spark.stop()