In [9]:
import os
import pyspark
from pyspark.sql import SparkSession

# Obtain the Spark session used by the rest of the notebook.
# The master URL is read from the SPARK_MASTER environment variable and
# falls back to spark://spark-master:7077 when that variable is unset.
spark = (
    SparkSession.builder
    .appName("SparkDataScienceSample")
    .master(os.environ.get("SPARK_MASTER", "spark://spark-master:7077"))
    .config("spark.executor.memory", "1g")  # request 1g of memory per executor
    .getOrCreate()
)

# Report the Spark version in use and where its web UI can be reached.
print(f"Spark version: {spark.version}")
print(f"Spark UI available at: {spark.sparkContext.uiWebUrl}")

Spark version: 4.0.1
Spark UI available at: http://d45005c77c21:4040


In [10]:
from pyspark.sql.functions import col

In [11]:
# Small in-memory sample: three rows, each ordered as (id, col2, col1).
# Note that "col2" precedes "col1" in the column list.
columns = ["id", "col2", "col1"]
data = [
    (1, 10, 20),
    (2, 15, 25),
    (3, 5, 30),
]

# Turn the rows into a Spark DataFrame using the names above as the schema.
df = spark.createDataFrame(data, schema=columns)

df.show()

+---+----+----+
| id|col2|col1|
+---+----+----+
|  1|  10|  20|
|  2|  15|  25|
|  3|   5|  30|
+---+----+----+



In [12]:
# Weight applied to each input column, keyed by column name.
coeff = dict(col1=2, col2=3)

In [13]:
# Build a single column expression: the sum over every entry of `coeff` of
# (input column * its weight).
#
# The loop variables are deliberately named differently from the `coeff`
# mapping. Reusing the name `coeff` as the loop target only works because a
# generator expression has its own scope; rewritten as a plain `for` loop it
# would overwrite the mapping.
#
# Note: sum() starts from the integer 0, so the resulting expression is
# 0 + term_1 + term_2 + ...
linear_combination_expr = sum(
    col(column_name) * weight for column_name, weight in coeff.items()
)

In [14]:
# Attach the weighted sum to every row under the column name "score".
# The result is bound to df_new, so the name df still refers to the input frame.
df_new = df.withColumn(
    "score",
    linear_combination_expr,
)

# Print the frame including the new column.
df_new.show()

+---+----+----+-----+
| id|col2|col1|score|
+---+----+----+-----+
|  1|  10|  20|   70|
|  2|  15|  25|   95|
|  3|   5|  30|   75|
+---+----+----+-----+



In [15]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()