* Numeric: Standardize
    * Map data values from their original range to -1 to 1
    * Mean value of 0
    * Normally distributed with standard deviation of 1
    * Used when attributes have different scales and ML algorithms assume a normal distribution

In [5]:
import findspark

# findspark.init() must run before any pyspark import: it locates the local
# Spark installation and makes the pyspark package importable.
findspark.init()

from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start a new one.
# NOTE(review): "YourAppName" looks like a placeholder application name.
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .getOrCreate()
)

In [6]:
# Three-row toy dataset: an integer "id" plus a dense 3-element "features"
# vector whose components deliberately live on very different scales.
features_df = spark.createDataFrame(
    data=[
        (1, Vectors.dense([10.0, 10000.0, 1.0])),
        (2, Vectors.dense([20.0, 30000.0, 2.0])),
        (3, Vectors.dense([30.0, 40000.0, 3.0])),
    ],
    schema=["id", "features"],
)

In [9]:
# Peek at the first row to confirm the dataframe was built as expected.
features_df.take(num=1)

[Row(id=1, features=DenseVector([10.0, 10000.0, 1.0]))]

In [19]:
# Configure a min-max scaler: every position of the "features" vector is
# rescaled linearly into [min, max] and written to a new "sfeatures" column.
feature_scaler = MinMaxScaler(
    inputCol="features",
    outputCol="sfeatures",
    min=0.0,
    max=1.0,
)

In [21]:
# Fitting scans the data once to learn the per-feature minimum and maximum;
# the returned model holds those statistics and performs the rescaling.
smodel = feature_scaler.fit(dataset=features_df)

In [22]:
# Apply the fitted model to the dataframe; the result keeps "id" and
# "features" and gains the rescaled "sfeatures" column.
sfeatures_df = smodel.transform(dataset=features_df)

In [23]:
# Inspect all three rows. An all-zero scaled vector is displayed in sparse
# form (SparseVector(3, {})), while the other rows come back dense.
sfeatures_df.take(num=3)

[Row(id=1, features=DenseVector([10.0, 10000.0, 1.0]), sfeatures=SparseVector(3, {})),
 Row(id=2, features=DenseVector([20.0, 30000.0, 2.0]), sfeatures=DenseVector([0.5, 0.6667, 0.5])),
 Row(id=3, features=DenseVector([30.0, 40000.0, 3.0]), sfeatures=DenseVector([1.0, 1.0, 1.0]))]

In [24]:
# Show the original and rescaled vectors side by side.
(
    sfeatures_df
    .select("features", "sfeatures")
    .show()
)

+------------------+--------------------+
|          features|           sfeatures|
+------------------+--------------------+
|[10.0,10000.0,1.0]|           (3,[],[])|
|[20.0,30000.0,2.0]|[0.5,0.6666666666...|
|[30.0,40000.0,3.0]|       [1.0,1.0,1.0]|
+------------------+--------------------+



In [25]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType


def _vector_to_list(vec):
    """Return the entries of an ML vector as a plain list of Python floats.

    Works for both SparseVector and DenseVector, since each provides
    ``toArray()``. Implicit zeros of a sparse vector are materialized, so an
    all-zero sparse vector of size 3 becomes ``[0.0, 0.0, 0.0]``.

    Args:
        vec: The vector stored in the column cell, or ``None`` for a null cell.

    Returns:
        A list of floats, or ``None`` when ``vec`` is ``None`` (a null cell is
        passed through as null instead of raising ``AttributeError``).
    """
    if vec is None:
        return None
    return vec.toArray().tolist()


# Wrap the helper as a UDF. Despite its name, this does NOT produce a
# DenseVector: the declared return type is array<double>, which is convenient
# for display. The converted column is no longer a Vector column, so keep
# `sfeatures_df` for any downstream ML stage that expects vector input.
# NOTE: on Spark >= 3.0, pyspark.ml.functions.vector_to_array performs the
# same conversion without a Python UDF.
sparse_to_dense_udf = udf(_vector_to_list, ArrayType(DoubleType()))

# Replace the "sfeatures" vector column with its array<double> equivalent.
sfeatures_df2 = sfeatures_df.withColumn("sfeatures", sparse_to_dense_udf("sfeatures"))

sfeatures_df2.select("features", "sfeatures").show()



+------------------+--------------------+
|          features|           sfeatures|
+------------------+--------------------+
|[10.0,10000.0,1.0]|     [0.0, 0.0, 0.0]|
|[20.0,30000.0,2.0]|[0.5, 0.666666666...|
|[30.0,40000.0,3.0]|     [1.0, 1.0, 1.0]|
+------------------+--------------------+



                                                                                