# Scaling with PySpark ML

In [2]:
from pyspark.sql import SparkSession
# Attach to the already-running Spark session if there is one; otherwise
# start a new session with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/07 22:39:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/07 22:39:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Load the data

In [3]:
# Load the flights dataset from its Parquet file (path is relative to the
# notebook's working directory).
flights = spark.read.format("parquet").load("../data/flights.parquet")

# Preview the raw, unscaled values of the column that is scaled below.
flights.select("distance").show(n=10)

                                                                                

+--------+
|distance|
+--------+
|  1400.0|
|  1416.0|
|  1089.0|
|  1576.0|
|   762.0|
|   719.0|
|  1065.0|
|   229.0|
|   944.0|
|   733.0|
+--------+
only showing top 10 rows



## Scale the data

In [38]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.functions import vector_to_array

# StandardScaler consumes a vector column, so first wrap the numeric
# "distance" column into a one-element vector column.
va = VectorAssembler(
    inputCols=["distance"],
    outputCol="distanceVector",
)
assembled = va.transform(flights)

# Fit the scaler on the assembled data, then apply it to the same data.
# withMean=True centers the values; withStd=True scales them by the
# standard deviation.
ss = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="distanceVector",
    outputCol="scaledDistanceVector",
)
model = ss.fit(assembled)
result = model.transform(assembled)

# Unpack the one-element scaled vector back into a plain numeric column
# (exposed under the name "distance") and preview the first ten rows.
result.select(
    vector_to_array("scaledDistanceVector")[0].alias("distance")
).show(n=10)

+--------------------+
|            distance|
+--------------------+
|  0.4910954362463181|
|  0.5129166026052225|
| 0.06694651514511192|
|  0.7311282661942675|
| -0.3790235723149987|
|-0.43766795690455457|
|0.034214765606755175|
| -1.1059411766460048|
|-0.13080780498246009|
| -0.4185744363405131|
+--------------------+
only showing top 10 rows

