In [1]:
import pyspark
from pyspark.sql import SparkSession

from pyspark.sql.functions import col, mean, stddev
from pyspark.sql import Window
from pyspark.sql.types import DoubleType, FloatType, LongType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

In [2]:
spark = SparkSession.builder.appName("check_files").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/17 07:21:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data_location1='/cephfs/summary_stats/train/summary_stats_parquet_164_25.parquet'

In [4]:
df = spark.read.format('parquet').load(data_location1)
df.show(3)

+-----+-----+------------------+------------------+---------+-----+-----+--------------------+--------------------+------------------+------------------+----------+----------+---------------+------+
|NumId|Chunk|              Mean|            StdDev|   Median|  Min|  Max|        AvgFirstDiff|          AvgSecDiff|      StdFirstDiff|        StdSecDiff|CountAbove|CountBelow|TotalOutOfRange|target|
+-----+-----+------------------+------------------+---------+-----+-----+--------------------+--------------------+------------------+------------------+----------+----------+---------------+------+
|  113|    1|247.77375030517578|32.229656246101904|    235.0|215.0|335.0| 0.05238744947645399| 0.11835967169867621| 32.35201916285883| 54.46174867696771|        77|         0|             77|    13|
|  113|    2| 222.4494400024414|27.724493105956014|221.91241|147.0|297.0|-0.05238744947645399|-0.12530411614312065|28.249610219199937| 50.95378506373618|        27|         0|             27|    50|
|  11

In [5]:
df.schema

StructType([StructField('NumId', IntegerType(), True), StructField('Chunk', IntegerType(), True), StructField('Mean', DoubleType(), True), StructField('StdDev', DoubleType(), True), StructField('Median', FloatType(), True), StructField('Min', FloatType(), True), StructField('Max', FloatType(), True), StructField('AvgFirstDiff', DoubleType(), True), StructField('AvgSecDiff', DoubleType(), True), StructField('StdFirstDiff', DoubleType(), True), StructField('StdSecDiff', DoubleType(), True), StructField('CountAbove', LongType(), True), StructField('CountBelow', LongType(), True), StructField('TotalOutOfRange', LongType(), True), StructField('target', LongType(), True)])

In [6]:
double_cols=[f.name for f in df.schema.fields if isinstance(f.dataType, DoubleType)]
float_cols=[f.name for f in df.schema.fields if isinstance(f.dataType, FloatType)]
long_cols=[f.name for f in df.schema.fields if isinstance(f.dataType, LongType)]

all_numerical=list(set(double_cols+float_cols+long_cols))
all_numerical.remove('target')

featureArr = [('scaled_' + f) for f in all_numerical]
        

In [7]:
all_numerical

['StdSecDiff',
 'Mean',
 'AvgSecDiff',
 'Median',
 'StdDev',
 'Max',
 'Min',
 'CountBelow',
 'AvgFirstDiff',
 'CountAbove',
 'StdFirstDiff',
 'TotalOutOfRange']

In [None]:
for num_column in all_numerical:
    input_col = f"{num_column}"
    output_col = f"scaled_{num_column}"

    w = Window.partitionBy('NumId')

    mu = mean(input_col).over(w)
    sigma = stddev(input_col).over(w)

    df=df.withColumn(output_col, (col(input_col) - mu)/(sigma))
    

In [None]:
va2 = VectorAssembler(inputCols=featureArr, outputCol="features")

stages = [va2]



In [None]:
pipeline=Pipeline(stages=stages)

model=pipeline.fit(df)

model.transform(df).select('features').show(3, truncate=False)