In [None]:
! pip install xgboost
! pip install pandas
! pip install scikit-learn
! pip install pyarrow

In [1]:
import pyspark
from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import DoubleType, FloatType, LongType
from pyspark.sql.functions import col, mean, stddev
from pyspark.sql import Window

import pyspark.sql.functions as F
from pyspark.ml import Transformer
from pyspark.sql import DataFrame
import pandas as pd
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable 

from xgboost.spark import SparkXGBRegressor
from pyspark.ml import Pipeline

In [2]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("check_files")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/18 00:58:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Parquet inputs: the first file is loaded into df1, the second into df2.
data_location1 = '/cephfs/summary_stats/train/summary_stats_parquet_0_25.parquet'
data_location2 = '/cephfs/summary_stats/train/summary_stats_parquet_159_25.parquet'

In [4]:
# Read the first parquet file and preview a single row.
df1 = spark.read.parquet(data_location1)
df1.show(1)

+-----+-----+------------------+------------------+------+-----+-----+------------------+--------------------+-----------------+------------------+----------+----------+---------------+------+
|NumId|Chunk|              Mean|            StdDev|Median|  Min|  Max|      AvgFirstDiff|          AvgSecDiff|     StdFirstDiff|        StdSecDiff|CountAbove|CountBelow|TotalOutOfRange|target|
+-----+-----+------------------+------------------+------+-----+-----+------------------+--------------------+-----------------+------------------+----------+----------+---------------+------+
|   63|    1|191.84027777777777|23.943651386837775| 190.0|146.0|282.0|0.1597222222222222|-0.00347222222222...|4.784066461561062|3.1072560509995135|        23|         0|             23|     6|
+-----+-----+------------------+------------------+------+-----+-----+------------------+--------------------+-----------------+------------------+----------+----------+---------------+------+
only showing top 1 row



In [5]:
class ColumnScaler(Transformer, DefaultParamsReadable, DefaultParamsWritable):
    """Pipeline stage that standardises numeric columns within each ``NumId`` group.

    For every column whose Spark type is ``DoubleType``, ``FloatType`` or
    ``LongType`` (except ``target``), a column named ``scaled_<column>`` is
    added holding ``(value - group mean) / group stddev``, where a group is
    the set of rows sharing the same ``NumId``.
    """

    def _transform(self, df):
        """Return ``df`` with one ``scaled_<column>`` column per numeric feature.

        Parameters
        ----------
        df : pyspark.sql.DataFrame
            Input frame. It must contain a ``NumId`` column, which is used as
            the window partition key. A ``target`` column is optional; when
            present it is never scaled.

        Returns
        -------
        pyspark.sql.DataFrame
            The input frame with the ``scaled_*`` columns added.
        """
        numeric_types = (DoubleType, FloatType, LongType)

        # Walk the schema in order so the result has a deterministic column
        # order, and filter out the label instead of calling list.remove(),
        # which raises ValueError when the frame has no numeric 'target'
        # column (for example unlabeled data at scoring time).
        all_numerical = [
            field.name
            for field in df.schema.fields
            if isinstance(field.dataType, numeric_types) and field.name != 'target'
        ]

        # The window does not depend on the column being scaled, so it is
        # built once rather than on every loop iteration.
        w = Window.partitionBy('NumId')

        for num_column in all_numerical:
            mu = mean(num_column).over(w)
            sigma = stddev(num_column).over(w)

            # NOTE(review): when a column is constant inside a NumId group
            # (this includes NumId itself if it is one of the numeric types)
            # sigma is 0 and the quotient is not a usable number; confirm
            # that downstream stages handle those rows as intended.
            df = df.withColumn(f"scaled_{num_column}", (col(num_column) - mu) / sigma)

        return df

In [6]:
# Numeric columns of df1, grouped by Spark type.
double_cols = [f.name for f in df1.schema.fields if isinstance(f.dataType, DoubleType)]
float_cols = [f.name for f in df1.schema.fields if isinstance(f.dataType, FloatType)]
long_cols = [f.name for f in df1.schema.fields if isinstance(f.dataType, LongType)]

# Sort the names so the feature order is the same on every run (iteration
# order of a set of strings can change between Python processes), and use a
# set difference so a frame without a numeric 'target' column does not raise.
all_numerical = sorted(set(double_cols + float_cols + long_cols) - {'target'})

# Names of the scaled columns that are fed to the assembler.
featureArr = ['scaled_' + name for name in all_numerical]

columns_scaler = ColumnScaler()

# handleInvalid='skip' filters out rows that contain invalid feature values.
va2 = VectorAssembler(inputCols=featureArr, outputCol="features", handleInvalid='skip')

stages = [columns_scaler, va2]

In [7]:
features_col = "features"
label_name = "target"

# CPU-only XGBoost regressor with a fixed seed, reading the assembled
# feature vector and the label column named above.
xgb_regression = SparkXGBRegressor(
    features_col=features_col,
    label_col=label_name,
    random_state=123,
    use_gpu=False,
)

# Rebuild the stage list instead of appending in place: with append(), running
# this cell a second time would add a second regressor to the pipeline.
stages = [s for s in stages if not isinstance(s, SparkXGBRegressor)] + [xgb_regression]
pipeline = Pipeline(stages=stages)

# Fit scaler -> assembler -> regressor on the first parquet file.
model = pipeline.fit(df1)

23/05/18 00:58:56 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[00:59:02] task 0 got new rank 0                                    (0 + 1) / 1]


  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.





In [8]:
# The last pipeline stage is the fitted XGBoost model; show its underlying Booster.
model.stages[-1].get_booster()

<xgboost.core.Booster at 0x7f4c5a6ce3b0>

In [None]:
# Read the second parquet file and preview a single row.
df2 = spark.read.parquet(data_location2)
df2.show(1)

In [None]:
# Numeric columns of df2, grouped by Spark type.
double_cols = [f.name for f in df2.schema.fields if isinstance(f.dataType, DoubleType)]
float_cols = [f.name for f in df2.schema.fields if isinstance(f.dataType, FloatType)]
long_cols = [f.name for f in df2.schema.fields if isinstance(f.dataType, LongType)]

# Sort the names so the feature order is the same on every run (iteration
# order of a set of strings can change between Python processes), and use a
# set difference so a frame without a numeric 'target' column does not raise.
all_numerical = sorted(set(double_cols + float_cols + long_cols) - {'target'})

# Names of the scaled columns that are fed to the assembler.
featureArr = ['scaled_' + name for name in all_numerical]

columns_scaler = ColumnScaler()

# handleInvalid='skip' filters out rows that contain invalid feature values.
va2 = VectorAssembler(inputCols=featureArr, outputCol="features", handleInvalid='skip')

# NOTE: this rebinds columns_scaler, va2 and stages from the earlier df1 cell.
stages = [columns_scaler, va2]