In [None]:
# Measure execution times: load the `autotime` IPython extension.
%load_ext autotime

In [None]:
import findspark
# Tell findspark where the Spark installation lives so that `pyspark` can be imported.
# NOTE(review): the Linux path is hard-coded; switch to the commented line below on Windows.
findspark.init("/usr/local/spark/spark-3.1.1-bin-hadoop2.7")    # for Linux
#findspark.init()                                                 # for Windows

In [None]:
from pyspark import SparkConf, SparkContext
# Spark configuration: run locally on all available cores ("local[*]"), name the
# application "GBTRegressor", and set spark.driver.maxResultSize to "0" (no limit
# on the total size of results collected back to the driver).
conf = SparkConf().setMaster("local[*]").setAppName("GBTRegressor").set("spark.driver.maxResultSize","0")
# Start the Spark context, or reuse the one already running in this kernel.
# Only one SparkContext may be active at a time, so calling SparkContext(conf=conf)
# again when this cell is re-run raises an error; getOrCreate makes the cell
# idempotent. Note that when a context already exists it is returned as-is and
# `conf` is not re-applied -- restart the kernel to change the configuration.
sc = SparkContext.getOrCreate(conf=conf)
sc

In [None]:
from pyspark.sql.types import StringType
from pyspark import SQLContext
# Build a SQLContext on top of the SparkContext created earlier.
sqlContext = SQLContext(sc)
import os
# Input location: the "Output" folder inside the current working directory.
path = "file:" + os.getcwd() + "/Output"
# Read every CSV file in that folder, using the first row as the header and
# letting Spark infer the column types.
dfspark = (
    sqlContext.read
    .format('csv')
    .option("header", "true")
    .option("inferSchema", "true")
    .load(path + '/*.csv')
)

In [None]:
# Print the schema of the DataFrame loaded from the CSV files.
dfspark.printSchema()

##### Creando un vector de características

In [None]:
from pyspark.ml.feature import VectorAssembler
# Columns that must NOT go into the feature vector: the label (`fare_amount`)
# and the datetime / latitude / longitude columns listed here.
ignore = [
    'fare_amount',
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'dif_latitude',
    'dif_longitude',
]
# Every other column of the DataFrame becomes an input of the assembler.
feature_columns = [name for name in dfspark.columns if name not in ignore]
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
# Add the assembled `features` vector column to the data.
df = vectorAssembler.transform(dfspark)
# Keep only what the model needs: the feature vector and the label.
dfmodel = df.select(['features', 'fare_amount'])

In [None]:
# Split the data into roughly 80% training and 20% test.
# The fixed seed makes the split (and therefore every downstream metric)
# reproducible across kernel restarts; without it randomSplit draws a new
# random seed on each run.
dftrain, dftest = dfmodel.randomSplit([0.8, 0.2], seed=42)
# Report the size of each split (each count() triggers a Spark job).
print("tenemos %d datos de entrenamiento y %d datos de prueba." % (dftrain.count(), dftest.count()))

### GBTRegressor

In [None]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Gradient-boosted trees regressor that reads the `features` vector column and
# learns the `fare_amount` label.
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="fare_amount",
)

In [None]:
# Hyper-parameter search space for the regressor: three candidate values for
# each of three parameters, i.e. 3 x 3 x 3 = 27 combinations.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(gbt.maxDepth, [2, 5, 8])
grid_builder.addGrid(gbt.maxBins, [10, 20, 40])
grid_builder.addGrid(gbt.maxIter, [5, 10, 20])
gbtrparamGrid = grid_builder.build()

In [None]:
# Evaluator computing the RMSE between the `prediction` column and the
# `fare_amount` label.
gbevaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="fare_amount",
    predictionCol="prediction",
)

In [None]:
# Create a 3-fold CrossValidator that searches the parameter grid for the GBT
# regressor and scores each candidate with the RMSE evaluator.
# The explicit seed fixes how rows are assigned to folds, so the selected
# hyper-parameters are reproducible from one session to the next.
gbcv = CrossValidator(estimator = gbt,
                      estimatorParamMaps = gbtrparamGrid,
                      evaluator = gbevaluator,
                      numFolds = 3,
                      seed = 42)

In [None]:
# Run the cross-validated grid search on the training split and keep the fitted model.
# This is the expensive step: the estimator is trained for every parameter
# combination on every fold.
gbcvModel = gbcv.fit(dftrain)

In [None]:
# Print the string representation of the fitted cross-validation model.
print(gbcvModel)

In [None]:
# Score the held-out test split with the fitted cross-validation model.
gbpredictions = gbcvModel.transform(dftest)

In [None]:
# Evaluate the test-set predictions with the RMSE evaluator and report the value.
test_rmse = gbevaluator.evaluate(gbpredictions)
print('RMSE:', test_rmse)

In [None]:
# Persist the estimator, the cross-validator, the fitted model and the evaluator
# under the "models" folder of the current working directory.
# A plain `.save(p)` raises an error when `p` already exists, so running this
# cell a second time would fail; `.write().overwrite().save(p)` replaces the
# artifacts left by a previous run instead.
path="file:"+os.getcwd()+"/models/"
gbt.write().overwrite().save(path+"gbtregressor")
gbcv.write().overwrite().save(path+"gbcrosvalidator")
gbcvModel.write().overwrite().save(path+"Model")
gbevaluator.write().overwrite().save(path+"evaluator")