In [6]:
from xgboost.spark import SparkXGBRegressor
spark_reg_estimator = SparkXGBRegressor(
  features_col="features",
  label_col="label",
  num_workers=10,
)

In [7]:
times=[]
import time

In [8]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("data.csv") \
    .getOrCreate()

# Read the CSV file into a Spark DataFrame
file_path = "/home/ubuntu/data.csv"  # Replace this with the path to your CSV file
train_spark_dataframe = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the schema and the first few rows of the DataFrame
train_spark_dataframe.printSchema()
train_spark_dataframe.show(5)

                                                                                

root
 |-- trip_distance: double (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- pickup_hour: integer (nullable = true)

+--------------------+---------------+------------+------------+--------------------+-----------+
|       trip_distance|passenger_count|PULocationID|DOLocationID|          tip_amount|pickup_hour|
+--------------------+---------------+------------+------------+--------------------+-----------+
| -0.3661467419730907|              1|         151|         239|-0.01412124292201...|          0|
|-0.07707474953506906|              1|         239|         246|-0.03583707603552034|          0|
|  -0.760335822570393|              3|         236|         236|-0.06924605005629836|         13|
|  -0.760335822570393|              5|         193|         193|-0.06924605005629836|         15|
|  -0.760335822570393|              5| 

In [9]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

# Start Spark session if it's not already started
# Uncomment the next two lines if the Spark session isn't initialized
# spark = SparkSession.builder \
#     .appName("Tip Prediction Model") \
#     .getOrCreate()

# Define the assembler with the input columns that will be features
assembler = VectorAssembler(
    inputCols=["trip_distance", "passenger_count", "PULocationID", "DOLocationID", "pickup_hour"],
    outputCol="features"
)

# Assuming 'train_spark_dataframe' is already loaded and defined
# Transform the data to create feature vectors
transformed_dataframe = assembler.transform(train_spark_dataframe)

# Select only the necessary columns, including the newly created 'features' and the label 'tip_amount'
final_dataframe = transformed_dataframe.select("features", "tip_amount")

# Split the data approximately into 70% training and 30% testing
train_data, test_data = final_dataframe.randomSplit([0.7, 0.3], seed=42)

In [10]:
from xgboost.spark import SparkXGBRegressor
t0 = time.time()
# Create an instance of SparkXGBRegressor
xgb_regressor = SparkXGBRegressor(
    features_col="features",
    label_col="tip_amount",
    prediction_col="predicted_tip_amount",
    objective="reg:squarederror",
    numWorkers=10,
    eta=0.1
)
t1 = time.time()
times.append(t1-t0)

In [11]:
t0 = time.time()
xgb_regressor_model = xgb_regressor.fit(train_data)
t1 = time.time()
times.append(t1-t0)

2024-04-19 15:06:10,878 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'objective': 'reg:squarederror', 'device': 'cpu', 'numWorkers': 10, 'eta': 0.1, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
[15:08:32] task 0 got new rank 0                                    (0 + 1) / 1]
Parameters: { "numWorkers" } are not used.

2024-04-19 15:11:02,056 INFO XGBoost-PySpark: _fit Finished xgboost training!   


In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
t0 = time.time()

# Assuming 'xgb_regressor_model' has already been trained and 'test_data' prepared
predictions = xgb_regressor_model.transform(test_data)
t1 = time.time()
times.append(t1-t0)
# Evaluate the model using the correct parameter names
evaluator = RegressionEvaluator(
    labelCol="tip_amount",  # Correct parameter name is 'labelCol' not 'label_col'
    predictionCol="predicted_tip_amount",  # Correct parameter name is 'predictionCol'
    metricName="rmse"  # This is correct; other options could be "mae" or "r2"
)

# Calculate RMSE
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

INFO:XGBoost-PySpark:Do the inference on the CPUs                   (0 + 2) / 9]
2024-04-19 15:11:06,401 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                   (2 + 2) / 9]
2024-04-19 15:11:37,665 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-04-19 15:12:09,836 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                   (6 + 2) / 9]
2024-04-19 15:12:41,557 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs=====>             (7 + 2) / 9]

Root Mean Squared Error (RMSE) on test data = 1.82001


                                                                                

In [13]:
times

[0.0012483596801757812, 292.3346936702728, 0.2451019287109375]

In [14]:
from xgboost.spark import SparkXGBRegressor
spark_reg_estimator = SparkXGBRegressor(
  features_col="features",
  label_col="label",
  num_workers=6,
)

In [15]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("data.csv") \
    .getOrCreate()

# Read the CSV file into a Spark DataFrame
file_path = "/home/ubuntu/data.csv"  # Replace this with the path to your CSV file
train_spark_dataframe = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the schema and the first few rows of the DataFrame
train_spark_dataframe.printSchema()
train_spark_dataframe.show(5)



root
 |-- trip_distance: double (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- pickup_hour: integer (nullable = true)

+--------------------+---------------+------------+------------+--------------------+-----------+
|       trip_distance|passenger_count|PULocationID|DOLocationID|          tip_amount|pickup_hour|
+--------------------+---------------+------------+------------+--------------------+-----------+
| -0.3661467419730907|              1|         151|         239|-0.01412124292201...|          0|
|-0.07707474953506906|              1|         239|         246|-0.03583707603552034|          0|
|  -0.760335822570393|              3|         236|         236|-0.06924605005629836|         13|
|  -0.760335822570393|              5|         193|         193|-0.06924605005629836|         15|
|  -0.760335822570393|              5| 

                                                                                

In [16]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

# Start Spark session if it's not already started
# Uncomment the next two lines if the Spark session isn't initialized
# spark = SparkSession.builder \
#     .appName("Tip Prediction Model") \
#     .getOrCreate()

# Define the assembler with the input columns that will be features
assembler = VectorAssembler(
    inputCols=["trip_distance", "passenger_count", "PULocationID", "DOLocationID", "pickup_hour"],
    outputCol="features"
)

# Assuming 'train_spark_dataframe' is already loaded and defined
# Transform the data to create feature vectors
transformed_dataframe = assembler.transform(train_spark_dataframe)

# Select only the necessary columns, including the newly created 'features' and the label 'tip_amount'
final_dataframe = transformed_dataframe.select("features", "tip_amount")

# Split the data approximately into 70% training and 30% testing
train_data, test_data = final_dataframe.randomSplit([0.7, 0.3], seed=42)

In [17]:
from xgboost.spark import SparkXGBRegressor
t0 = time.time()
# Create an instance of SparkXGBRegressor
xgb_regressor = SparkXGBRegressor(
    features_col="features",
    label_col="tip_amount",
    prediction_col="predicted_tip_amount",
    objective="reg:squarederror",
    numWorkers=6,
    eta=0.1
)
t1 = time.time()
times.append(t1-t0)

In [18]:
t0 = time.time()
xgb_regressor_model = xgb_regressor.fit(train_data)
t1 = time.time()
times.append(t1-t0)

2024-04-19 15:15:21,960 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'objective': 'reg:squarederror', 'device': 'cpu', 'numWorkers': 6, 'eta': 0.1, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
[15:17:34] task 0 got new rank 0                                    (0 + 1) / 1]
Parameters: { "numWorkers" } are not used.

2024-04-19 15:20:00,301 INFO XGBoost-PySpark: _fit Finished xgboost training!   


In [19]:
from pyspark.ml.evaluation import RegressionEvaluator
t0 = time.time()

# Assuming 'xgb_regressor_model' has already been trained and 'test_data' prepared
predictions = xgb_regressor_model.transform(test_data)
t1 = time.time()
times.append(t1-t0)
# Evaluate the model using the correct parameter names
evaluator = RegressionEvaluator(
    labelCol="tip_amount",  # Correct parameter name is 'labelCol' not 'label_col'
    predictionCol="predicted_tip_amount",  # Correct parameter name is 'predictionCol'
    metricName="rmse"  # This is correct; other options could be "mae" or "r2"
)

# Calculate RMSE
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

INFO:XGBoost-PySpark:Do the inference on the CPUs                   (0 + 2) / 9]
2024-04-19 15:20:04,365 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-04-19 15:20:35,181 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-04-19 15:21:06,869 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                   (4 + 2) / 9]
2024-04-19 15:21:38,255 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                   (6 + 2) / 9]
2024-04-19 15:22:10,103 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs

Root Mean Squared Error (RMSE) on test data = 1.82001


                                                                                

In [20]:
times

[0.0012483596801757812,
 292.3346936702728,
 0.2451019287109375,
 0.005173206329345703,
 278.7103703022003,
 0.14718866348266602]

In [21]:
from xgboost.spark import SparkXGBRegressor
spark_reg_estimator = SparkXGBRegressor(
  features_col="features",
  label_col="label",
  num_workers=2,
)

In [22]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .appName("data.csv") \
    .getOrCreate()

# Read the CSV file into a Spark DataFrame
file_path = "/home/ubuntu/data.csv"  # Replace this with the path to your CSV file
train_spark_dataframe = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the schema and the first few rows of the DataFrame
train_spark_dataframe.printSchema()
train_spark_dataframe.show(5)



root
 |-- trip_distance: double (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- pickup_hour: integer (nullable = true)

+--------------------+---------------+------------+------------+--------------------+-----------+
|       trip_distance|passenger_count|PULocationID|DOLocationID|          tip_amount|pickup_hour|
+--------------------+---------------+------------+------------+--------------------+-----------+
| -0.3661467419730907|              1|         151|         239|-0.01412124292201...|          0|
|-0.07707474953506906|              1|         239|         246|-0.03583707603552034|          0|
|  -0.760335822570393|              3|         236|         236|-0.06924605005629836|         13|
|  -0.760335822570393|              5|         193|         193|-0.06924605005629836|         15|
|  -0.760335822570393|              5| 

                                                                                

In [23]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

# Start Spark session if it's not already started
# Uncomment the next two lines if the Spark session isn't initialized
# spark = SparkSession.builder \
#     .appName("Tip Prediction Model") \
#     .getOrCreate()

# Define the assembler with the input columns that will be features
assembler = VectorAssembler(
    inputCols=["trip_distance", "passenger_count", "PULocationID", "DOLocationID", "pickup_hour"],
    outputCol="features"
)

# Assuming 'train_spark_dataframe' is already loaded and defined
# Transform the data to create feature vectors
transformed_dataframe = assembler.transform(train_spark_dataframe)

# Select only the necessary columns, including the newly created 'features' and the label 'tip_amount'
final_dataframe = transformed_dataframe.select("features", "tip_amount")

# Split the data approximately into 70% training and 30% testing
train_data, test_data = final_dataframe.randomSplit([0.7, 0.3], seed=42)

In [24]:
from xgboost.spark import SparkXGBRegressor
t0 = time.time()
# Create an instance of SparkXGBRegressor
xgb_regressor = SparkXGBRegressor(
    features_col="features",
    label_col="tip_amount",
    prediction_col="predicted_tip_amount",
    objective="reg:squarederror",
    numWorkers=2,
    eta=0.1
)
t1 = time.time()
times.append(t1-t0)

In [25]:
t0 = time.time()
xgb_regressor_model = xgb_regressor.fit(train_data)
t1 = time.time()
times.append(t1-t0)

2024-04-19 15:24:16,498 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'objective': 'reg:squarederror', 'device': 'cpu', 'numWorkers': 2, 'eta': 0.1, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
[15:26:30] task 0 got new rank 0                                    (0 + 1) / 1]
Parameters: { "numWorkers" } are not used.

2024-04-19 15:28:59,313 INFO XGBoost-PySpark: _fit Finished xgboost training!   


In [26]:
from pyspark.ml.evaluation import RegressionEvaluator
t0 = time.time()

# Assuming 'xgb_regressor_model' has already been trained and 'test_data' prepared
predictions = xgb_regressor_model.transform(test_data)
t1 = time.time()
times.append(t1-t0)
# Evaluate the model using the correct parameter names
evaluator = RegressionEvaluator(
    labelCol="tip_amount",  # Correct parameter name is 'labelCol' not 'label_col'
    predictionCol="predicted_tip_amount",  # Correct parameter name is 'predictionCol'
    metricName="rmse"  # This is correct; other options could be "mae" or "r2"
)

# Calculate RMSE
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

INFO:XGBoost-PySpark:Do the inference on the CPUs
2024-04-19 15:29:02,840 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-04-19 15:29:33,684 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                   (4 + 2) / 9]
2024-04-19 15:30:05,547 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
2024-04-19 15:30:37,858 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs
INFO:XGBoost-PySpark:Do the inference on the CPUs                   (6 + 2) / 9]
2024-04-19 15:31:09,700 INFO XGBoost-PySpark: predict_udf Do the inference on the CPUs

Root Mean Squared Error (RMSE) on test data = 1.82001


                                                                                

In [27]:
times

[0.0012483596801757812,
 292.3346936702728,
 0.2451019287109375,
 0.005173206329345703,
 278.7103703022003,
 0.14718866348266602,
 0.001135110855102539,
 283.0397434234619,
 0.10004115104675293]