In [1]:
import os
# Must be set before `import pydeequ` further down in this cell.
# NOTE(review): presumably pydeequ reads SPARK_VERSION at import time to choose
# the Deequ Maven coordinate exposed as pydeequ.deequ_maven_coord — confirm
# against the pydeequ documentation.
os.environ['SPARK_VERSION'] = '3.5.0'

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.sql import SparkSession
from delta import *
import pydeequ
from pydeequ.analyzers import *

# Local Spark session (all cores) with the Delta SQL extension/catalog, the
# Deequ package coordinates supplied by pydeequ, and Hive support enabled.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('regression')
    # Delta Lake SQL extension and catalog implementation.
    .config('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension')
    .config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.delta.catalog.DeltaCatalog')
    # Package to fetch / exclude, both taken from pydeequ.
    .config('spark.jars.packages', pydeequ.deequ_maven_coord)
    .config('spark.jars.excludes', pydeequ.f2j_maven_coord)
    # Warehouse and Derby metastore locations, relative to the working directory.
    .config('spark.sql.warehouse.dir', '../spark-warehouse')
    .config('spark.driver.extraJavaOptions', '-Dderby.system.home="../metastore_db/"')
    # Driver sizing.
    .config('spark.driver.memory', '10g')
    .config('spark.driver.maxResultSize', '10g')
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# List the tables and views registered in the OtherDB database.
(
    spark
    .sql('SHOW TABLES FROM OtherDB')
    .show(truncate=False)
)

+---------+----------------+-----------+
|namespace|tableName       |isTemporary|
+---------+----------------+-----------+
|otherdb  |crime           |false      |
|otherdb  |dg_orders       |false      |
|otherdb  |trip            |false      |
|otherdb  |vw_crime        |false      |
|otherdb  |vw_trip_duration|false      |
|otherdb  |vw_trip_report  |false      |
+---------+----------------+-----------+



In [3]:
# Peek at the first three rows of the view used for modelling.
(
    spark
    .sql('SELECT * FROM OtherDB.vw_trip_duration LIMIT 3')
    .show(truncate=False)
)

+------+--------+--------+----+---------+-------+
|amount|duration|distance|hour|passenger|payment|
+------+--------+--------+----+---------+-------+
|35.35 |1200    |5.57    |0   |NULL     |0      |
|20.47 |479     |2.6     |0   |NULL     |0      |
|23.83 |737     |3.7     |0   |NULL     |0      |
+------+--------+--------+----+---------+-------+



In [4]:
# Modelling sample: at most 1,000,000 rows of the view where `passenger` is
# populated. The query has no ORDER BY, so which rows the LIMIT keeps is not
# pinned down by the query itself.
trip_sample_query = '''
    SELECT * 
    FROM OtherDB.vw_trip_duration
    WHERE passenger IS NOT NULL
    LIMIT 1000000
'''
data = spark.sql(trip_sample_query)

In [5]:
# Pairwise correlation between every pair of columns in `data`, reported as a
# long (Column1, Column2, Correlation) table.
#
# Every DataFrame.corr() call launches a Spark job over the whole sample.
# Correlation is symmetric, so each unordered pair is computed once and then
# mirrored: n*(n+1)/2 jobs instead of n*n. The diagonal is still computed (not
# hard-coded to 1.0) so a degenerate column reports whatever Spark returns.
columns = data.columns

upper_triangle = {
    (first, second): data.corr(first, second)
    for position, first in enumerate(columns)
    for second in columns[position:]
}

# Full cross product, in the same (Column1, Column2) order as before.
column_pairs = [(col1, col2) for col1 in columns for col2 in columns]

correlation_df = spark.createDataFrame(
    [
        (
            col1,
            col2,
            # Pairs below the diagonal reuse the mirrored value.
            upper_triangle[(col1, col2)] if (col1, col2) in upper_triangle
            else upper_triangle[(col2, col1)],
        )
        for col1, col2 in column_pairs
    ],
    ['Column1', 'Column2', 'Correlation']
)

correlation_df.show(truncate=False)

+--------+---------+----------------------+
|Column1 |Column2  |Correlation           |
+--------+---------+----------------------+
|amount  |amount   |1.0                   |
|amount  |duration |0.23189308969417818   |
|amount  |distance |0.1384123937284394    |
|amount  |hour     |0.02078728915571583   |
|amount  |passenger|0.02987173727206933   |
|amount  |payment  |NaN                   |
|duration|amount   |0.23189308969417818   |
|duration|duration |1.0                   |
|duration|distance |0.0345320360099649    |
|duration|hour     |-0.0021780198183030354|
|duration|passenger|0.019065320741348347  |
|duration|payment  |NaN                   |
|distance|amount   |0.1384123937284394    |
|distance|duration |0.03453203600996489   |
|distance|distance |1.0                   |
|distance|hour     |0.0015839586060821895 |
|distance|passenger|0.003116172311718919  |
|distance|payment  |NaN                   |
|hour    |amount   |0.020787289155715837  |
|hour    |duration |-0.002178019

In [6]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fixed seed so the 80/20 split (and therefore the reported RMSE) is repeatable
# for a given input DataFrame.
SPLIT_SEED = 42

# Pack the three predictor columns into the single vector column Spark ML expects.
assembler = VectorAssembler(inputCols=['duration', 'hour', 'distance'], outputCol='features')
# Drop any 'features' column left by a previous run of this cell: the assembler
# refuses to write to an output column that already exists, and drop() is a
# no-op when the column is absent, so the first run is unaffected.
data = assembler.transform(data.drop('features'))

train_data, test_data = data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Ordinary linear regression of `amount` on the assembled features.
lr = LinearRegression(featuresCol='features', labelCol='amount')
lrModel = lr.fit(train_data)

# Score the held-out rows; adds a 'prediction' column.
predictions = lrModel.transform(test_data)

evaluator = RegressionEvaluator(labelCol='amount', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 21.4873


In [7]:
# Fitted weights of the linear model, one per input feature.
# NOTE(review): presumably ordered like the assembler's inputCols
# (duration, hour, distance) — confirm.
lrModel.coefficients

DenseVector([0.002, 0.0824, 0.089])

In [8]:
# Use a module alias instead of `from pyspark.sql.functions import abs`, which
# would shadow Python's builtin abs() for the rest of the notebook.
from pyspark.sql import functions as F

# Per-row error score: square root of the absolute difference between the
# predicted and the actual amount.
# NOTE: `predictions` already carries a 'distance' column (the trip-distance
# feature); withColumn replaces it with this error score. The name is kept
# because the colouring cell reads col('distance').
predictions_with_distance = predictions.withColumn(
    'distance', F.sqrt(F.abs(F.col('prediction') - F.col('amount')))
)

In [9]:
def distance_to_color(distance, threshold=3):
    """Map an error distance to a display colour.

    Args:
        distance: Numeric error distance, or None for a missing value.
        threshold: Largest distance still shown as 'green'. Defaults to 3.

    Returns:
        'red' when ``distance`` is strictly greater than ``threshold``,
        'green' otherwise, and None when ``distance`` is None.
    """
    if distance is None:
        # Comparing None with a number raises TypeError; propagate the
        # missing value instead of failing.
        return None
    return 'red' if distance > threshold else 'green'

In [10]:
from pyspark.sql.functions import col, udf
# StringType lives in pyspark.sql.types; importing it from
# pyspark.sql.functions only works through an incidental re-export.
from pyspark.sql.types import StringType

# Wrap the plain-Python classifier so it can be applied to a Spark column.
distance_to_color_udf = udf(distance_to_color, StringType())

# Tag every prediction with the colour derived from its error score.
predictions_colored = predictions_with_distance.withColumn(
    'color', distance_to_color_udf(col('distance'))
)

# Bring the result to the driver as a pandas DataFrame for plotting.
pdf = predictions_colored.toPandas()

In [None]:
import plotly.express as px

# Actual vs. predicted amount, one point per test row, with an OLS trendline
# per colour group.
fig = px.scatter(
    pdf,
    x='amount',
    y='prediction',
    color='color',
    # Without an explicit map Plotly treats 'red'/'green' as category labels
    # and assigns colours from its default palette, so "red" points are not
    # necessarily drawn red.
    color_discrete_map={'red': 'red', 'green': 'green'},
    trendline='ols',
    title='Actual vs. predicted amount',
    labels={'amount': 'Actual amount', 'prediction': 'Predicted amount'},
)

# Plain white canvas without grid lines.
fig.update_layout(
    plot_bgcolor='white',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
)

fig.show()

In [None]:
# Persist the fitted model, overwriting any previous save at this path.
lrModel.write().overwrite().save('trip_regression_model')
# To reload: lrModel comes from LinearRegression.fit(), so the matching loader
# is LinearRegressionModel rather than PipelineModel.
#from pyspark.ml.regression import LinearRegressionModel
#loaded_model = LinearRegressionModel.load('trip_regression_model')