In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import to_date

# Obtain the Spark session explicitly. Notebook kernels and the pyspark shell
# usually inject a global `spark`; getOrCreate() hands back that same session
# when one is active and builds one otherwise, so the script also runs when
# submitted standalone instead of failing with a NameError.
spark = SparkSession.builder.getOrCreate()

# Load trained Linear Regression model and data
model_path = "gs://project-bucket-kl/models/flight_price_model"
model = LinearRegressionModel.load(model_path)
data_path = "gs://project-bucket-kl/trusted/feature_engineered_data.parquet"
df = spark.read.parquet(data_path)

# Sample roughly 1% of the rows without replacement, capped at 5000 rows so
# the result is small enough to collect to the driver for plotting.
sampled_df = df.sample(False, 0.01).limit(5000)

# Predictions on the sampled data (transform adds the model's prediction
# column next to the existing input columns).
predictions = model.transform(sampled_df)

# Convert 'flightDate' to a date type
predictions = predictions.withColumn("flightDate", to_date(predictions.flightDate))

# Collect everything the plots need in a single Spark job. Spark evaluates
# lazily, and `limit` without an ordering is not guaranteed to return the same
# rows when the plan is executed a second time, so two separate toPandas()
# calls could describe different samples (and would scan the data twice).
plot_pd = predictions.select(
    "totalFare", "prediction", "flightDate", "baseFare", "totalTravelDistance"
).toPandas()

# Split into the two pandas frames used by the plotting sections below.
predictions_pd = plot_pd[["totalFare", "prediction", "flightDate"]].copy()
base_fare_travel_df = plot_pd[["baseFare", "totalTravelDistance"]].copy()

# Plotting

# Scatter Plot of Actual vs. Predicted Prices
# Points on the dashed red diagonal are perfect predictions; the diagonal
# spans the observed range of actual fares.
plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(data=predictions_pd, x='totalFare', y='prediction')
fare_low = predictions_pd['totalFare'].min()
fare_high = predictions_pd['totalFare'].max()
scatter_ax.plot([fare_low, fare_high], [fare_low, fare_high], 'r--')
scatter_ax.set_xlabel('Actual Flight Prices')
scatter_ax.set_ylabel('Predicted Flight Prices')
scatter_ax.set_title('Actual vs Predicted Flight Prices')
plt.show()

# Error Distribution
# Histogram (with KDE overlay) of the signed prediction error, defined as
# actual fare minus predicted fare.
plt.figure(figsize=(10, 6))
errors = predictions_pd['totalFare'] - predictions_pd['prediction']
# histplot draws raw counts by default; stat='density' normalizes the bars so
# the y-axis really is the density that the axis label below announces.
sns.histplot(errors, kde=True, stat='density')
plt.xlabel('Prediction Error')
plt.ylabel('Density')
plt.title('Prediction Error Distribution')
plt.show()

# Base Fare vs Total Travel Distance
# Red scatter of the sampled rows: base fare on x, travel distance on y.
plt.figure(figsize=(10, 6))
fare_distance_ax = sns.scatterplot(
    data=base_fare_travel_df,
    x='baseFare',
    y='totalTravelDistance',
    color='red',
)
fare_distance_ax.set_title('Base Fare vs Total Travel Distance')
fare_distance_ax.set_xlabel('Base Fare ($)')
fare_distance_ax.set_ylabel('Total Travel Distance (miles)')
plt.show()

# Residual Plot
# The residuals are computed explicitly as actual minus predicted fare.
# sns.residplot is not used here because it first fits its own regression of
# y on x and plots the residuals of that auxiliary fit, which are not the
# trained model's residuals that the y-axis label promises.
residuals_pd = predictions_pd.assign(
    residual=predictions_pd['totalFare'] - predictions_pd['prediction']
)
plt.figure(figsize=(10, 6))
# lowess=True overlays a locally weighted trend line (needs statsmodels, as
# the original residplot(lowess=True) call did); a trend that drifts away
# from zero indicates systematic over- or under-prediction.
sns.regplot(x='prediction', y='residual', data=residuals_pd, lowess=True,
            line_kws={'color': 'red'})
# Zero reference line: points above it were under-predicted by the model.
plt.axhline(0, color='gray', linestyle=':')
plt.xlabel('Predicted Flight Prices')
plt.ylabel('Residuals (Actual - Predicted)')
plt.title('Residuals of Predicted Flight Prices')
plt.show()

# Correlation Heatmap
# Columns to correlate; the same list labels both axes of the matrix.
corr_columns = [
    "totalFare",
    "baseFare",
    "totalTravelDistance",
    "travelDurationMinutes",
    "seatsRemaining",
]

# Pack the columns into a single vector column, as Correlation.corr expects.
# Note that this runs on the full `df`, not on the 5000-row sample.
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=corr_columns, outputCol=vector_col)
df_vector = assembler.transform(df).select(vector_col)

# The result is a one-row DataFrame whose first field is the matrix.
matrix = Correlation.corr(df_vector, vector_col).head()
correlation_matrix = matrix[0].toArray()
corr_pd = pd.DataFrame(correlation_matrix, index=corr_columns, columns=corr_columns)

plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(corr_pd, cmap='coolwarm', annot=True)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

# Time Series Plot of Predicted vs. Actual Prices
# Draw actual and predicted fares against flight date on the same axes.
plt.figure(figsize=(12, 8))
fare_series = (
    ('totalFare', 'Actual Fare', 'blue'),
    ('prediction', 'Predicted Fare', 'orange'),
)
for fare_column, series_label, line_color in fare_series:
    sns.lineplot(data=predictions_pd, x='flightDate', y=fare_column,
                 label=series_label, color=line_color)
plt.xlabel('Flight Date')
plt.ylabel('Flight Prices')
plt.title('Time Series of Flight Prices')
plt.legend()
plt.show()