In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import catboost as cb
import joblib
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [0]:
import pandas as pd
import joblib
from pyspark.sql.functions import col, to_date, month, dayofweek, year
import logging

def predict_modal_price(df, pipeline, date_column: str = 'DATE_OF_PRICING'):
    """
    Add calendar features to a Spark DataFrame and score it with a fitted model.

    The date column is cast with ``to_date`` and three features (``month``,
    ``dayofweek``, ``year``) are derived from it in Spark. The frame is then
    collected to the driver with ``toPandas()``, passed whole to
    ``pipeline.predict``, and the predictions are stored in a new
    ``MODAL_PRICE_PREDICT`` column before converting back to Spark.

    Args:
        df (pyspark.sql.DataFrame): Input Spark DataFrame containing
            ``date_column``.
        pipeline: Fitted model object exposing ``predict``. It receives a
            pandas DataFrame holding every input column plus the three
            derived features.
        date_column (str): Name of the date column
            (default: 'DATE_OF_PRICING').

    Returns:
        pyspark.sql.DataFrame: The input columns (with ``date_column`` cast
        to a date), the derived ``month``/``dayofweek``/``year`` columns and
        ``MODAL_PRICE_PREDICT``.

    Raises:
        Exception: Any failure during feature engineering, prediction or
            conversion is logged and re-raised unchanged.
    """
    try:
        # Use the session that owns `df` so the function does not depend on a
        # notebook-level `spark` variable. Fall back to that global only when
        # the DataFrame does not expose `sparkSession` (older PySpark).
        session = getattr(df, 'sparkSession', None)
        if session is None:
            session = spark  # notebook-provided global

        dated_df = df.withColumn(date_column, to_date(col(date_column)))

        date_col = col(date_column)
        # NOTE(review): Spark's dayofweek is 1 (Sunday) .. 7 (Saturday), while
        # pandas' .dt.dayofweek is 0 (Monday) .. 6 (Sunday). Confirm the model
        # was trained with the Spark convention.
        features_df = (
            dated_df
            .withColumn('month', month(date_col))
            .withColumn('dayofweek', dayofweek(date_col))
            .withColumn('year', year(date_col))
        )

        # toPandas() collects every row to the driver; the model scores the
        # full pandas frame in one call.
        pandas_df = features_df.toPandas()
        pandas_df['MODAL_PRICE_PREDICT'] = pipeline.predict(pandas_df)

        return session.createDataFrame(pandas_df)

    except Exception as e:
        logging.error(f"Error in predict_modal_price: {str(e)}")
        raise

def main(model_path: str = "best_price_prediction_model.joblib",
         source_table: str = "pricing_analytics.gold.datalake_price_prediction_gold",
         output_table: str = "pricing_analytics.gold.datalake_price_predictions"):
    """
    Load the saved model, score the source table and overwrite the output table.

    Relies on the notebook-provided ``spark`` session and on
    ``predict_modal_price`` defined in this notebook.

    Args:
        model_path (str): Path of the joblib-serialized model to load.
        source_table (str): Table to read; rows whose ``DATE_OF_PRICING`` is
            NULL are excluded before scoring.
        output_table (str): Table that receives the scored rows. It is written
            in Delta format with mode ``overwrite``, replacing existing
            contents.

    Returns:
        pyspark.sql.DataFrame: The scored DataFrame returned by
        ``predict_modal_price``.

    Raises:
        Exception: Any failure while loading, scoring or writing is logged
            and re-raised unchanged.
    """
    logging.basicConfig(level=logging.INFO)
    try:
        # joblib.load unpickles the file, which can execute arbitrary code;
        # only point model_path at artifacts from a trusted source.
        pipeline = joblib.load(model_path)

        # Same rows as `SELECT * FROM <source_table> WHERE DATE_OF_PRICING IS
        # NOT NULL`, without splicing the table name into SQL text.
        df = spark.table(source_table).where("DATE_OF_PRICING IS NOT NULL")

        df_result = predict_modal_price(df, pipeline)

        # Save results to Delta table
        (
            df_result.write
            .format("delta")
            .mode("overwrite")
            .saveAsTable(output_table)
        )

        return df_result

    except Exception as e:
        logging.error(f"Error in main: {str(e)}")
        raise


# Run the scoring job when this cell executes and keep the returned DataFrame
# in `df_result` so a later cell can inspect it.
df_result = main()

In [0]:
# Render the scored DataFrame with the notebook's `display` helper.
display(df_result)