In [None]:
from pyspark.sql import SparkSession, functions as f
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Build (or reuse, via getOrCreate) the Spark session used by the Spark cells below.
spark = (
    SparkSession.builder.appName("Preprocessing_Yellow")
    # Eager evaluation lets a DataFrame that is the last expression of a cell
    # render as a table in the notebook.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Session time zone; previously this option was set twice with the same
    # value, so a single call is kept.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Driver / executor resource settings.
    # NOTE(review): executor settings may be ignored when running in local mode — confirm.
    .config("spark.driver.memory", "3g")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "2")
    .getOrCreate()
)

In [None]:
# Read the curated parquet dataset into a Spark DataFrame.
joined = spark.read.format("parquet").load("../../../data/curated/removed_outliers.parquet")

In [None]:
# Print a preview of the loaded data (first 20 rows, long values truncated).
joined.show(n=20, truncate=True)

In [None]:
# Summary statistics (count, mean, stddev, min, max) for the cost, consumer
# and postcode columns. These were four copy-pasted single-column cells, each
# triggering its own Spark job; one describe() call yields the same figures
# side by side in a single table.
joined.select(
    "average_cost_of_order",
    "number_of_unique_consumers",
    "number_of_repeat_consumers",
    "number_of_postcodes",
).describe()

In [None]:
# Collect the whole Spark DataFrame to the driver as a pandas DataFrame.
# NOTE(review): toPandas() pulls every row into driver memory — presumably the
# curated dataset is small enough for this; confirm.
joined_pd: pd.DataFrame = joined.toPandas()

In [None]:
# Display the column names of the collected pandas frame.
joined_pd.columns

In [None]:
# Columns treated as numerical features for the correlation analysis.
numeric_feature_names = [
    "take_rate",
    "average_merchant_fraud_probability",
    "number_of_unique_consumers",
    "average_consumer_fraud_probability",
    "number_of_repeat_consumers",
    "number_of_orders",
    "average_cost_of_order",
    "number_of_postcodes",
    "avg_total_weekly_personal_income",
    "avg_total_weekly_fam_income",
    "avg_median_age",
    "avg_num_of_consumers_per_postcode",
]

# Sub-frame holding only those columns, in the order listed.
numerical_columns = joined_pd[numeric_feature_names]

In [None]:
# Pairwise Pearson correlation matrix of the selected columns.
numerical_columns.corr(method="pearson")

In [None]:
# Heatmap of the pairwise correlations between the selected columns.
corr_matrix = numerical_columns.corr()

fig, ax = plt.subplots(figsize=(10, 8))
# Correlations live in [-1, 1]: pin the colour range and use a diverging
# palette centred on 0 so sign and strength are readable at a glance.
sns.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    center=0,
    cmap="coolwarm",
    square=True,
    ax=ax,
)
ax.set_title("Pairwise correlation of numerical columns")
plt.show()

In [None]:
# Scatter of unique vs. repeat consumer counts, one point per row of joined_pd.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    joined_pd["number_of_unique_consumers"],
    joined_pd["number_of_repeat_consumers"],
    color="blue",
    alpha=0.5,  # semi-transparent so overlapping points remain visible
)
# Labels name the columns actually plotted (the x-axis is *unique* consumers).
ax.set_title("Number of Unique Consumers vs Number of Repeat Consumers")
ax.set_xlabel("Number of Unique Consumers")
ax.set_ylabel("Number of Repeat Consumers")
# No legend: there is a single series, and the previous legend entry merely
# repeated an axis name rather than describing the points.
ax.grid(True)
plt.show()

In [None]:
# Summary statistics (count, mean, stddev, min, max) for the order counts.
joined.describe("number_of_orders")

In [None]:
# Shut down the Spark session. Spark DataFrames such as `joined` can no longer
# be evaluated after this; the already-collected pandas frame is unaffected.
spark.stop()