This notebook removes outliers from our dataset. Currently we remove businesses that have a low number of orders (at or below the 1% quantile). We then generate some distribution plots and compare them to our plots from distribution_analysis.ipynb.

In [None]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, to_date, when, concat, lit
import matplotlib.pyplot as plt


# Build (or reuse) the Spark session used throughout this notebook.
# The session time zone is pinned to UTC and both the executor and the
# driver are configured with 4g of memory.
spark = (
    SparkSession.builder
    .appName("green preprocessing")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [None]:
# Read the joined dataset from parquet.
path1 = "../../../data/insights/joined.parquet"
df = spark.read.parquet(path1)

# Quick sanity checks: first five rows, the schema, and the row count.
df.show(n=5)
df.printSchema()
print("length:", df.count())

# Materialise the data on the driver as a pandas DataFrame for the
# pandas/seaborn analysis in the following cells.
df_pandas = df.toPandas()



In [None]:
import pandas as pd
import seaborn as sns
# For every segment, scatter the number of orders against the average
# consumer fraud probability (data as loaded, before outlier removal).
for segment in df_pandas['segment'].unique():
    print(segment)

    # Rows belonging to the current segment only
    segment_df = df_pandas.loc[df_pandas['segment'] == segment]

    plt.figure(figsize=(6, 4))
    sns.scatterplot(
        data=segment_df,
        x='average_consumer_fraud_probability',
        y='number_of_orders',
    )
    plt.title(f'Segment: {segment}')
    plt.xlabel('Average Consumer Fraud Probability')
    plt.ylabel('Number of Orders')
    plt.show()


In [None]:
import pandas as pd
import seaborn as sns
# Identify outliers: rows whose number_of_orders is at or below the 1% quantile

# Define a function to find lower-bound outliers (values at or below a low quantile)
def get_outliers(df, column_name, quantile=0.01):
    """Return the rows of ``df`` that sit at or below a lower quantile.

    Args:
        df: pandas DataFrame to inspect.
        column_name: Name of the numeric column the cut-off is computed on.
        quantile: Fraction between 0 and 1 used as the lower cut-off.
            Defaults to 0.01 (the 1% quantile).

    Returns:
        A ``(outliers, lower_bound)`` tuple. ``outliers`` is the subset of
        ``df`` whose ``column_name`` value is ``<= lower_bound`` (the bound
        itself is included); ``lower_bound`` is the computed quantile value.
    """
    lower_bound = df[column_name].quantile(quantile)
    # Echo the threshold so it is visible in the notebook output.
    print(lower_bound)
    return df[df[column_name] <= lower_bound], lower_bound

# Split off the low-order rows; lower_bound is kept because the cleaning
# step further down filters on the same threshold.
outliers_list, lower_bound = get_outliers(
    df=df_pandas,
    column_name='number_of_orders',
)

# Hold the flagged rows in their own DataFrame and inspect them
outliers_df = pd.DataFrame(outliers_list)
display(outliers_df)
print("length: ", outliers_df.shape[0])

In [None]:
# Number of outlier rows in each segment
grouped_outliers = (
    outliers_df
    .groupby('segment')
    .size()
    .reset_index(name='count')
)
display(grouped_outliers)

# Per-segment descriptive statistics of the outlier rows, one table per
# metric, each shown after its label.
for label, metric in (
    ("outlier_merchant_fraud", 'average_merchant_fraud_probability'),
    ("outlier_consumer_fraud", 'average_consumer_fraud_probability'),
    ("cost of order", 'average_cost_of_order'),
):
    display(label, outliers_df.groupby('segment')[metric].describe().reset_index())


In [None]:
columns_to_plot = ['average_merchant_fraud_probability', 'average_consumer_fraud_probability','average_cost_of_order']

# Histogram (with a KDE overlay) of each metric over the outlier rows only.
# The loop variable is named `column`, not `col`: `col` is imported from
# pyspark.sql.functions at the top of the notebook, and a top-level loop
# variable of that name would overwrite the import for every later cell.
for column in columns_to_plot + ['number_of_orders']:
    plt.figure(figsize=(8,5))
    sns.histplot(outliers_df[column], kde=True, bins=30)  # you can adjust the number of bins as needed
    plt.title(f'Outliers Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Keep only the rows whose order count is strictly greater than lower_bound
df_pandas_cleaned = df_pandas[df_pandas['number_of_orders'] > lower_bound]
print("outliers removed length:", df_pandas_cleaned.shape[0])

In [None]:
# Repeat the per-segment scatter plots on the cleaned data: number of orders
# against average consumer fraud probability, one figure per segment.
for segment in df_pandas_cleaned['segment'].unique():
    # Cleaned rows belonging to the current segment only
    segment_df = df_pandas_cleaned.loc[df_pandas_cleaned['segment'] == segment]

    plt.figure(figsize=(5, 3))
    sns.scatterplot(
        data=segment_df,
        x='average_consumer_fraud_probability',
        y='number_of_orders',
    )
    plt.title(f'Consumer Fraud: Segment {segment} (After Removing Outliers)')
    plt.xlabel('Average Consumer Fraud Probability')
    plt.ylabel('Number of Orders')
    plt.show()


In [None]:
# List of columns to analyze. Each column appears exactly once so that no
# summary or histogram is produced twice.
columns_to_analyze = [
    "average_consumer_fraud_probability",
    "avg_median_age",
    "avg_total_weekly_personal_income",
    "take_rate",
    "average_cost_of_order",
    "number_of_orders",
    "number_of_unique_consumers",
    "average_merchant_fraud_probability",
]

# NOTE: from this point on df_pandas refers to the cleaned data.
df_pandas = df_pandas_cleaned
# For each column, calculate summary statistics and plot the distribution
for column in columns_to_analyze:
    summary_stats = df_pandas[column].describe()
    print(f"Summary Statistics for {column}:\n")
    print(summary_stats)
    print("\n" + "-"*50 + "\n")

    # Plot Distribution
    plt.figure(figsize=(12, 6))
    plt.hist(df_pandas[column], bins=100, color='#86bf91', rwidth=0.8)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Number of Merchants')
    plt.grid(axis='y', alpha=0.75)
    plt.tight_layout()
    plt.show()

In [None]:
# Move the cleaned pandas data back into Spark and compute, per segment, the
# mean and standard deviation of consumer fraud probability and order count.
df = spark.createDataFrame(df_pandas_cleaned)
agg_data = df.groupBy("segment").agg(
    F.mean("average_consumer_fraud_probability").alias("mean_consumer_fraud"),
    F.stddev("average_consumer_fraud_probability").alias("stddev_consumer_fraud"),
    F.mean("number_of_orders").alias("mean_orders"),
    F.stddev("number_of_orders").alias("stddev_orders"),
)

# Bring the aggregated result to pandas for plotting
agg_data_pandas = agg_data.toPandas()

# Two stacked bar charts, one per mean: (y column, title, y-axis label)
fig, axes = plt.subplots(2, 1, figsize=(12, 10))
panels = [
    (
        "mean_consumer_fraud",
        'Average Consumer Fraud Probability by Segment',
        'Mean Consumer Fraud Probability',
    ),
    (
        "mean_orders",
        'Average Number of Orders by Segment',
        'Mean Number of Orders',
    ),
]
for ax, (y_column, title, y_label) in zip(axes, panels):
    sns.barplot(x="segment", y=y_column, data=agg_data_pandas, ax=ax)
    ax.set_title(title)
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

In [None]:
# Total sales per segment, where a row's sales are
# number_of_orders * average_cost_of_order.
order_value = F.col("number_of_orders") * F.col("average_cost_of_order")
agg_sales = df.groupBy("segment").agg(F.sum(order_value).alias("total_sales"))

# Bring the aggregated result to pandas for plotting
sales_pandas = agg_sales.toPandas()

# Pie chart of each segment's share of total sales, one palette colour per segment
palette = sns.color_palette("Set3", len(sales_pandas))
plt.figure(figsize=(12, 8))
plt.pie(
    sales_pandas["total_sales"],
    labels=sales_pandas["segment"],
    autopct='%1.1f%%',
    startangle=140,
    colors=palette,
)
plt.title('Total Sales by Segment')
plt.show()

In [None]:
# Save the current Spark DataFrame `df` as parquet, replacing any previous output
(
    df.write
    .mode('overwrite')
    .parquet("../../../data/curated/removed_outliers.parquet")
)


In [None]:
# Stop the Spark session at the end of the notebook
spark.stop()