In [1]:
import os

# Point both PySpark interpreter settings at the Python inside the local virtualenv.
# The path is relative, so the notebook must be started from the directory that
# contains `anova_venv`.
os.environ.update(
    {
        "PYSPARK_PYTHON": "./anova_venv/bin/python3",
        "PYSPARK_DRIVER_PYTHON": "./anova_venv/bin/python3",
    }
)

In [2]:
from pyspark.sql import SparkSession

# Obtain the session used by every later cell (application name: "Anova_Demo").
spark = (
    SparkSession.builder
    .appName("Anova_Demo")
    .getOrCreate()
)

## Load dataset

In [3]:
fn = "anova_demo.csv"

# Read the comma-separated file: the first row is the header, column types are
# inferred, and rows that fail to parse are dropped (DROPMALFORMED).
df_anova = spark.read.options(
    header="true",
    delimiter=",",
    inferSchema="true",
    mode="DROPMALFORMED",
).csv(fn)

In [4]:
# Show the column names and the types Spark inferred, to confirm the CSV was parsed as intended.
df_anova.printSchema()

root
 |-- OF: integer (nullable = true)
 |-- algorithm: string (nullable = true)



In [5]:
# Summary statistics (count, mean, stddev, min, max) as a quick sanity check of the loaded data.
df_anova.describe().show()

+-------+------------------+---------+
|summary|                OF|algorithm|
+-------+------------------+---------+
|  count|              1200|     1200|
|   mean|            281.86|     null|
| stddev|102.07415447037693|     null|
|    min|                82|      CBC|
|    max|               658|   NEWCBC|
+-------+------------------+---------+



Data is loaded correctly! Let's now import the ANOVA function and test it on the demo DataFrame.

## Import and test ANOVA function

If the import doesn't work, create an empty file named `__init__.py` in the same directory as the `anova_in_spark` module.

In [6]:
from anova_in_spark import one_way_anova

Now the custom one-way ANOVA function can be used:

In [7]:
# Run the custom one-way ANOVA on the demo data and unpack its five return values
# (sswg / ssbg are presumably the within- and between-group sums of squares -- confirm
# against anova_in_spark).
sswg, ssbg, f_statistic, df1, df2 = one_way_anova(
    df_anova,
    "algorithm",  # column that defines the groups
    "OF",  # column holding the measured values
)
f_statistic

9.126587788027538e-05

In [8]:
# First degrees-of-freedom value returned by one_way_anova
# (presumably between groups, i.e. number of groups - 1 -- confirm against anova_in_spark).
df1

2

In [9]:
# Second degrees-of-freedom value returned by one_way_anova
# (presumably within groups, i.e. number of rows - number of groups -- confirm against anova_in_spark).
df2

1197

To double-check the results, I will compare them with SciPy's implementation of the one-way ANOVA.

In [10]:
import math

from scipy import stats

# Collect the Spark DataFrame onto the driver as a pandas DataFrame; this is fine
# for a demo-sized table but would not scale to a large dataset.
df_anova_pd = df_anova.toPandas()

# Build one sample of OF values per algorithm from the data itself instead of
# hard-coding the labels, so no group present in the data can be silently left
# out of the cross-check.
of_samples = [sample for _, sample in df_anova_pd.groupby('algorithm')['OF']]
scipy_result = stats.f_oneway(*of_samples)

# Make the "results match" claim executable: fail loudly if the custom Spark
# implementation and SciPy disagree beyond floating-point noise.
assert math.isclose(scipy_result.statistic, f_statistic, rel_tol=1e-9), (
    f"F statistic mismatch: spark={f_statistic!r}, scipy={scipy_result.statistic!r}"
)

# Last expression of the cell: display the SciPy result (statistic and p-value).
scipy_result

F_onewayResult(statistic=9.1265877880229e-05, pvalue=0.9999087382936496)

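The two F statistics agree to about 12 significant digits (9.126587788027538e-05 from the custom function vs. 9.1265877880229e-05 from SciPy), therefore the custom ANOVA works as expected!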