In [None]:
from datetime import date, datetime

import pandas as pd
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, StringType, StructField, StructType

In [None]:
# Attach to an already-running Spark session if there is one, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Eager evaluation: a DataFrame left as a cell's last expression is rendered
# as a table of its first rows; the preview is capped at 5 rows.
for conf_key, conf_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.repl.eagerEval.maxNumRows", 5),
):
    spark.conf.set(conf_key, conf_value)

### Reading the iris data

In [None]:
# Explicit schema: four nullable double measurements followed by the species label.
# Supplying it up front means Spark does not have to infer column types.
schema = StructType(
    [
        StructField(measurement, DoubleType(), True)
        for measurement in (
            "sepal_length",
            "sepal_width",
            "petal_length",
            "petal_width",
        )
    ]
    + [StructField("species", StringType(), True)]
)

# NOTE(review): header=False treats the first line of the file as data. If
# iris.csv starts with a header line, that line is loaded as an extra row
# (unparseable numeric fields become null) - confirm against the file.
df = spark.read.csv("../data/iris.csv", schema=schema, header=False)

In [None]:
# Bare expression as the cell's last line: the notebook shows the DataFrame's repr.
# With spark.sql.repl.eagerEval.enabled set earlier, that repr is a rendered
# preview table limited by spark.sql.repl.eagerEval.maxNumRows.
df

In [None]:
# Print the column names, types and nullability as a tree (sanity-check the schema).
df.printSchema()

In [None]:
# Column names as a plain Python list, in schema order.
df.columns

In [None]:
# Total number of rows; count() is an action, so this runs a Spark job.
# NOTE(review): the file is read with header=False, so a header line (if the
# CSV has one) would be included in this count - confirm.
df.count()

In [None]:
# Summary statistics (count, mean, stddev, min, max) per column, returned as a DataFrame.
df.describe()

In [None]:
# Project a single column; the result is still a (one-column) DataFrame.
df.select("sepal_length")

In [None]:
# Keep only rows where both sepal measurements exceed their thresholds.
# Column predicates are combined with `&` (Python's `and` does not work on Columns).
long_sepal = col("sepal_length") > 5
wide_sepal = col("sepal_width") > 3
filtered_df = df.filter(long_sepal & wide_sepal)

In [None]:
# Number of rows that satisfy both sepal thresholds.
filtered_df.count()

### Count rows per species (the equivalent of pandas `df["species"].value_counts()`)

In [None]:
# Species frequencies among the filtered rows, most common first.
# groupBy(...).count() adds a column literally named "count", which is then sorted on.
(
    filtered_df.groupBy("species")
    .count()
    .orderBy("count", ascending=False)
)

In [None]:
# Species frequencies over the whole dataset, most common first.
# groupBy(...).count() adds a column literally named "count", which is then sorted on.
(
    df.groupBy("species")
    .count()
    .orderBy("count", ascending=False)
)

### Which flowers are in the top decile by petal area?

In [None]:
# Print the largest 10% of rows by petal area (petal_length * petal_width),
# biggest first. Floor division means a dataset with fewer than 10 rows
# prints an empty table.
petal_top_n = df.count() // 10
petal_ranked = df.withColumn(
    "petal_area", col("petal_length") * col("petal_width")
).orderBy("petal_area", ascending=False)
petal_ranked.show(petal_top_n)

### Which flowers are in the top decile by sepal area?

In [None]:
# Print the largest 10% of rows by sepal area (sepal_length * sepal_width),
# biggest first. Floor division means a dataset with fewer than 10 rows
# prints an empty table.
sepal_top_n = df.count() // 10
sepal_ranked = df.withColumn(
    "sepal_area", col("sepal_length") * col("sepal_width")
).orderBy("sepal_area", ascending=False)
sepal_ranked.show(sepal_top_n)