# Dynamically generate schemas from an existing DataFrame 

Besides loading a `DataSet` from an existing table, we also provide `create_schema()`, which generates a `Schema` from a `DataFrame` that you already have in memory. This gives you autocomplete on `DataSets` that you create on the fly. A great example is a pivot table.

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session with the console progress bar switched off,
# then restrict Spark's logging to errors so the notebook output stays readable.
spark = (
    SparkSession.Builder()
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

In [2]:
from datetime import datetime, timedelta

from pyspark.sql.functions import first
from pyspark.sql.types import DateType, LongType, StringType
from typedspark import Column, Schema, create_partially_filled_dataset, create_schema

# getOrCreate() hands back the active session if one already exists.
spark = SparkSession.builder.getOrCreate()
# Fixed reference day; every date in the example data is an offset from it.
date = datetime(year=2023, month=10, day=2)


# Typed schema of the example vaccinations table that is built right below.
class Vaccinations(Schema):
    vaccination_id: Column[LongType]
    pet_id: Column[LongType]  # grouping key of the pivot further down
    vaccine_name: Column[StringType]  # its values become the pivoted column names
    vaccine_date: Column[DateType]  # in the example data: next_due_date minus 365 days
    next_due_date: Column[DateType]  # the value aggregated with first() in the pivot


# Number of days after `date` at which each of the seven vaccinations is next due.
_due_offsets = (32, 6, 12, 15, 2, 1, 3)

# Build the seven-row example dataset and print it.
vaccinations = create_partially_filled_dataset(
    spark,
    Vaccinations,
    {
        Vaccinations.vaccination_id: list(range(1, 8)),
        Vaccinations.pet_id: [1, 2, 3, 1, 3, 2, 3],
        Vaccinations.vaccine_name: ["rabies"] * 3 + ["lyme"] * 2 + ["influenza"] * 2,
        Vaccinations.next_due_date: [date + timedelta(days=n) for n in _due_offsets],
        # Each vaccine was administered 365 days before it is next due.
        Vaccinations.vaccine_date: [
            date + timedelta(days=n) - timedelta(days=365) for n in _due_offsets
        ],
    },
)
vaccinations.show()


from pyspark.sql.types import IntegerType, StringType

from typedspark import Column, Schema


# Schema with an integer `id` column and three string columns (`a`, `b__`, `c`).
# NOTE(review): no other visible cell refers to PivotTable, and these column
# names differ from the vaccine names used by the pivot in this notebook --
# presumably left over from another example; confirm before relying on it.
class PivotTable(Schema):
    id: Column[IntegerType]
    a: Column[StringType]
    b__: Column[StringType]
    c: Column[StringType]

We can use this as a regular `Schema`:

In [None]:
# One row per pet, one column per distinct vaccine name.
pivot = vaccinations.groupby(Vaccinations.pet_id).pivot(Vaccinations.vaccine_name.str)
# Each cell holds the first next_due_date found for that pet/vaccine pair.
pivot = pivot.agg(first(Vaccinations.next_due_date))

# create_schema() returns the data together with a Schema class generated from
# its columns; rebind `pivot` to the returned dataset and keep the class as `Pivot`.
pivot, Pivot = create_schema(pivot)
pivot.show()

In [None]:
# Bare expression: in a notebook cell this displays the generated schema class.
Pivot

In [3]:
# Keep only the pets that have an influenza due date. `influenza` is one of the
# columns produced by pivoting on vaccine_name, exposed as an attribute of Pivot.
pivot.filter(Pivot.influenza.isNotNull()).show()

+---+-----+-----+-----+
| id|    a|  b__|    c|
+---+-----+-----+-----+
|  1|alpha|alpha| beta|
|  3|alpha| beta|gamma|
+---+-----+-----+-----+

