# Libraries


In [174]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    col,
    substring,
    input_file_name,
    current_date,
    year,
    like,
    asc,
    desc,
    count,
    count_distinct,
    max,
    min,
    avg,
    array_agg,
    array_distinct,
    array_contains,
    row_number,
    rank,
    dense_rank,
    ntile,
    sum,
    lead,
    lag,
    cume_dist,
)


from pyspark.sql.types import (
    IntegerType,
    LongType,
    StructField,
    StructType,
    DateType,
    DoubleType,
    StringType,
    TimestampType,
)

# Spark Session


In [125]:
builder = SparkSession.builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
).config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Loading Data into spark


In [126]:
my_user_schema = StructType(
    [
        StructField("index", IntegerType(), nullable=False),
        StructField("organization_id", StringType(), nullable=False),
        StructField("name", StringType(), nullable=False),
        StructField("website", StringType(), nullable=False),
        StructField("country", StringType(), nullable=False),
        StructField("description", StringType(), nullable=False),
        StructField("founded", IntegerType(), nullable=False),
        StructField("industry", StringType(), nullable=False),
        StructField("employee_no", IntegerType(), nullable=False),
    ]
)

users_df = (
    spark.read.option("header", True)
    .schema(my_user_schema)
    .csv("./datasets/organizations-2000000.csv")
)


click_data_schema = StructType(
    [
        StructField("session_id", IntegerType(), nullable=False),
        StructField("IPID", IntegerType(), nullable=False),
        StructField("timestamp", TimestampType(), nullable=False),
        StructField("VHOST", StringType(), nullable=False),
        StructField("URL_FILE", StringType(), nullable=False),
        StructField("PAGE_NAME", StringType(), nullable=False),
        StructField("REF_URL_category", StringType(), nullable=False),
        StructField("page_load_error", IntegerType(), nullable=False),
        StructField("page_action_detail", StringType(), nullable=False),
        StructField("tip", StringType(), nullable=False),
        StructField("service_detail", StringType(), nullable=False),
        StructField("xps_info", StringType(), nullable=False),
        StructField("page_action_detail_EN", StringType(), nullable=False),
        StructField("service_detail_EN", StringType(), nullable=False),
        StructField("tip_EN", StringType(), nullable=False),
    ]
)

click_data_df = (
    spark.read.option("header", True)
    .option("delimiter", ";")
    .schema(click_data_schema)
    .csv("./datasets/BPI2016_Clicks_NOT_Logged_In.csv")
)

# Converting dataframes to delta tables


In [127]:
# users_df.write.mode("overwrite").format("delta").save("./output/users_df_delta")
# click_data_df.write.mode("overwrite").format("delta").save(
#     "./output/click_data_df_delta"
# )

# Querying delta tables


In [128]:
spark.sql(
    """
DESCRIBE HISTORY delta.`D:\\development\\learn_spark\\output\\users_df_delta`;
"""
).show()


spark.sql(
    """
DESCRIBE HISTORY delta.`D:\\development\\learn_spark\\output\\click_data_df_delta`;
"""
).show()

+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      1|2024-01-17 14:01:...|  NULL|    NULL| OPTIMIZE|{predicate -> [],...|NULL|    NULL|     NULL|          0|SnapshotIsolation|        false|{numRemovedFiles ...|        NULL|Apache-Spark/3.5....|
|      0|2024-01-17 13:54:...|  NULL|    NULL|    WRITE|{mode -> Overwrit...|NULL|    NULL|     NULL|       NULL|     Serializable|        false|{numFiles -> 8, n...|        NULL|Apache-Spark/3.5.

# Performing EDA on data


In [129]:
spark.read.format("delta").load("./output/click_data_df_delta/").printSchema()
spark.read.format("delta").load("./output/click_data_df_delta/").show(5)

spark.read.format("delta").load("./output/users_df_delta/").printSchema()
spark.read.format("delta").load("./output/users_df_delta/").show(5)

root
 |-- session_id: integer (nullable = true)
 |-- IPID: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- VHOST: string (nullable = true)
 |-- URL_FILE: string (nullable = true)
 |-- PAGE_NAME: string (nullable = true)
 |-- REF_URL_category: string (nullable = true)
 |-- page_load_error: integer (nullable = true)
 |-- page_action_detail: string (nullable = true)
 |-- tip: string (nullable = true)
 |-- service_detail: string (nullable = true)
 |-- xps_info: string (nullable = true)
 |-- page_action_detail_EN: string (nullable = true)
 |-- service_detail_EN: string (nullable = true)
 |-- tip_EN: string (nullable = true)

+----------+-------+--------------------+-------------+--------------------+------------------+----------------+---------------+------------------+----+--------------+--------------------+---------------------+-----------------+------+
|session_id|   IPID|           timestamp|        VHOST|            URL_FILE|         PAGE_NAME|REF_URL_catego

## Optimizations

1. In first dataset it is a good choice to z-order the dataset by page name so the records corresponding to the same page are co-located
2. In the second dataset the same logic as point 1 is there for the country attribute.


In [130]:
# spark.sql(
#     """
# OPTIMIZE delta.`D:\\development\\learn_spark\\output\\users_df_delta` ZORDER BY (country);
# """
# ).show()
# spark.sql(
#     """
# OPTIMIZE delta.`D:\\development\\learn_spark\\output\\click_data_df_delta` ZORDER BY (PAGE_NAME);
# """
# ).show()

In [131]:
spark.sql(
    """
DESCRIBE HISTORY delta.`D:\\development\\learn_spark\\output\\users_df_delta`;
"""
).show(truncate=False)


spark.sql(
    """
DESCRIBE HISTORY delta.`D:\\development\\learn_spark\\output\\click_data_df_delta`;
"""
).show(truncate=False)

+-------+-----------------------+------+--------+---------+------------------------------------------+----+--------+---------+-----------+-----------------+-------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation|operationParameters                       |job |notebook|clusterId|readVersion|isolationLevel   |isBlindAppend|operationMetrics                                                                                                                                                                                                                                                      |userMetadata|engineInfo                         |
+-------+-----------------------+-

# Loading delta tables in spark dataframe


In [132]:
user_delta_table_v1 = (
    spark.read.format("delta").option("versionAsOf", 0).load("./output/users_df_delta/")
)
user_delta_table_v2 = (
    spark.read.format("delta").option("versionAsOf", 1).load("./output/users_df_delta/")
)

In [133]:
user_delta_table_v2.createOrReplaceTempView("user_delta_table_v2_tv")

In [134]:
user_delta_table_v2.show(2, vertical=True, truncate=False)

-RECORD 0----------------------------------------
 index           | 1                             
 organization_id | 391dAA77fea9EC1               
 name            | Daniel-Mcmahon                
 website         | https://stuart-rios.biz/      
 country         | Cambodia                      
 description     | Focused eco-centric help-desk 
 founded         | 2013                          
 industry        | Sports                        
 employee_no     | 1878                          
-RECORD 1----------------------------------------
 index           | 2                             
 organization_id | 9FcCA4A23e6BcfA               
 name            | Mcdowell, Tate and Murray     
 website         | http://jacobs.biz/            
 country         | Guyana                        
 description     | Front-line real-time portal   
 founded         | 2018                          
 industry        | Legal Services                
 employee_no     | 9743                          


# Perfrom Group By and aggregations


In [135]:
# spark.sql(
#     """
#     SELECT
#     name,
#     count(distinct country) as distinct_country_count,
#     count(distinct employee_no) as distinct_no_of_employees
#     from user_delta_table_v2_tv
#     group by name
#     order by distinct_no_of_employees desc;
#     """
# ).show(5)


spark.sql(
    """
    SELECT
    
    employee_no,
    count(employee_no) as employee_count,
    array_distinct(array_agg(country)) as country_list,
    array_contains(array_agg(country), 'India') as contains_india

    from user_delta_table_v2_tv 
    group by employee_no
    order by employee_count desc;
    """
).show(5)


# spark.sql(
#     """
#     SELECT * from user_delta_table_v2_tv where name like '%Bradley PLC%';
#     """
# ).show(5)

+-----------+--------------+--------------------+--------------+
|employee_no|employee_count|        country_list|contains_india|
+-----------+--------------+--------------------+--------------+
|       4846|           261|[Samoa, Malta, Ja...|          true|
|       2384|           251|[Slovakia (Slovak...|          true|
|       3390|           250|[Rwanda, Uruguay,...|         false|
|       1589|           249|[Iran, Puerto Ric...|          true|
|       1961|           247|[Macedonia, Malay...|         false|
+-----------+--------------+--------------------+--------------+
only showing top 5 rows



In [136]:
user_delta_table_v2.groupBy("employee_no").agg(
    count(user_delta_table_v2.employee_no).alias("employee_no_count"),
    array_distinct(array_agg(user_delta_table_v2.country)).alias(
        "distinct_country_list"
    ),
    array_contains(array_agg(user_delta_table_v2.country), "India").alias(
        "is_india_a_part"
    ),
    avg(user_delta_table_v2.employee_no).alias("average_employees"),
).sort(desc("employee_no_count")).show(5)

+-----------+-----------------+---------------------+---------------+-----------------+
|employee_no|employee_no_count|distinct_country_list|is_india_a_part|average_employees|
+-----------+-----------------+---------------------+---------------+-----------------+
|       4846|              261| [Samoa, Malta, Ja...|           true|           4846.0|
|       2384|              251| [Slovakia (Slovak...|           true|           2384.0|
|       3390|              250| [Rwanda, Uruguay,...|          false|           3390.0|
|       1589|              249| [Iran, Puerto Ric...|           true|           1589.0|
|       7828|              247| [Saint Vincent an...|           true|           7828.0|
+-----------+-----------------+---------------------+---------------+-----------------+
only showing top 5 rows



# Windowing on data


In [163]:
ds_salaries_df = spark.read.options(header=True, inferSchema=True).csv(
    "./datasets/ds_salaries.csv"
)
ds_salaries_df.createOrReplaceTempView("ds_salaries_tv")

## Available functions

- Simple aggregation functions: avg, sum, min, max, count
- Row-wise ordering and ranking functions: row_number, rank, dense_rank, percent_rank, ntile (divides data into n buckets)
- Creating lagged columns: lag, lead
- Combining Windows and Calling Functions: over
- Analytic functions: cume_dist, first_value, last_value, nth_value
- Aggregate functions: collect_list, collect_set, corr, covar_pop, covar_samp, stddev, stddev_pop, stddev_samp, variance, var_pop, var_samp


In [197]:
spark.sql(
    """
select

tab.*,


ROW_NUMBER() OVER(PARTITION BY job_title ORDER BY salary) as row_no,
RANK() OVER(PARTITION BY job_title ORDER BY salary) as rank,
dense_rank() OVER(PARTITION BY job_title ORDER BY salary) as dense_rank,
PERCENT_RANK() OVER(PARTITION BY job_title ORDER BY salary) as percent_rank,
NTILE(3) OVER(PARTITION BY job_title ORDER BY salary) as ntile,
lead(salary, 1, 404) OVER(PARTITION BY job_title ORDER BY salary) as lead,
lag(salary, 1, 404) OVER(PARTITION BY job_title ORDER BY salary) as lag,

avg(salary) OVER(PARTITION BY job_title ORDER BY salary) as avg,
min(salary) OVER(PARTITION BY job_title ORDER BY salary) as min,
max(salary) OVER(PARTITION BY job_title ORDER BY salary) as max,
sum(salary) OVER(PARTITION BY job_title ORDER BY salary) as sum,
cume_dist() OVER(PARTITION BY job_title ORDER BY salary) as cumdist


from ds_salaries_tv as tab;
"""
).show(5)

+---+---------+----------------+---------------+--------------------+------+---------------+-------------+------------------+------------+----------------+------------+------+----+----------+------------------+-----+------+-----+------------------+------+------+------+-------------------+
|_c0|work_year|experience_level|employment_type|           job_title|salary|salary_currency|salary_in_usd|employee_residence|remote_ratio|company_location|company_size|row_no|rank|dense_rank|      percent_rank|ntile|  lead|  lag|               avg|   min|   max|   sum|            cumdist|
+---+---------+----------------+---------------+--------------------+------+---------------+-------------+------------------+------------+----------------+------------+------+----+----------+------------------+-----+------+-----+------------------+------+------+------+-------------------+
| 77|     2021|              MI|             PT|3D Computer Visio...|400000|            INR|         5409|                IN|     

In [182]:
windowSpec = Window.partitionBy("job_title").orderBy("salary")


ds_salaries_df.withColumn("row_number", row_number().over(windowSpec)).withColumn(
    "rank", rank().over(windowSpec)
).withColumn("dense_rank", dense_rank().over(windowSpec)).withColumn(
    "avg", avg(col("salary")).over(windowSpec)
).withColumn(
    "min", min(col("salary")).over(windowSpec)
).withColumn(
    "max", max(col("salary")).over(windowSpec)
).withColumn(
    "sum", sum(col("salary")).over(windowSpec)
).withColumn(
    "lead", lead("salary", 2).over(windowSpec)
).withColumn(
    "lag", lag("salary", 2).over(windowSpec)
).withColumn(
    "cume_dist", cume_dist().over(windowSpec)
).show(
    50
)

+---+---------+----------------+---------------+--------------------+--------+---------------+-------------+------------------+------------+----------------+------------+----------+----+----------+------------------+------+--------+--------+--------+------+-------------------+
|_c0|work_year|experience_level|employment_type|           job_title|  salary|salary_currency|salary_in_usd|employee_residence|remote_ratio|company_location|company_size|row_number|rank|dense_rank|               avg|   min|     max|     sum|    lead|   lag|          cume_dist|
+---+---------+----------------+---------------+--------------------+--------+---------------+-------------+------------------+------------+----------------+------------+----------+----+----------+------------------+------+--------+--------+--------+------+-------------------+
| 77|     2021|              MI|             PT|3D Computer Visio...|  400000|            INR|         5409|                IN|          50|              IN|         

# Theory Data Warehousing Technologies


PolyBase is a technology that enables the integration and querying of both relational and non-relational data across on-premises and cloud environments in a seamless manner. It was originally developed by Microsoft and is part of their data platform offerings, including SQL Server and Azure Synapse Analytics (formerly SQL Data Warehouse).

Key features and aspects of PolyBase include:

1. **Unified Query Processing:**
   PolyBase allows users to query data stored in relational databases, data warehouses, and non-relational data sources using a standard SQL interface. This unified query processing enables the integration of diverse data types.

2. **Data Virtualization:**
   With PolyBase, you can create external tables that reference data stored in external sources without physically moving the data. This data virtualization feature makes it possible to query and join data from different sources as if it were stored in a single location.

3. **Parallel Processing:**
   PolyBase leverages parallel processing capabilities to enhance performance. It allows for the parallel execution of queries across distributed data sources, making it well-suited for large-scale analytics and data warehousing.

4. **Integration with Hadoop:**
   Originally, PolyBase was designed to integrate with Hadoop-based systems. It can read and write data directly to Hadoop Distributed File System (HDFS), providing a bridge between SQL-based systems and the Hadoop ecosystem.

5. **Support for Azure Blob Storage and Azure Data Lake Storage:**
   In addition to Hadoop integration, PolyBase also supports Azure Blob Storage and Azure Data Lake Storage, allowing users to seamlessly query and analyze data stored in Azure cloud storage.

6. **Azure Synapse Analytics Integration:**
   PolyBase is a key component of Azure Synapse Analytics, where it is used to integrate data stored in data lakes, data warehouses, and other sources. It enables users to perform analytics and reporting across various data platforms within the Azure ecosystem.

7. **External Tables and External Data Sources:**
   Users define external tables in SQL Server or Azure Synapse Analytics that reference data stored externally. External data sources are used to establish a connection to external data storage locations.

8. **Security:**
   PolyBase includes security features to ensure that sensitive data is protected. It supports authentication mechanisms and encryption for data in transit.

PolyBase is particularly beneficial in scenarios where organizations have a mix of structured and unstructured data stored in different locations, and they need a unified approach to query and analyze that data. It simplifies the process of working with data across diverse data platforms, providing a more integrated and efficient data analytics experience.


In Microsoft Azure, hierarchy often refers to the organization of resources within Azure Active Directory (Azure AD) or the structure of resources within Azure Resource Manager (ARM) for managing and organizing resources. Let's explore both aspects:

### 1. **Azure Active Directory Hierarchy:**

Azure AD is Microsoft's cloud-based identity and access management service. In the context of Azure AD, hierarchy can refer to the structure of directories and the relationships between various entities. Key components related to hierarchy in Azure AD include:

- **Directories (Tenants):** Azure AD supports multiple directories, also known as tenants. Each directory is an isolated instance with its own users, groups, and applications. The hierarchy here involves the relationships between different directories.

- **Users and Groups:** Within each directory, you have users and groups. Users can be organized into groups to simplify access management. Groups can also be nested to form a hierarchical structure.

- **Roles:** Azure AD provides built-in roles and custom roles that grant users specific permissions. Roles can be assigned at different levels, such as the directory level, subscription level, or resource group level, contributing to the hierarchy of access control.

### 2. **Azure Resource Manager (ARM) Hierarchy:**

Azure Resource Manager is the deployment and management service in Azure that allows you to work with resources as a group. In ARM, hierarchy refers to the structure of resources within a subscription and resource groups. Key concepts related to hierarchy in ARM include:

- **Subscription:** The top-level container for resources in Azure. Subscriptions are associated with billing and access control boundaries.

- **Resource Groups:** Resources are organized into resource groups, which are logical containers that help manage and organize resources. Resource groups can be considered as the next level of hierarchy under a subscription.

- **Resources:** Azure resources, such as virtual machines, storage accounts, databases, etc., are organized within resource groups. Each resource type has its own set of properties and configurations.

- **Management Groups:** Azure management groups are a way to organize subscriptions within an organization. They provide a level of hierarchy above subscriptions and allow for centralized policy enforcement and governance.

### Example (ARM Hierarchy):

- Subscription (e.g., "MyAzureSubscription")
  - Resource Group (e.g., "RG-WebApp")
    - App Service (e.g., "MyWebApp")
    - SQL Database (e.g., "MyDatabase")
  - Resource Group (e.g., "RG-Storage")
    - Storage Account (e.g., "MyStorageAccount")

The hierarchy in ARM allows for logical grouping of resources, making it easier to manage and organize resources within a subscription.

In summary, hierarchy in Microsoft Azure encompasses the organization and relationships between resources in both Azure Active Directory and Azure Resource Manager. Understanding these hierarchies is crucial for effective resource management, access control, and governance in the Azure cloud environment.


| Feature                                   | Azure Data Factory                                                                                                                     | Azure Synapse Analytics                                                                                                  |
| ----------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------ |
| **Purpose**                               | Data integration and orchestration service for ETL (Extract, Transform, Load) workflows.                                               | Integrated analytics service that brings big data and data warehousing together.                                         |
| **Primary Use Case**                      | Moving and transforming data across various data stores, both on-premises and in the cloud.                                            | Data warehousing, big data analytics, and data integration within a unified environment.                                 |
| **Data Processing Engine**                | Uses Azure Data Factory Managed Clusters for ETL processing.                                                                           | Combines on-demand and provisioned resources with MPP (Massively Parallel Processing) architecture.                      |
| **Data Integration Capabilities**         | Supports diverse data sources and destinations, including cloud and on-premises databases, flat files, and more.                       | Integrates data across various sources, including data lakes and data warehouses.                                        |
| **Transformation Capabilities**           | Provides data wrangling and transformation using Mapping Data Flow.                                                                    | Offers data transformations through PolyBase, T-SQL, and various built-in functions.                                     |
| **Scale**                                 | Scales vertically by adjusting the Data Factory configuration.                                                                         | Scales horizontally by adding more Data Warehouse Units (DWUs) in Synapse Analytics.                                     |
| **Query Language**                        | Uses Azure Data Factory Data Flow for transformations, and supports various languages for data movement and transformation activities. | Utilizes T-SQL (Transact-SQL) for querying data in Synapse SQL Pools.                                                    |
| **Integration with Other Azure Services** | Integrates with various Azure services, including Azure Storage, Azure SQL Database, Azure Databricks, and more.                       | Deep integration with Azure services, such as Azure Machine Learning, Power BI, and Azure Purview.                       |
| **Data Security**                         | Provides security features such as Managed Virtual Network, Data Encryption, and Managed Private Endpoints.                            | Implements security features like Transparent Data Encryption (TDE), Row-level Security (RLS), and dynamic data masking. |
| **Cost Model**                            | Pay-as-you-go pricing based on the number of data movement and data transformation activities.                                         | Consumption-based pricing with separate costs for data storage and data processing.                                      |
| **Management and Monitoring**             | Offers Azure Monitor integration for monitoring and managing pipelines.                                                                | Includes Azure Monitor for performance monitoring and Azure Security Center for security monitoring.                     |
| **Scalability**                           | Scalable, but scaling requires adjusting the configuration of the Data Factory.                                                        | Highly scalable using on-demand resources and the ability to adjust DWUs based on workload demands.                      |
| **Real-time Analytics**                   | Limited support for real-time analytics.                                                                                               | Supports real-time analytics through features like streaming ingest and PolyBase external tables.                        |
| **Hybrid Cloud Support**                  | Supports hybrid cloud scenarios for connecting on-premises data sources.                                                               | Designed for cloud-native analytics but can connect to on-premises data sources using PolyBase.                          |


Certainly! Below is a detailed comparison between transactional tables and dimensional tables presented in a tabular format:

| Feature                 | Transactional Tables                                        | Dimensional Tables                                              |
| ----------------------- | ----------------------------------------------------------- | --------------------------------------------------------------- |
| **Purpose**             | Capture and store detailed, real-time transactional data.   | Organize data for analysis and reporting in a data warehouse.   |
| **Granularity**         | Fine-grained with individual transactions.                  | Coarser-grained with aggregated or summarized data.             |
| **Data Volume**         | Typically large due to detailed transactional records.      | Relatively smaller compared to transactional tables.            |
| **Schema Design**       | Often normalized to reduce redundancy.                      | Typically denormalized for simplicity and performance.          |
| **Structure**           | Typically contain many attributes and relationships.        | Usually contain fewer attributes, focused on descriptive data.  |
| **Updates and Inserts** | Frequent updates and inserts for real-time data.            | Relatively stable with occasional batch updates.                |
| **Indexes**             | Often have indexes to optimize transactional queries.       | May have indexes, but the focus is on query performance.        |
| **Query Complexity**    | Complex queries to retrieve specific transactional details. | Simplified queries for reporting and analysis.                  |
| **Historical Data**     | May keep historical data for audit or compliance purposes.  | May keep historical data for tracking changes over time.        |
| **Join Operations**     | Frequently involve multiple joins for detailed analysis.    | Less reliance on joins; more focus on star or snowflake schema. |
| **Commonly Used in**    | OLTP (Online Transaction Processing) systems.               | OLAP (Online Analytical Processing) systems.                    |
| **Examples**            | Order details, transaction records, customer interactions.  | Customer dimension, product dimension, time dimension.          |

This table outlines the key differences between transactional tables, which are optimized for capturing and processing individual transactions in real-time, and dimensional tables, which are designed for analytical reporting and data analysis in a data warehouse environment. The choice between the two types depends on the specific needs of the application or analytical workload.


# Star Schema and Snowflake Schema

## What is star Schema ?

A `star schema` is a multi-dimensional data model used to organize data in a database so that it is easy to understand and analyze. Star schemas can be applied to data warehouses, databases, data marts, and other tools. The star schema design is optimized for querying large data sets.

Introduced by Ralph Kimball in the 1990s, star schemas are efficient at storing data, maintaining history, and updating data by reducing the duplication of repetitive business definitions, making it fast to aggregate and filter data in the data warehouse.

<img src="https://www.databricks.com/wp-content/uploads/2022/04/star-schema-erd.png">

### Fact tables and dimension tables

A star schema is used to denormalize business data into dimensions (like time and product) and facts (like transactions in amounts and quantities).

A star schema has a single fact table in the center, containing business "facts" (like transaction amounts and quantities). The fact table connects to multiple other dimension tables along "dimensions" like time, or product. Star schemas enable users to slice and dice the data however they see fit, typically by joining two or more fact tables and dimension tables together.

### Denormalized data

Star schemas denormalize the data, which means adding redundant columns to some dimension tables to make querying and working with the data faster and easier. The purpose is to trade some redundancy (duplication of data) in the data model for increased query speed, by avoiding computationally expensive join operations.

In this model, the fact table is normalized but the dimensions tables are not. That is, data from the fact table exists only on the fact table, but dimensional tables may hold redundant data.

### Benefits of star schemas

1. Fact/dimensional models like star schemas are simple to understand and implement, and make it easy for end users to find the data they need. They can be applied to data marts and other data resources.
2. Great for simple queries because of their reduced dependency on joins when accessing the data, as compared to normalized models like snowflake schemas.
3. Adapt well to fit OLAP models.
4. Improved query performance as compared to normalized data, because star schemas attempt to avoid computationally expensive joins.

### How does a star schema differ from 3NF (Third Normal Form)?

3NF, or Third Normal Form, is a method of reducing data-redundancy through normalization. It is a common standard for databases that are considered fully normalized. It typically has more tables than a star schema due to data normalization. On the flip-side, queries tend to be more complex due to the increased number of joins between large tables.

## What is a snowflake schema?

A snowflake schema is a multi-dimensional data model that is an extension of a star schema, where dimension tables are broken down into subdimensions. Snowflake schemas are commonly used for business intelligence and reporting in OLAP data warehouses, data marts, and relational databases.

In a snowflake schema, engineers break down individual dimension tables into logical subdimensions. This makes the data model more complex, but it can be easier for analysts to work with, especially for certain data types.

It's called a snowflake schema because its entity-relationship diagram (ERD) looks like a snowflake, as seen below.

<img src = "https://cms.databricks.com/sites/default/files/inline-images/snowflake-schema-120723_0.png">

### Snowflake schemas vs. star schemas

Like star schemas, snowflake schemas have a central fact table which is connected to multiple dimension tables via foreign keys. However, the main difference is that they are more normalized than star schemas.

Snowflake schemas offer more storage efficiency, due to their tighter adherence to high normalization standards, but query performance is not as good as with more denormalized data models. Denormalized data models like star schemas have more data redundancy (duplication of data), which makes query performance faster at the cost of duplicated data.

### Benefits of snowflake schemas

1. Fast data retrieval
2. Enforces data quality
3. Simple, common data model for data warehousing

### Drawbacks of snowflake schemas

1. Lots of overhead upon initial setup
2. Rigid data model
3. High maintenance costs


Certainly! Below is a tabular comparison between Snowflake Schema and Star Schema:

| Feature                      | Snowflake Schema                                                             | Star Schema                                                                  |
| ---------------------------- | ---------------------------------------------------------------------------- | ---------------------------------------------------------------------------- |
| **Normalization**            | More normalized, with dimension tables further split into sub-dimensions.    | Denormalized, with fewer levels of hierarchy and less normalization.         |
| **Table Structure**          | Complex structure with multiple levels of related tables.                    | Simplified structure with a central fact table and dimension tables.         |
| **Join Complexity**          | Higher join complexity due to more tables and relationships.                 | Lower join complexity, typically involving fewer tables.                     |
| **Query Performance**        | May result in slightly slower query performance due to more joins.           | Generally faster query performance, especially for certain types of queries. |
| **Storage Space**            | May require more storage space due to normalization.                         | Often requires less storage space due to denormalization.                    |
| **Ease of Maintenance**      | May be more complex to maintain due to the multiple levels of relationships. | Simpler to maintain, as there are fewer tables and relationships.            |
| **Flexibility in Reporting** | Offers flexibility in reporting due to detailed sub-dimensions.              | Provides simplicity for reporting, especially for common queries.            |
| **Scalability**              | May be less scalable due to more tables and joins.                           | Generally more scalable, especially for read-heavy workloads.                |
| **Common Use Case**          | Well-suited for complex business scenarios requiring detailed analysis.      | Commonly used for data warehousing and business intelligence applications.   |
| **Examples**                 | Healthcare systems, financial systems with complex hierarchies.              | Retail sales reporting, customer relationship management (CRM).              |

This table provides a high-level comparison between the Snowflake Schema and Star Schema, two common schema designs used in data warehousing. The choice between them depends on the specific requirements of the data warehouse and the nature of the analytical queries that need to be supported.


# Database Normalization forms

Normalization is a process in database design that helps organize data in a relational database to reduce redundancy, improve data integrity, and enhance data management efficiency. The normalization process is defined by a series of normal forms, each addressing specific issues related to data organization. The most commonly discussed normalization forms are the first normal form (1NF), second normal form (2NF), third normal form (3NF), Boyce-Codd normal form (BCNF), and fourth normal form (4NF). Let's explore each in detail:

### 1. First Normal Form (1NF):

**Definition:** A relation is in 1NF if it contains only atomic values, and there are no repeating groups or arrays.

**Requirements:**

- All attributes must have atomic (indivisible) values.
- No repeating groups or arrays are allowed.

**Example:**

```
StudentID | Courses
-------------------
101       | Math, Physics
102       | Chemistry
```

Converting to 1NF:

```
StudentID | Course
-----------------
101       | Math
101       | Physics
102       | Chemistry
```

### 2. Second Normal Form (2NF):

**Definition:** A relation is in 2NF if it is in 1NF and all non-key attributes are fully functionally dependent on the entire primary key.

**Requirements:**

- Be in 1NF.
- All non-key attributes should be fully functionally dependent on the primary key.

**Example:**

```
OrderID | ProductID | Quantity | Price
-------------------------------------
1       | A         | 10       | $5
1       | B         | 5        | $10
2       | A         | 2        | $5
```

Converting to 2NF:

```
OrderID | ProductID | Quantity
-------------------------------
1       | A         | 10
1       | B         | 5
2       | A         | 2

ProductID | Price
-----------------
A         | $5
B         | $10
```

### 3. Third Normal Form (3NF):

**Definition:** A relation is in 3NF if it is in 2NF and all transitive dependencies are removed.

**Requirements:**

- Be in 2NF.
- Eliminate transitive dependencies.

**Example:**

```
EmployeeID | Department | Location
----------------------------------
101        | HR         | Building1
102        | IT         | Building2
```

Converting to 3NF:

```
EmployeeID | Department
------------------------
101        | HR
102        | IT

Department | Location
---------------------
HR         | Building1
IT         | Building2
```

### 4. Boyce-Codd Normal Form (BCNF):

**Definition:** A relation is in BCNF if it is in 3NF, and for every non-trivial functional dependency, the determinant is a superkey.

**Requirements:**

- Be in 3NF.
- Every non-trivial functional dependency has a superkey as its determinant.

### 5. Fourth Normal Form (4NF):

**Definition:** A relation is in 4NF if it is in BCNF and multi-valued dependencies are eliminated.

**Requirements:**

- Be in BCNF.
- Eliminate multi-valued dependencies.

These normal forms guide database designers in structuring data to reduce redundancy and maintain data integrity. The choice of the normalization level depends on the specific requirements of the application and the nature of the data being stored.
