# Creating Delta Table

## Method 1: PySpark

### Using Create Keyword

In [0]:
# Create the Delta table `employee_demo` at an explicit storage location.
# DeltaTable.create behaves like SQL CREATE TABLE: it errors if the table already exists.
# Only the name actually used is imported (no wildcard), so the cell's dependency is explicit.
from delta.tables import DeltaTable

(
    DeltaTable.create(spark)
    .tableName("employee_demo")
    .addColumn("emp_id", "INT")
    .addColumn("emp_name", "STRING")
    .addColumn("gender", "STRING")
    .addColumn("salary", "INT")
    .addColumn("Dept", "STRING")
    .property("description", "table created for demo purpose")
    .location("/FileStore/tables/delta/createtable")
    .execute()
)

Out[1]: <delta.tables.DeltaTable at 0x7f4984095970>

### Using CreateIfNotExists Keyword

In [0]:
# Create the Delta table `employee_demo` only when it does not exist yet.
# DeltaTable.createIfNotExists behaves like SQL CREATE TABLE IF NOT EXISTS:
# if the table is already there, nothing is changed.
# Only the name actually used is imported (no wildcard), so the cell's dependency is explicit.
from delta.tables import DeltaTable

(
    DeltaTable.createIfNotExists(spark)
    .tableName("employee_demo")
    .addColumn("emp_id", "INT")
    .addColumn("emp_name", "STRING")
    .addColumn("gender", "STRING")
    .addColumn("salary", "INT")
    .addColumn("Dept", "STRING")
    .property("description", "table created for demo purpose")
    .location("/FileStore/tables/delta/createtable")
    .execute()
)

Out[2]: <delta.tables.DeltaTable at 0x7f4984762550>

### Using CreateOrReplace Keyword

In [0]:
# Create the Delta table `employee_demo`, replacing its definition if it already exists.
# DeltaTable.createOrReplace behaves like SQL CREATE OR REPLACE TABLE.
# Only the name actually used is imported (no wildcard), so the cell's dependency is explicit.
from delta.tables import DeltaTable

(
    DeltaTable.createOrReplace(spark)
    .tableName("employee_demo")
    .addColumn("emp_id", "INT")
    .addColumn("emp_name", "STRING")
    .addColumn("gender", "STRING")
    .addColumn("salary", "INT")
    .addColumn("Dept", "STRING")
    .property("description", "table created for demo purpose")
    .location("/FileStore/tables/delta/createtable")
    .execute()
)

Out[3]: <delta.tables.DeltaTable at 0x7f49849fe490>

## Method 2: SQL

In [0]:
%sql
-- Plain CREATE TABLE: declares the five employee columns and stores the table as Delta.
-- NOTE(review): the recorded output of this cell is an AnalysisException, presumably because
-- employee_demo already exists from Method 1 - confirm whether the failure is intentional.
CREATE TABLE employee_demo (
  emp_id   INT,
  emp_Name STRING,
  gender   STRING,
  salary   INT,
  dept     STRING
) USING DELTA

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2146631037116136>:7[0m
[1;32m      5[0m     display(df)
[1;32m      6[0m     [38;5;28;01mreturn[39;00m df
[0;32m----> 7[0m   _sqldf [38;5;241m=[39m [43m____databricks_percent_sql[49m[43m([49m[43m)[49m
[1;32m      8[0m [38;5;28;01mfinally[39;00m:
[1;32m      9[0m   [38;5;28;01mdel[39;00m ____databricks_percent_sql

File [0;32m<command-2146631037116136>:4[0m, in [0;36m____databricks_percent_sql[0;34m()[0m
[1;32m      2[0m [38;5;28;01mdef[39;00m [38;5;21m____databricks_percent_sql[39m():
[1;32m      3[0m   [38;5;28;01mimport[39;00m [38;5;21;01mbase64[39;00m
[0;32m----> 4[0m   df [38;5;241m=[39m [43mspark[49m[38;5;241;43m.[39;49m[43msql[49m[43m([49m[43mbase64[49m[38;5;241;43m.[39;49m[43mstandard_b64decode[49m[43m([49m[38;5;124;43m"[39;

### Using IF NOT EXISTS Keyword

In [0]:
%sql
-- Same table definition as the plain CREATE TABLE, guarded with IF NOT EXISTS
-- so the statement is a no-op when employee_demo is already present.
CREATE TABLE IF NOT EXISTS employee_demo (
  emp_id   INT,
  emp_Name STRING,
  gender   STRING,
  salary   INT,
  dept     STRING
) USING DELTA

## Method 3: DataFrame 

In [0]:
# Build a small in-memory employee DataFrame to use as the source for a Delta table.
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a session with this app name
spark = SparkSession.builder.appName("Employee Data").getOrCreate()

# Column names only - the column types are left for Spark to infer from the rows
employee_schema = ["emp_id", "emp_name", "gender", "salary", "dept"]

# One tuple per employee, in the same order as employee_schema
employee_data = [
    (100, "Stephen", "M", 2000, "IT"),
    (200, "Philipp", "M", 8000, "HR"),
    (300, "Lara", "F", 6000, "SALES"),
]

df = spark.createDataFrame(employee_data, schema=employee_schema)

# Print the rows as a text table
df.show()

+------+--------+------+------+-----+
|emp_id|emp_name|gender|salary| dept|
+------+--------+------+------+-----+
|   100| Stephen|     M|  2000|   IT|
|   200| Philipp|     M|  8000|   HR|
|   300|    Lara|     F|  6000|SALES|
+------+--------+------+------+-----+



In [0]:
# Save df as the Delta table employee_demo in the `default` database.
# NOTE(review): no save mode is set, so the writer's default (error if the table exists)
# applies - confirm this is intended, since earlier cells also create employee_demo.
delta_writer = df.write.format("delta")
delta_writer.saveAsTable("default.employee_demo")



# Delta Table Instance

## Approach 1

In [0]:
# Bind a DeltaTable handle to the Delta table stored at the given path.
# Explicit import instead of a wildcard, so it is clear where DeltaTable comes from.
from delta.tables import DeltaTable

deltainstance1 = DeltaTable.forPath(spark, "/FileStore/tables/delta/createtable")

In [0]:
# Delete rows from the Delta table behind deltainstance1 (bound in an earlier cell).
import pyspark.sql.functions as F
from delta.tables import DeltaTable

# Column predicate selecting the rows to remove: emp_id equal to 100
condition = F.col("emp_id") == 100

deltainstance1.delete(condition)

In [0]:
# Render the current contents of the Delta table behind deltainstance1
employee_demo_rows = deltainstance1.toDF()
display(employee_demo_rows)

emp_id,emp_name,gender,salary,Dept
1000,Stephen,M,2000,IT


## Approach 2

In [0]:
# Approach 2: write a one-row DataFrame to a Delta path, then bind a DeltaTable to that path.
# The schema classes are imported here so the cell does not depend on names leaked
# from other cells.
from delta.tables import DeltaTable
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Placeholder location - replace with a real path in your workspace.
# Defined once so the write and the forPath lookup cannot drift apart.
approach2_path = "/path/to/your/delta_table"

# Data for creating a Delta table
employee_data = [(100, "Philipp", "M", 8000, "HR")]

# Explicit schema: emp_id is declared non-nullable, every other column is nullable
employee_schema = StructType([
    StructField("emp_id", IntegerType(), False),
    StructField("emp_name", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("dept", StringType(), True)
])

# Create a DataFrame
df = spark.createDataFrame(employee_data, schema=employee_schema)

# Write the DataFrame to the Delta path, replacing any data already stored there
df.write.format("delta").mode("overwrite").save(approach2_path)

# Now create a DeltaTable instance from the same path
deltainstance2 = DeltaTable.forPath(spark, approach2_path)

In [0]:
# Render the current contents of the Delta table behind deltainstance2
approach2_rows = deltainstance2.toDF()
display(approach2_rows)

emp_id,emp_name,gender,salary,dept
100,Philipp,M,8000,HR


In [0]:
# Render the commit history of the Delta table behind deltainstance2
approach2_history = deltainstance2.history()
display(approach2_history)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2024-05-01T07:38:04.000+0000,999926623136504,krishnayogik@gmail.com,WRITE,"Map(mode -> Overwrite, partitionBy -> [])",,List(2146631037116102),0501-072340-z40s2hsy,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 1521)",,Databricks-Runtime/12.2.x-scala2.12


# Delta Table Insert

## SQL Insert


In [0]:
%sql
-- Append one employee row to default.employee_demo.
-- All string literals use single quotes: double quotes are not a portable string
-- delimiter in SQL (some ANSI settings parse them as identifiers).
INSERT INTO default.employee_demo
VALUES (1000, 'Stephen', 'M', 2000, 'IT')

num_affected_rows,num_inserted_rows
1,1


In [0]:
%sql
-- Show table-level metadata (format, location, size, properties, ...) for the table
DESCRIBE DETAIL default.employee_demo

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics
delta,fa8dd35a-3382-48b9-a33a-5c61d01ed9eb,spark_catalog.default.employee_demo,,dbfs:/FileStore/tables/delta/createtable,2024-05-01T07:26:27.277+0000,2024-05-01T07:28:07.000+0000,List(),1,1521,Map(description -> table created for demo purpose),1,2,"List(appendOnly, invariants)",Map()


## Dataframe Insert

In [0]:
# Build a one-row DataFrame with an explicit schema.
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField

# Reuse the active session if there is one, otherwise start a session with this app name
spark = SparkSession.builder.appName("Employee Data").getOrCreate()

# (column name, Spark type, nullable) for each column, in order
employee_columns = [
    ("emp_id", IntegerType(), False),
    ("emp_name", StringType(), True),
    ("gender", StringType(), True),
    ("salary", IntegerType(), True),
    ("dept", StringType(), True),
]
employee_schema = StructType(
    [StructField(name, dtype, nullable) for name, dtype, nullable in employee_columns]
)

# The single row held by the DataFrame
employee_data = [(100, "Philipp", "M", 8000, "HR")]

df = spark.createDataFrame(employee_data, schema=employee_schema)

# display() is the Databricks notebook renderer; outside Databricks use df.show() instead
display(df)

emp_id,emp_name,gender,salary,dept
100,Philipp,M,8000,HR


In [0]:
# Append df's rows to the employee_demo table in Delta format
append_writer = df.write.format("delta").mode("append")
append_writer.saveAsTable("employee_demo")

## Dataframe INSERT Into Method

In [0]:
# Build the one-row DataFrame df1 with an explicit schema.
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField

# Reuse the active session if there is one, otherwise start a session with this app name
spark = SparkSession.builder.appName("Employee Data").getOrCreate()

# Field definitions: emp_id is declared non-nullable, every other column is nullable
employee_fields = [
    StructField("emp_id", IntegerType(), False),
    StructField("emp_name", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("dept", StringType(), True),
]
employee_schema = StructType(employee_fields)

# The single row held by the DataFrame
employee_data = [(300, "Lara", "F", 6000, "SALES")]

df1 = spark.createDataFrame(employee_data, schema=employee_schema)

# Render the DataFrame in the notebook
display(df1)

emp_id,emp_name,gender,salary,dept
300,Lara,F,6000,SALES


In [0]:
# Add df1's rows to the existing employee_demo table; overwrite=False keeps the current rows.
# insertInto resolves columns by position rather than by name.
df1_writer = df1.write
df1_writer.insertInto("employee_demo", overwrite=False)



# Read a Table

## Python

In [0]:
# Load the catalog table employee_demo as a DataFrame and render it
people_df = spark.table("employee_demo")
display(people_df)

emp_id,emp_name,gender,salary,Dept
1000,Stephen,M,2000,IT
100,Philipp,M,8000,HR


## Scala

In [0]:
%scala
// Load the catalog table employee_demo as a DataFrame and render it
val people_df = spark.table("employee_demo")
display(people_df)

emp_id,emp_name,gender,salary,Dept
1000,Stephen,M,2000,IT


## SQL

In [0]:
%sql
-- Query a Delta table directly by its storage path instead of by catalog name.
-- NOTE(review): the DESCRIBE DETAIL output in this notebook reports employee_demo at
-- dbfs:/FileStore/tables/delta/createtable, so this path presumably points at a
-- different copy of the data - confirm which location is meant.
SELECT *
FROM delta.`dbfs:/user/hive/warehouse/employee_demo`;

emp_id,emp_name,gender,salary,dept
100,Stephen,M,2000,IT
200,Philipp,M,8000,HR
300,Lara,F,6000,SALES
1000,Stephen,M,2000,IT


# Write a Table

## Python

In [0]:
# Overwrite the Delta table stored at table_path with the contents of df
table_path = "/mnt/delta/employee_demo"
path_writer = df.write.format("delta").mode("overwrite")
path_writer.save(table_path)

In [0]:
%sql
-- Read back every row of the employee_demo catalog table
SELECT *
FROM employee_demo

emp_id,emp_name,gender,salary,Dept
1000,Stephen,M,2000,IT


In [0]:
# Append df's rows to the table named employee, using the session's default data source format
employee_appender = df.write.mode("append")
employee_appender.saveAsTable("employee")

In [0]:
# Replace the contents of the table named EmployeeDemo with df
employee_overwriter = df.write.mode("overwrite")
employee_overwriter.saveAsTable("EmployeeDemo")

# Delete from a table

## Python

### Creating a Delta Table

In [0]:
# Create a small example Delta table at delta_path.
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Where the example table is stored
delta_path = "/mnt/delta/example_table"

# Three nullable columns: id, name, salary
schema = StructType(
    [
        StructField(column, dtype, True)
        for column, dtype in (
            ("id", IntegerType()),
            ("name", StringType()),
            ("salary", IntegerType()),
        )
    ]
)

# One tuple per row, in schema order
data = [(1, "John", 50000), (2, "Jane", 60000), (3, "Doe", 55000)]

df = spark.createDataFrame(data, schema=schema)

# Replace whatever is stored at delta_path with these rows
df.write.format("delta").mode("overwrite").save(delta_path)

In [0]:
# Keep every row whose id is not 2. This only derives a new in-memory DataFrame;
# nothing is written back to the Delta table in this cell.
df_filtered = df.where(df["id"] != 2)

# Print the filtered rows as a text table
df_filtered.show()

+---+----+------+
| id|name|salary|
+---+----+------+
|  1|John| 50000|
|  3| Doe| 55000|
+---+----+------+



In [0]:
# Rewrite the example table from df, then delete one row through the DeltaTable API.
# DeltaTable and col are imported here so the cell does not depend on names leaked
# from other cells.
from delta.tables import DeltaTable
from pyspark.sql.functions import col

# Define the Delta path
delta_path = "/mnt/delta/example_table"  # Update with your Delta table path

# Overwrite the table at delta_path with df
df.write.format("delta").mode("overwrite").save(delta_path)

# Create a DeltaTable instance for the specified path
delinstance = DeltaTable.forPath(spark, delta_path)

# Delete records where id is 3
delinstance.delete(col("id") == 3)

# Display table history

## Python

In [0]:
# Create a DeltaTable instance for the specified path.
# Relies on DeltaTable and delta_path being defined by earlier cells.
delta_table = DeltaTable.forPath(spark, delta_path)

# Access the history of the Delta table
history = delta_table.history()  # No limit argument, so the full history is returned
history.show(truncate=False)  # truncate=False prints column values without cutting them off

+-------+-------------------+---------------+----------------------+---------+--------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp          |userId         |userName              |operation|operationParameters                   |job |notebook          |clusterId           |readVersion|isolationLevel   |isBlindAppend|operationMetrics                                                                                                                                                                                                                                              

## SQL

In [0]:
%sql
-- Time travel by version: read the table as it was at version 0
SELECT *
FROM delta.`/mnt/delta/example_table`
VERSION AS OF 0

id,name,salary
1,John,50000
2,Jane,60000
3,Doe,55000


In [0]:
%sql
-- Time travel by timestamp: read the table as it was at the given point in time.
-- NOTE(review): the timestamp is hard-coded from the original run; presumably it has to be
-- replaced with a time covered by the table's history when re-running - confirm.
SELECT *
FROM delta.`/mnt/delta/example_table`
TIMESTAMP AS OF '2024-05-01 08:31:20'

id,name,salary
1,John,50000
2,Jane,60000


# Optimize a table

In [0]:
%sql
-- Compact the data files of the Delta table stored at this path
OPTIMIZE delta.`/mnt/delta/example_table`

path,metrics
dbfs:/mnt/delta/example_table,"List(1, 2, List(1043, 1043, 1043.0, 1, 1043), List(1039, 1039, 1039.0, 2, 2078), 0, null, 1, 2, 0, true, 0, 0, 1714552558813, 1714552565581, 8, 1, null, List(0, 0), 3, 3, 542)"


# Z-order by columns

In [0]:
%sql
-- Compact the table's data files and Z-order them by the name column,
-- so rows with similar values in that column are stored together
OPTIMIZE delta.`/mnt/delta/example_table`
ZORDER BY (Name);

path,metrics
dbfs:/mnt/delta/example_table,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, List(minCubeSize(107374182400), List(0, 0), List(1, 1043), 0, List(0, 0), 0, null), 0, 1, 1, false, 0, 0, 1714552866714, 1714552868332, 8, 0, null, List(0, 0), 3, 3, 0)"


# Clean up snapshots with VACUUM

In [0]:
%sql
-- Remove data files that the table no longer references and that are older than
-- the retention threshold (7 days by default, since no RETAIN clause is given)
VACUUM delta.`/mnt/delta/example_table`;

path
dbfs:/mnt/delta/example_table
