### Step 1: Install the Kaggle Package and Stage the API Token

In [0]:
!pip install kaggle

In [0]:
# Copy the Kaggle API token from the workspace files area into DBFS.
# NOTE(review): dbfs:/tmp may be readable by other workspace users — confirm
# this is an acceptable place for a credential file.
workspace_token_uri = "file:/Workspace/Users/jake.stout@stoutdigitalsolutions.com/kaggle.json"
dbfs_token_uri = "dbfs:/tmp/kaggle.json"
dbutils.fs.cp(workspace_token_uri, dbfs_token_uri)

### Step 2: Set Up the Kaggle API in Databricks

In [0]:
import os
import shutil

# Define DBFS path where kaggle.json was uploaded
kaggle_json_path = "/dbfs/tmp/kaggle.json"

# Resolve the destination once so the directory, the copy target and the
# chmod target are guaranteed to refer to the same location.
kaggle_config_dir = os.path.expanduser("~/.kaggle/")
kaggle_config_path = os.path.join(kaggle_config_dir, "kaggle.json")

# Create the .kaggle directory in the user's home directory
os.makedirs(kaggle_config_dir, exist_ok=True)

# Copy kaggle.json to the correct location
shutil.copy(kaggle_json_path, kaggle_config_path)

# Restrict the token to owner read/write. The mode must be an octal literal:
# decimal 600 equals 0o1130, which gives the owner no read or write bit at all.
os.chmod(kaggle_config_path, 0o600)


### Step 3: Install and Test the Kaggle API

In [0]:
# Install the kaggle package again (Step 1 already runs the same install),
# then list datasets through the CLI.
# NOTE(review): presumably the listing doubles as a check that the token
# copied in Step 2 is picked up — confirm.
!pip install kaggle
!kaggle datasets list

### Step 4: Download the SAP Dataset

In [0]:
# Download the mustafakeser4/sap-dataset-bigquery-dataset dataset into
# /dbfs/tmp (-p) and extract the downloaded archive there (--unzip).
!kaggle datasets download -d mustafakeser4/sap-dataset-bigquery-dataset -p /dbfs/tmp --unzip

### Step 5: Load Dataset into Databricks

In [0]:
from pyspark.sql import SparkSession
import os
import re

# Define source path where Kaggle files are stored
source_path = "dbfs:/tmp/"

# Define target database
database_name = "Kaggle_SAP_Replicated_Data"


def _to_table_name(file_name):
    """Return file_name with every character outside [0-9A-Za-z_] replaced by "_".

    Replacing only "." leaves characters such as "-" or spaces in the name,
    which are not valid in an unquoted table identifier and make saveAsTable
    fail. Names made only of letters, digits, "_" and "." map exactly as a
    plain "." -> "_" replacement would (e.g. "bseg.csv" -> "bseg_csv").
    """
    return re.sub(r"[^0-9A-Za-z_]", "_", file_name)


# Create the database without specifying a location (Unity Catalog will manage it)
# NOTE(review): no catalog is named here, so the database is created in the
# session's current catalog — confirm it is the catalog the later %sql cells query.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}")

# List all files in the directory
files = dbutils.fs.ls(source_path)

# Process each file
for file in files:
    file_path = file.path
    file_name = os.path.basename(file_path)

    # Pick a reader from the file extension; anything else is skipped
    if file_name.endswith(".csv"):
        # Read CSV file (first row is the header, column types are inferred)
        df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(file_path)
    elif file_name.endswith(".parquet"):
        # Read Parquet file
        df = spark.read.parquet(file_path)
    else:
        print(f"Skipping {file_name} (unsupported format)")
        continue  # Skip non-data files

    # Derive the table name only for files that are actually loaded
    table_name = _to_table_name(file_name)

    # Write DataFrame to Delta table (Unity Catalog managed location);
    # mode("overwrite") replaces any table left by a previous run
    df.write.format("delta").mode("overwrite").saveAsTable(f"{database_name}.{table_name}")

    print(f"Loaded {file_name} into {database_name}.{table_name}")

print("All files have been processed and stored in the database!")


In [0]:
%sql
-- Row count of the bseg_csv table.
SELECT
  COUNT(*)
FROM
  test_workspace.kaggle_sap_replicated_data.bseg_csv AS seg

In [0]:
%sql
-- Count the row pairs produced by inner-joining bseg_csv to bkpf_csv on
-- (mandt, bukrs, belnr, gjahr, operation_flag).
SELECT
  COUNT(*)
FROM test_workspace.kaggle_sap_replicated_data.bseg_csv AS seg
INNER JOIN test_workspace.kaggle_sap_replicated_data.bkpf_csv AS hdr
  ON  seg.mandt = hdr.mandt
  AND seg.bukrs = hdr.bukrs
  AND seg.belnr = hdr.belnr
  AND seg.gjahr = hdr.gjahr
  AND seg.operation_flag = hdr.operation_flag

In [0]:
%sql
-- Return every bseg_csv row for which no bkpf_csv row shares the same
-- (mandt, bukrs, belnr, gjahr) key.
SELECT seg.*
FROM test_workspace.kaggle_sap_replicated_data.bseg_csv AS seg
LEFT ANTI JOIN test_workspace.kaggle_sap_replicated_data.bkpf_csv AS hdr
  ON  seg.mandt = hdr.mandt
  AND seg.bukrs = hdr.bukrs
  AND seg.belnr = hdr.belnr
  AND seg.gjahr = hdr.gjahr;