In [18]:
import pyspark
import os
import json
import argparse

from dotenv import load_dotenv
from pathlib import Path
from pyspark.sql.types import StructType
from pyspark.sql.functions import to_timestamp,col,when
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [19]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Read Metadata and Define Schema") \
    .getOrCreate()

In [20]:
# Define metadata
metadata = {
    "fields": [
        {
            "name": "SaleID",
            "nullable": True,
            "type": "integer"
        },
        {
            "name": "Contact",
            "nullable": True,
            "type": "string"
        },
        {
            "name": "Sex",
            "nullable": True,
            "type": "string"
        },
        {
            "name": "Age",
            "nullable": True,
            "type": "integer"
        },
        {
            "name": "State",
            "nullable": True,
            "type": "string"
        },
        {
            "name": "ProductID",
            "nullable": True,
            "type": "string"
        },
        {
            "name": "ProductType",
            "nullable": True,
            "type": "string"
        },
        {
            "name": "SalePrice",
            "nullable": True,
            "type": "float"
        },
        {
            "name": "Profit",
            "nullable": True,
            "type": "float"
        },
        {
            "name": "Lead",
            "nullable": True,
            "type": "string"
        },
        {
            "name": "Month",
            "nullable": True,
            "type": "string"
        },
        {
            "name": "Year",
            "nullable": True,
            "type": "integer"
        }
    ],
    "type": "struct"
}

In [21]:
# Function to convert metadata to Spark StructType
def convert_metadata_to_schema(metadata):
    fields = []
    for field in metadata['fields']:
        if field['type'] == 'integer':
            data_type = IntegerType()
        elif field['type'] == 'string':
            data_type = StringType()
        elif field['type'] == 'float':
            data_type = FloatType()
        else:
            raise ValueError(f"Unsupported type: {field['type']}")
        
        fields.append(StructField(field['name'], data_type, field['nullable']))
    
    return StructType(fields)

In [22]:
# Convert metadata to schema
schema = convert_metadata_to_schema(metadata)

In [28]:
# Sample data path (replace with your actual data path)
data_path = "data\computer_sales.csv"

In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
import os

# Check working directory
print("Current working directory:", os.getcwd())

# File path
data_path = r"C:\Users\Indah Permata\OneDrive\Documents\BOOTHCAMP DE\dibimbing_spark_airflow\data\computer_sales.csv"

# Check if file exists
if not os.path.exists(data_path):
    raise FileNotFoundError(f"File not found: {data_path}")
else:
    print(f"File found: {data_path}")

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Read Metadata and Define Schema") \
    .getOrCreate()

# Define schema
schema = StructType([
    StructField("SaleID", IntegerType(), True),
    StructField("Contact", StringType(), True),
    StructField("Sex", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("State", StringType(), True),
    StructField("ProductID", StringType(), True),
    StructField("ProductType", StringType(), True),
    StructField("SalePrice", FloatType(), True),
    StructField("Profit", FloatType(), True),
    StructField("Lead", StringType(), True),
    StructField("Month", StringType(), True),
    StructField("Year", IntegerType(), True)
])

# Read data into DataFrame using schema
try:
    df = spark.read.csv(data_path, schema=schema, header=True)
    df.show()
except Exception as e:
    print(f"An error occurred: {e}")

# If needed, select customer-related columns
# df_customers = df.select("Contact", "Sex", "Age", "State").distinct()

# Show the DataFrame
# df_customers.show()

# Define PostgreSQL connection properties
# pg_url = "jdbc:postgresql://<host>:<port>/<database>"
# pg_properties = {
#     "user": "<username>",
#     "password": "<password>",
#     "driver": "org.postgresql.Driver"
# }

# Write the DataFrame to PostgreSQL
# df_customers.write.jdbc(url=pg_url, table="customers", mode="overwrite", properties=pg_properties)

# Stop the Spark session
spark.stop()


Current working directory: /home/jovyan


FileNotFoundError: File not found: C:\Users\Indah Permata\OneDrive\Documents\BOOTHCAMP DE\dibimbing_spark_airflow\data\computer_sales.csv

In [None]:
# Select customer-related columns
df_customers = df.select("Contact", "Sex", "Age", "State").distinct()


In [None]:
# Show the DataFrame
df_customers.show()

In [None]:
jdbc_url = f'jdbc:postgresql://{postgres_host}/{postgres_dw_db}'
jdbc_properties = {
    'user': postgres_user,
    'password': postgres_password,
    'driver': 'org.postgresql.Driver',
    'stringtype': 'unspecified'
}

In [None]:
# Write the DataFrame to PostgreSQL
df_customers.write.jdbc(url=pg_url, table="customers", mode="overwrite", properties=pg_properties)


In [None]:
# Stop the Spark session
spark.stop()