In [None]:
!pip install -U awswrangler

In [2]:
import pandas as pd
import awswrangler as wr
from generate_files import generate_metadata_file
from ingest_data import ingest_data_files
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as csv
import re
import os
import boto3
import glob
import sys
import threading
import json

In [2]:
# Load the sample dataset into a PyArrow table. The Parquet samples in the same
# folder (yellow_trip_data_part1.parquet / yellow_trip_data_part2.parquet) can
# be loaded with pq.read_table(...) instead.
sample_path = "../test/test_data/melbourne_house_prices.csv"
ppa = csv.read_csv(sample_path)

# Display the schema of the loaded table.
ppa.schema

Suburb: string
Address: string
Rooms: int64
Type: string
Price: int64
Method: string
SellerG: string
Date: string
Postcode: int64
Regionname: string
Propertycount: int64
Distance: double
CouncilArea: string

### Normalize columns
Normalize all column names so that they are compatible with Amazon Athena.

In [3]:
# Go through pandas so awswrangler can rewrite the column names, then rebuild
# the Arrow table from the renamed frame.
# NOTE(review): the recorded preview shows `price` as float although the CSV
# schema reported int64 -- confirm this dtype change from the pandas round trip
# is acceptable downstream.
df_raw = ppa.to_pandas()
df = wr.catalog.sanitize_dataframe_columns_names(
    df_raw,
    handle_duplicate_columns="rename",
)
ppa = pa.Table.from_pandas(df)

# Preview the first rows with the sanitized headers.
df.head()

Unnamed: 0,suburb,address,rooms,type,price,method,sellerg,date,postcode,regionname,propertycount,distance,councilarea
0,Abbotsford,49 Lithgow St,3,h,1490000.0,S,Jellis,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
1,Abbotsford,59A Turner St,3,h,1220000.0,S,Marshall,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
2,Abbotsford,119B Yarra St,3,h,1420000.0,S,Nelson,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
3,Aberfeldie,68 Vida St,3,h,1515000.0,S,Barry,1/04/2017,3040,Western Metropolitan,1543,7.5,Moonee Valley City Council
4,Airport West,92 Clydesdale Rd,2,h,670000.0,S,Nelson,1/04/2017,3042,Western Metropolitan,3464,10.4,Moonee Valley City Council


### Save the dataset

In [4]:
# Persist the sanitized frame as CSV; this is the file the ingestion step
# uploads later in the notebook.
# Create the output folder first so the write does not fail when it is missing.
os.makedirs("data", exist_ok=True)
# Parquet alternative: pq.write_table(ppa, "data/file001.parquet", compression="SNAPPY")
df.to_csv("data/file001.csv", index=False)

### Metadata file

In [5]:
# Read the ingestion parameters that describe the target table.
json_file = "ingest_parameters_cc.json"

try:
    with open(json_file, 'r') as f:
        param_json = json.load(f)
except FileNotFoundError:
    # Report the problem, then re-raise: carrying on would leave the variables
    # below undefined and make the following cells fail with a NameError.
    print(f"File {json_file} not found.")
    raise

# Mandatory keys: a missing one raises KeyError here.
database = param_json["database"]
table_name = param_json["table_name"]
description = param_json["description"]

# Optional keys: default to None when absent.
partition_cols = param_json.get("partition_cols")
required_col = param_json.get("required_col")
doc_string = param_json.get("doc_string")

In [6]:
# Echo the loaded parameters, one per line, for a quick visual check.
parameter_report = [
    f"Table name: {table_name}",
    f"Database: {database}",
    f"Table description: {description}",
    f"Table partition columns: {partition_cols}",
    f"Table required columns: {required_col}",
    f"Table doc string: {doc_string}",
]
print("\n".join(parameter_report))


Table name: houses_data
Database: melbourne
Table description: Melbourne house data prices.
Table partition columns: []
Table required columns: {}
Table doc string: {'suburb': 'The suburb where the property is located.', 'address': 'The specific address of the property.', 'rooms': 'The number of rooms in the property.', 'type': 'The type of property (e.g., house, unit).', 'price': 'The selling price of the property.', 'method': 'The method of sale (e.g., auction, private sale).', 'sellerg': 'The name of the seller’s agent or agency.', 'date': 'The date the property was sold.', 'postcode': 'The postal code of the property’s location.', 'regionname': 'The broader region where the property is situated.', 'propertycount': 'The number of properties in the suburb.', 'distance': 'The distance from the property to the central business district (CBD).', 'councilarea': 'The local government area responsible for the property.'}


In [7]:
# Build the table metadata from the Arrow table and the parameters loaded from
# the JSON file. The result is serialized with json.dump in the next cell.
metadata_kwargs = {
    "ppa": ppa,
    "table_name": table_name,
    "database": database,
    "required_col": required_col,
    "partition_cols": partition_cols,
    "doc_string": doc_string,
    "description": description,
}
metadata = generate_metadata_file(**metadata_kwargs)

In [8]:
# Write the metadata next to the data file as indented, UTF-8 encoded JSON.
with open("data/metadata.json", 'w', encoding='utf-8') as metadata_out:
    serialized = json.dumps(metadata, indent=4)
    metadata_out.write(serialized)

### Data ingestion

In [6]:
# Re-read the parameter file to get the S3 settings for the upload.
# `json_file` is defined in the "Metadata file" cell, which must run first.
try:
    with open(json_file, 'r') as f:
        data_file = json.load(f)
except FileNotFoundError:
    # Re-raise instead of sys.exit(1): inside a notebook sys.exit only raises
    # SystemExit, which hides the traceback of the real error.
    print(f"File {json_file} not found.")
    raise

# Mandatory key: a missing "metadata_s3" entry raises KeyError here.
metadata_s3 = data_file["metadata_s3"]
print(f"Metadata S3: {metadata_s3}")

Metadata S3: {'datalake': 'iceberg-datalake'}


In [7]:
# Run the ingestion helper with the S3 settings loaded above. Per the recorded
# output, it finds the target bucket, then uploads data/metadata.json and the
# data file(s) under ./data.
# NOTE(review): presumably needs valid AWS credentials in the environment -- confirm.
ingest_data_files(metadata_s3)

Found bucket: raw-datalake-iceberg-2f88fdbce7c85

Upload metadata file...
data/metadata.json  3882 / 3882.0  (100.00%)

Upload data files...
./data\file001.csv  7408944 / 7408944.0  (100.00%)File ./data\file001.csv uploaded successfully.
