In [1]:
import duckdb
import pandas as pd
from IPython.display import display

# Connect to the on-disk DuckDB warehouse (passing ':memory:' instead would use an in-memory database).
conn = duckdb.connect("../data/database/test_warehouse.db")

# Build one wide, denormalised frame: every fact row is kept (LEFT JOIN) and
# enriched with the attributes of its date, provider, location, staff and
# bed-capacity dimension rows.
#
# The staff counters are projected twice: once from dim_staff (ds.*) and once
# from the fact table (fc.*). Two output columns with the same name are
# ambiguous, and the rendered output of this notebook shows the fact-table
# copies surfacing with an implicit "_1" suffix. The fact-table copies are
# therefore aliased explicitly to those same "_1" names, so the resulting
# DataFrame schema stays the same without relying on implicit renaming.
#
# NOTE(review): per the dtypes shown in this notebook, several count columns
# (e.g. number_of_all_beds, the dim_staff counters) arrive as `object`, which
# makes describe() treat them as categorical — consider casting them upstream.
query = """
SELECT
    fc.fact_id,
    fc.date_id,
    dd.week_ending,
    fc.provider_id,
    dp.federal_provider_number,
    dp.provider_name,
    dp.provider_phone_number,
    fc.location_id,
    dl.provider_address,
    dl.provider_city,
    dl.provider_state,
    dl.provider_zip_code,
    dl.county,
    fc.staff_id,
    ds.staff_weekly_confirmed_covid_19,
    ds.staff_total_confirmed_covid_19,
    fc.bed_capacity_id,
    dbc.number_of_all_beds,
    dbc.total_number_of_occupied_beds,
    fc.residents_weekly_confirmed_covid_19,
    fc.residents_total_confirmed_covid_19,
    fc.residents_weekly_all_deaths,
    fc.residents_total_all_deaths,
    fc.residents_weekly_covid_19_deaths,
    fc.residents_total_covid_19_deaths,
    fc.staff_weekly_confirmed_covid_19 AS staff_weekly_confirmed_covid_19_1,
    fc.staff_total_confirmed_covid_19 AS staff_total_confirmed_covid_19_1,
    fc.submitted_data,
    fc.passed_quality_assurance_check
FROM fact_covid_cases fc
LEFT JOIN dim_date dd ON fc.date_id = dd.date_id
LEFT JOIN dim_provider dp ON fc.provider_id = dp.provider_id
LEFT JOIN dim_location dl ON fc.location_id = dl.location_id
LEFT JOIN dim_staff ds ON fc.staff_id = ds.staff_id
LEFT JOIN dim_bed_capacity dbc ON fc.bed_capacity_id = dbc.bed_capacity_id
"""

# Run the query and materialise the full result set as a pandas DataFrame.
df = conn.execute(query).fetchdf()

# Preview the first rows of the joined result.
display(df.head())

Unnamed: 0,fact_id,date_id,week_ending,provider_id,federal_provider_number,provider_name,provider_phone_number,location_id,provider_address,provider_city,...,residents_weekly_confirmed_covid_19,residents_total_confirmed_covid_19,residents_weekly_all_deaths,residents_total_all_deaths,residents_weekly_covid_19_deaths,residents_total_covid_19_deaths,staff_weekly_confirmed_covid_19_1,staff_total_confirmed_covid_19_1,submitted_data,passed_quality_assurance_check
0,1,1,2020-05-24,1,15009,"BURNS NURSING HOME, INC.",2563324110,1,701 MONROE STREET NW,RUSSELLVILLE,...,45,45,4,4,4,4,33,33,Y,Y
1,2,2,2020-05-31,1,15009,"BURNS NURSING HOME, INC.",2563324110,1,701 MONROE STREET NW,RUSSELLVILLE,...,0,45,0,4,0,4,2,35,Y,Y
2,3,3,2020-06-07,1,15009,"BURNS NURSING HOME, INC.",2563324110,1,701 MONROE STREET NW,RUSSELLVILLE,...,0,45,0,4,0,4,0,35,Y,Y
3,4,4,2020-06-14,1,15009,"BURNS NURSING HOME, INC.",2563324110,1,701 MONROE STREET NW,RUSSELLVILLE,...,0,45,0,4,0,4,0,35,Y,Y
4,5,5,2020-06-21,1,15009,"BURNS NURSING HOME, INC.",2563324110,1,701 MONROE STREET NW,RUSSELLVILLE,...,0,45,0,4,0,4,0,35,Y,Y


In [2]:
# Report the frame's dimensions: row count on the first line, column count on the second.
print(
    f"Jumlah baris: {df.shape[0]}",
    f"Jumlah kolom: {df.shape[1]}",
    sep="\n",
)

Jumlah baris: 1000
Jumlah kolom: 29


In [3]:
# Count null cells per column.
null_mask = df.isnull()
missing_values = null_mask.sum()

# Show only the columns that actually contain missing values.
has_missing = missing_values > 0
display(missing_values[has_missing])

Series([], dtype: int64)

In [4]:
# Inspect the data type pandas assigned to each column.
column_dtypes = df.dtypes
display(column_dtypes)

fact_id                                         int32
date_id                                         int32
week_ending                            datetime64[us]
provider_id                                     int32
federal_provider_number                        object
provider_name                                  object
provider_phone_number                          object
location_id                                     int32
provider_address                               object
provider_city                                  object
provider_state                                 object
provider_zip_code                              object
county                                         object
staff_id                                        int32
staff_weekly_confirmed_covid_19                object
staff_total_confirmed_covid_19                 object
bed_capacity_id                                 int32
number_of_all_beds                             object
total_number_of_occupied_bed

In [5]:
# Summary statistics for every column, numeric and non-numeric alike.
summary_stats = df.describe(include='all')
display(summary_stats)

Unnamed: 0,fact_id,date_id,week_ending,provider_id,federal_provider_number,provider_name,provider_phone_number,location_id,provider_address,provider_city,...,residents_weekly_confirmed_covid_19,residents_total_confirmed_covid_19,residents_weekly_all_deaths,residents_total_all_deaths,residents_weekly_covid_19_deaths,residents_total_covid_19_deaths,staff_weekly_confirmed_covid_19_1,staff_total_confirmed_covid_19_1,submitted_data,passed_quality_assurance_check
count,1000.0,1000.0,1000,1000.0,1000.0,1000,1000.0,1000.0,1000,1000,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000,1000
unique,,,,,5.0,5,5.0,,5,5,...,16.0,87.0,6.0,64.0,5.0,6.0,14.0,150.0,2,2
top,,,,,15009.0,"BURNS NURSING HOME, INC.",2563324110.0,,701 MONROE STREET NW,RUSSELLVILLE,...,0.0,48.0,0.0,5.0,0.0,2.0,0.0,70.0,Y,Y
freq,,,,,242.0,242,242.0,,242,242,...,884.0,61.0,854.0,215.0,994.0,395.0,764.0,57.0,978,978
mean,500.5,118.14,2022-08-21 23:31:12,2.58,,,,2.58,,,...,,,,,,,,,,
min,1.0,1.0,2020-05-24 00:00:00,1.0,,,,1.0,,,...,,,,,,,,,,
25%,250.75,55.0,2021-06-06 00:00:00,2.0,,,,2.0,,,...,,,,,,,,,,
50%,500.5,117.5,2022-08-17 12:00:00,3.0,,,,3.0,,,...,,,,,,,,,,
75%,750.25,180.0,2023-10-29 00:00:00,4.0,,,,4.0,,,...,,,,,,,,,,
max,1000.0,242.0,2025-01-05 00:00:00,5.0,,,,5.0,,,...,,,,,,,,,,
