In [None]:
from pydbsmgr import *
from pydbsmgr.utils.tools import *
import pandas as pd

## Example of cleaning a dataset

In [None]:
# Dataset source:
# https://www.kaggle.com/datasets/amaanansari09/most-streamed-songs-all-time
list_of_names = ["Features", "Streams"]
csv_paths = ["./data/Features.csv", "./data/Streams.csv"]

# Read the CSV files in the same order as `list_of_names`, so that position i
# in `list_of_df` corresponds to position i in `list_of_names`.
list_of_df = [pd.read_csv(csv_path, encoding="utf-8") for csv_path in csv_paths]
features_df, streams_df = list_of_df

In [None]:
# Run the health check on every frame and collect the per-frame reports.
# zip() pairs each frame with its name, replacing the manual index counter.
# NOTE: zip() stops at the shorter of the two lists, so `list_of_df` and
# `list_of_names` are expected to have the same length.
health_reports = []
for frame, frame_name in zip(list_of_df, list_of_names):
    # check_values returns a pair; the first element is the report that ends up
    # in the HTML file, the second (the returned frame) is not needed here.
    info, _checked_frame = check_values(frame, df_name=frame_name, mode=False)
    health_reports.append(info)

# Concatenate once at the end instead of growing a DataFrame inside the loop
# (which re-copies the accumulated rows on every iteration). pd.concat raises
# ValueError on an empty list, so fall back to an empty frame in that case.
df_sheet_files_info = pd.concat(health_reports) if health_reports else pd.DataFrame()

# Plain string literal: the file name has no placeholders, so no f-string.
df_sheet_files_info.to_html("report-health-checker.html", index=False, encoding="utf-8")

In [None]:
# Another way to do the same but with fewer lines of code is as follows:
# `check_for_list` receives the full list of frames and the matching list of
# names in a single call, instead of looping over `check_values` by hand.
# It returns two values, bound here as `dfs_` and `report`
# (presumably the checked frames and the combined report — TODO confirm
# against the pydbsmgr documentation).
dfs_, report = check_for_list(list_of_df, list_of_names, mode=False)

## ETL example using Azure Cloud 

In [None]:
from datetime import date

from pydbsmgr.logs import *
from pydbsmgr.utils.azure_sdk import *

In [None]:
# Fixed names used by the Azure example cells that follow.
container_name = "raw"
database_name = "test_database"
logsbook_name = "test_logsbook"

# Values resolved when the cell runs.
connection_string = get_connection_string()
current_datetime = date.today()  # a `date` (no time component), despite the name

In [None]:
# Sample records, written one person per row for readability.
sample_columns = ("name", "city", "age", "py-score")
sample_rows = [
    ("Xavier", "Mexico City", 41, 88.0),
    ("Ann", "Toronto", 28, 79.0),
    ("Jana", "Prague", 33, 81.0),
    ("Yi", "Shanghai", 34, 80.0),
    ("Robin", "Manchester", 38, 68.0),
    ("Amal", "Cairo", 31, 61.0),
    ("Nori", "Osaka", 37, 84.0),
]

# Transpose the rows into the column-oriented dictionary that is passed to
# pandas: each key is a column name and each value is the list of its entries.
data = {
    column: list(values) for column, values in zip(sample_columns, zip(*sample_rows))
}

# Build the DataFrame from the dictionary.
df = pd.DataFrame(data)

# Objects used by the upload / logging cell further down.
controller = StorageController(connection_string, container_name)
logbook = EventLogBook(logsbook_name, "./")

# Single-row frame recording which container and name were used, and the date.
logbook_record = {
    "container": [container_name],
    "name": [database_name],
    "datetime": [current_datetime],
}
logbook_data = pd.DataFrame(logbook_record)

In [None]:
%%capture
# Upload the sample frame twice under the same name: once with the default
# settings and once with `compression=False`.
controller.upload_parquet("/", [df], [database_name])
controller.upload_parquet("/", [df], [database_name], compression=False)

# List what is stored under "/" and print the controller's current blob prefix.
BlobList = controller.get_BlobList("/")
controller._print_BlobPrefix()

# Drop the first listed entry, hand the trimmed list back to the controller and
# print again to show the change.
del BlobList[0]
controller.set_BlobPrefix(BlobList)
controller._print_BlobPrefix()

# Raw string: "\w" is not a valid escape sequence in a normal string literal
# and triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
# The pattern passed at run time is unchanged.
# NOTE(review): the "." is unescaped and therefore matches any character;
# r"\w+\.parquet" may be what was intended — confirm before changing.
dfs, names = controller.get_parquet("/", r"\w+.parquet", True)

# Record the run in the log book.
logbook.create(logbook_data)