In [2]:
#!pip install mltoolkit-laht
from mltoolkit_laht.data_loading import CSVLoader, load_banking_reviews_data
import numpy as np
import pandas as pd

2024-06-02 17:26:25 - ml_toolkit - INFO - __init__ - <module> - ml_toolkit package initialized.


In [None]:
# Alternative (disabled): read the reviews from a local CSV file.
# data_loader = CSVLoader(file_path="BankReviews.csv")

# Default: wrap the banking-reviews dataset provided by the library.
bundled_reviews = load_banking_reviews_data()
data_loader = CSVLoader(data=bundled_reviews)

# Load the reviews, then keep the loader's schema for later reference.
BankReviews = data_loader.load_data(encoding="latin1")
org_schema = data_loader.get_schema()

In [None]:
# Show a summary of the DataFrame: duplicates, missing values, data types, etc.
data_loader.show_info()

### Update schema


In [None]:
# Derive helper columns from the loaded reviews, then register the intended
# dtypes with the loader.

# Parse the dates with an explicit day-month-year format so day and month are
# never swapped by format inference.
BankReviews["Date"] = pd.to_datetime(BankReviews["Date"], format="%d-%m-%Y")
BankReviews["Year"] = BankReviews["Date"].dt.year

# Character count of each review, computed with the vectorised string accessor
# instead of a per-row Python lambda.
BankReviews["text_length"] = BankReviews["Reviews"].str.len()

# Synthetic numeric feature: review length scaled by uniform noise in [0, 1).
# A seeded generator makes the column reproducible on "Restart & Run All".
NOISE_SEED = 42
rng = np.random.default_rng(NOISE_SEED)
random_vector = rng.random(len(BankReviews))
BankReviews["new_column"] = BankReviews["text_length"] * random_vector

# Target dtype for every column, including the three derived above.
new_schema = {
    "Date": "datetime64[ns]",
    "Stars": "category",
    "Reviews": "string",
    "BankName": "category",
    "Year": "category",
    "text_length": "int64",
    "new_column": "float64",
}

# NOTE(review): this assumes the loader works on the same DataFrame object that
# was extended above, so the derived columns are visible to it -- confirm.
BankReviews = data_loader.update_schema(new_schema=new_schema)
data_loader.show_info(num_rows=100)

### Features visualization


In [None]:
from mltoolkit_laht.data_visualization import FeaturesVisualizer

# Visualizer configured with the figure size, title and axis labels.
features_visualizer = FeaturesVisualizer(
    figsize=(12, 8),
    title="Distribution Plots",
    x_label="Values",
    y_label="Frequency",
)

# Plot options gathered in one mapping so each setting sits next to its note.
plot_options = {
    # Show a KDE plot for numerical variables.
    "show_kde": True,
    # Combine all plots of the same type into a single graph.
    "combine_plots": True,
    # Standardize numerical variables (only used when combine_plots is True).
    "standardize": False,
    # Rotate x-ticks by 90 degrees for category plots.
    "rotate_xticks": 90,
    # When enabled, extends numerical plots to show histogram, box plot and
    # violin plot horizontally.
    "extend_numerical_plots": False,
    # Features to combine in a single plot.
    "features_to_combine": ["text_length", "new_column"],
}

# `data` is the DataFrame (or Series) whose features are plotted.
features_visualizer.plot_features(data=BankReviews, **plot_options)