In [1]:
from pathlib import Path

import pandas as pd
from pmlb import fetch_data

from howso.engine import (
    load_trainee,
    Trainee,
)
from howso.utilities import infer_feature_attributes
from howso.synthesizer import Synthesizer

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import time
from howso.validator import Validator

In [2]:
import pandas as pd

# Expand datetime columns into numeric calendar parts, then drop all-zero columns
def split_and_clean_datetime_columns(df):
    """Replace every datetime column with numeric calendar/time part columns.

    For each column whose dtype is a datetime64 dtype, eight new columns are
    appended (``<col>_day``, ``<col>_month``, ``<col>_year``,
    ``<col>_day_of_week``, ``<col>_hour``, ``<col>_minute``, ``<col>_second``,
    ``<col>_quarter``) and the original datetime column is removed.
    Columns that are not already datetime-typed are never converted.

    Afterwards every column whose values all compare equal to 0 is dropped
    (for example the hour/minute/second parts of date-only data). A column is
    kept as soon as one value compares unequal to 0, so constant non-zero
    columns and columns containing NaN are retained.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with datetime columns expanded and all-zero columns removed.
    """
    # Work on a copy so the caller's frame is left untouched and the cell that
    # calls this function can be re-run safely.
    expanded = df.copy()

    datetime_columns = [
        col for col in expanded.columns
        if pd.api.types.is_datetime64_any_dtype(expanded[col])
    ]

    for col in datetime_columns:
        # No-op for a column that is already datetime; kept as a safeguard.
        parts = pd.to_datetime(expanded[col], errors='coerce').dt

        expanded[f'{col}_day'] = parts.day
        expanded[f'{col}_month'] = parts.month
        expanded[f'{col}_year'] = parts.year
        expanded[f'{col}_day_of_week'] = parts.dayofweek
        expanded[f'{col}_hour'] = parts.hour
        expanded[f'{col}_minute'] = parts.minute
        expanded[f'{col}_second'] = parts.second
        expanded[f'{col}_quarter'] = parts.quarter

    # The raw datetime columns are fully represented by their parts now.
    expanded = expanded.drop(columns=datetime_columns)

    # Keep only columns that have at least one value different from 0.
    has_non_zero = (expanded != 0).any(axis=0)
    return expanded.loc[:, has_non_zero]

In [3]:
# Fixed seed so the synthetic dataset (and every metric computed from it) is
# identical on each Restart Kernel -> Run All.
SEED = 42
rng = np.random.default_rng(SEED)

n_samples = 10000

# One timestamp per day starting 2023-01-01, plus its proleptic ordinal day number.
datetime_feature = pd.date_range(start='2023-01-01', periods=n_samples, freq='D')
datetime_numeric = datetime_feature.map(pd.Timestamp.toordinal).values

target_perfect = datetime_numeric * 0.5  # A linear transformation of datetime
# Gaussian noise whose standard deviation is 0.5x the std of the noise-free target.
noise = rng.normal(0, 0.5 * np.std(target_perfect), n_samples)
target = target_perfect + noise

# Create a DataFrame to hold the generated data
df = pd.DataFrame({
    'datetime_feature': datetime_feature,
    'target': target
})

# Distractor features drawn independently of both the datetime and the target.
df['Random_0'] = rng.normal(loc=0, scale=1, size=n_samples)
df['Random_1'] = rng.normal(loc=0, scale=1, size=n_samples)
df['Random_2'] = rng.uniform(low=0, high=10, size=n_samples)
df['Random_3'] = rng.exponential(scale=1, size=n_samples)
df['Random_4'] = rng.binomial(10, 0.5, size=n_samples)

# Expand the datetime column into its parts, then infer feature attributes
# from the expanded frame.
df_split = split_and_clean_datetime_columns(df)
features_split = infer_feature_attributes(df_split)


In [8]:
# Build a Trainee from the inferred feature attributes, train it on the
# datetime-expanded frame, then run analyze() on it.
# NOTE(review): analyze() presumably tunes the trainee's internal parameters
# after training — confirm against the Howso Engine documentation.
t = Trainee(features=features_split)
t.train(df_split)
t.analyze()

In [9]:
# Last expression in the cell, so whatever get_cases() returns is rendered as
# the cell output (the table below).
t.get_cases()



Unnamed: 0,Random_0,Random_1,Random_2,Random_3,Random_4,datetime_feature_day,datetime_feature_day_of_week,datetime_feature_month,datetime_feature_quarter,datetime_feature_year,target
0,-0.083234,0.314658,0.230108,0.405624,7,1,6,1,1,2023,370244.698393
1,-1.306337,-0.066307,5.394431,0.050064,6,2,0,1,1,2023,368301.471210
2,-0.701051,-0.037267,2.971339,0.704069,6,3,1,1,1,2023,369435.569390
3,-0.375996,0.373927,1.465240,1.203467,6,4,2,1,1,2023,368538.178333
4,-0.531001,-0.285972,2.342781,0.150736,8,5,3,1,1,2023,369134.147034
...,...,...,...,...,...,...,...,...,...,...,...
9995,-1.081211,1.087545,8.981984,1.887473,3,14,5,5,2,2050,373772.133870
9996,-0.834399,0.501289,1.553702,0.811890,4,15,6,5,2,2050,373329.386079
9997,0.978747,1.268061,2.357988,0.891024,5,16,0,5,2,2050,373689.415567
9998,0.223244,0.965593,4.048433,0.557829,6,17,1,5,2,2050,375018.230567


In [5]:
# Set non_sensitive=True on every nominal and ordinal feature before training.
# NOTE(review): presumably this lets the Synthesizer reuse original category
# values in generated cases — confirm against the Howso Synthesizer docs.
categorical_names = features_split.get_names(types=("nominal", "ordinal"))
for feature_name in categorical_names:
    features_split[feature_name]["non_sensitive"] = True

with Synthesizer() as s:
    s.train(df_split, features_split)

    # Generate as many synthetic cases as there are rows in the training frame.
    n_to_generate = len(df_split)
    basic_synth = s.synthesize_cases(n_samples=n_to_generate, generate_new_cases="always")
    display(basic_synth)

    # Run the validation metrics while the synthesizer's trainee is still open.
    with Validator(s.trainee, basic_synth) as val:
        split_results = val.run_metrics()

The following parameters from your configuration will override the default Amalgam parameters: {'trace'}


Unnamed: 0,target,Random_0,Random_1,Random_2,Random_3,Random_4,datetime_feature_day,datetime_feature_month,datetime_feature_year,datetime_feature_day_of_week,datetime_feature_quarter
0,372342.536058,-1.202801,1.375931,6.644968,0.310068,6,17,6,2025,1,2
1,372922.811131,-2.687333,0.336423,4.959861,2.294680,4,1,4,2036,3,2
2,371842.033113,0.149675,-1.144626,4.067221,2.675365,7,18,7,2043,2,3
3,368951.812493,-1.410331,0.005785,6.775676,1.810205,3,15,7,2023,0,3
4,373537.119285,0.118652,0.260586,6.690273,2.344092,3,21,8,2048,4,3
...,...,...,...,...,...,...,...,...,...,...,...
9995,370521.314100,0.342146,-0.229798,9.248147,0.338629,5,5,5,2036,6,2
9996,373605.919414,-0.505344,-1.645385,9.509926,0.356972,0,13,7,2048,2,3
9997,373179.203254,-0.165820,-0.287679,0.858104,0.163768,6,15,12,2046,4,4
9998,372469.925038,1.008776,1.573786,0.672413,0.869694,6,8,1,2043,5,1


The following parameters from your configuration will override the default Amalgam parameters: {'trace'}


Validator Enterprise is enabled. Running without parallelism.
AnonymityPreservation       : Beginning run
AnonymityPreservation       : Run completed with desirability=5.0
DescriptiveStatistics       : Beginning run
DescriptiveStatistics       : Run completed with desirability=4.058
JointProbability            : Beginning run
JointProbability            : Run completed with desirability=4.569
ModelComparison             : Beginning run
ModelComparison             : Run completed with desirability=2.173
Run finished with overall_desirability=3.767
