In [1]:
from pathlib import Path

import pandas as pd
from pmlb import fetch_data

from howso.engine import (
    load_trainee,
    Trainee,
)
from howso.utilities import infer_feature_attributes

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import time
from howso.validator import Validator

In [2]:
import pandas as pd

# Function to process datetime columns and drop columns with all same values
def split_and_clean_datetime_columns(df):
    # Loop through all columns
    for col in df.columns:
        # Check if the column is of datetime type or can be converted to datetime
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            # Ensure column is in datetime format
            df[col] = pd.to_datetime(df[col], errors='coerce')

            # Extract datetime components into new columns
            df[f'{col}_day'] = df[col].dt.day
            df[f'{col}_month'] = df[col].dt.month
            df[f'{col}_year'] = df[col].dt.year
            df[f'{col}_day_of_week'] = df[col].dt.dayofweek
            df[f'{col}_hour'] = df[col].dt.hour
            df[f'{col}_minute'] = df[col].dt.minute
            df[f'{col}_second'] = df[col].dt.second
            df[f'{col}_quarter'] = df[col].dt.quarter

            # Drop the original datetime column
            df.drop(columns=[col], inplace=True)

    df = df.loc[:, (df != 0).any(axis=0)]

    return df

In [3]:
regular_datetime_stats = []
regular_datetime_timings_total = []
regular_datetime_timings_train = []
regular_datetime_timings_analyze = []
regular_datetime_timings_react = []
split_datetime_stats = []
split_datetime_timings_total = []
split_datetime_timings_train = []
split_datetime_timings_analyze = []
split_datetime_timings_react = []
for i in range(1):
    n_samples = 10000
    datetime_feature = pd.date_range(start='2023-01-01', periods=n_samples, freq='D')
    datetime_numeric = datetime_feature.map(pd.Timestamp.toordinal).values
    target_perfect = datetime_numeric * 0.5  # A linear transformation of datetime
    noise = np.random.normal(0, 0.5 * np.std(target_perfect), n_samples)  # Add 20% random noise
    target = target_perfect + noise

    # 6. Create a DataFrame to hold the generated data
    df = pd.DataFrame({
        'datetime_feature': datetime_feature,
        'target': target
    })

    df['Random_0'] = np.random.normal(loc=0, scale=1, size=n_samples)
    df['Random_1'] = np.random.normal(loc=0, scale=1, size=n_samples)
    df['Random_2'] = np.random.uniform(low=0, high=10, size=n_samples)
    df['Random_3'] = np.random.exponential(scale=1, size=n_samples)
    df['Random_4'] = np.random.binomial(10, 0.5, size=n_samples)

    df_split = split_and_clean_datetime_columns(df)
    features = infer_feature_attributes(df)
    features_split = infer_feature_attributes(df_split)
    features_split["datetime_feature_day"]["cycle_length"] = 30
    features_split["datetime_feature_day"]["type"] = 'continuous'
    features_split["datetime_feature_month"]["cycle_length"] = 11
    features_split["datetime_feature_month"]["type"] = 'continuous'
    features_split["datetime_feature_day_of_week"]["cycle_length"] = 6
    features_split["datetime_feature_day_of_week"]["type"] = 'continuous'
    features_split["datetime_feature_quarter"]["cycle_length"] = 4
    features_split["datetime_feature_quarter"]["type"] = 'continuous'
    action_features = ['target']
    context_features = features.get_names(without=action_features)
    context_features_split = features_split.get_names(without=action_features)

    t_split = Trainee(features=features_split)

    split_total_start_time = time.time()
    start_time = time.time()
    t_split.train(df_split)
    end_time = time.time()
    duration = end_time - start_time
    split_datetime_timings_train.append(duration)

    start_time = time.time()
    t_split.analyze(context_features=context_features_split, action_features=action_features)
    end_time = time.time()
    duration = end_time - start_time
    split_datetime_timings_analyze.append(duration)

    start_time = time.time()
    stats_split = t_split.react_aggregate(
        action_feature=action_features[0],
        details = {
            "prediction_stats": True,
            "selected_prediction_stats": ["all"]
        }
    )
    end_time = time.time()
    duration = end_time - start_time
    split_total_duration = end_time - split_total_start_time
    split_datetime_timings_react.append(duration)
    split_datetime_timings_total.append(split_total_duration)

    split_datetime_stats.append(stats_split['target'])


    t = Trainee(features=features)

    reg_total_start_time = time.time()
    start_time = time.time()
    t.train(df)
    end_time = time.time()
    duration = end_time - start_time
    regular_datetime_timings_train.append(duration)

    start_time = time.time()
    t.analyze(context_features=context_features, action_features=action_features)
    end_time = time.time()
    duration = end_time - start_time
    regular_datetime_timings_analyze.append(duration)

    start_time = time.time()
    stats = t.react_aggregate(
        action_feature=action_features[0],
        details = {
            "prediction_stats": True,
            "selected_prediction_stats": ["all"]
        }
    )
    end_time = time.time()
    duration = end_time - start_time
    reg_total_duration = end_time - reg_total_start_time
    regular_datetime_timings_total.append(reg_total_duration)
    regular_datetime_timings_react.append(duration)
    regular_datetime_stats.append(stats['target'])



The following parameters from your configuration will override the default Amalgam parameters: {'trace'}


In [13]:
df

Unnamed: 0,target,Random_0,Random_1,Random_2,Random_3,Random_4,datetime_feature_day,datetime_feature_month,datetime_feature_year,datetime_feature_day_of_week,datetime_feature_hour,datetime_feature_minute,datetime_feature_second,datetime_feature_quarter
0,367757.352067,-0.091567,0.286899,5.869872,0.606734,5,1,1,2023,6,0,0,0,1
1,369134.770219,-0.126231,0.527523,2.723673,1.066459,5,2,1,2023,0,0,0,0,1
2,368085.615726,1.400131,-0.252415,8.406839,0.320269,5,3,1,2023,1,0,0,0,1
3,369410.541430,-0.345046,0.673759,6.267236,1.586154,2,4,1,2023,2,0,0,0,1
4,368210.508007,-1.437975,0.229683,1.341575,0.075930,7,5,1,2023,3,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,373684.460042,0.546406,0.949753,9.235508,0.604342,6,14,5,2050,5,0,0,0,2
9996,372868.738168,-0.319489,0.428396,0.376062,0.283163,6,15,5,2050,6,0,0,0,2
9997,374525.798863,1.074603,2.103681,7.648113,1.220723,3,16,5,2050,0,0,0,0,2
9998,372879.025522,0.000650,0.899212,3.868655,0.318234,5,17,5,2050,1,0,0,0,2


In [15]:
t_split.get_cases()



Unnamed: 0,Random_0,Random_1,Random_2,Random_3,Random_4,datetime_feature_day,datetime_feature_day_of_week,datetime_feature_month,datetime_feature_quarter,datetime_feature_year,target
0,-0.091567,0.286899,5.869872,0.606734,5,,,,,2023,367757.352067
1,-0.126231,0.527523,2.723673,1.066459,5,,,,,2023,369134.770219
2,1.400131,-0.252415,8.406839,0.320269,5,,,,,2023,368085.615726
3,-0.345046,0.673759,6.267236,1.586154,2,,,,,2023,369410.541430
4,-1.437975,0.229683,1.341575,0.075930,7,,,,,2023,368210.508007
...,...,...,...,...,...,...,...,...,...,...,...
9995,0.546406,0.949753,9.235508,0.604342,6,,,,,2050,373684.460042
9996,-0.319489,0.428396,0.376062,0.283163,6,,,,,2050,372868.738168
9997,1.074603,2.103681,7.648113,1.220723,3,,,,,2050,374525.798863
9998,0.000650,0.899212,3.868655,0.318234,5,,,,,2050,372879.025522


In [4]:
reg_train_time = sum(regular_datetime_timings_train) / len(regular_datetime_timings_train)
split_train_time = sum(split_datetime_timings_train) / len(split_datetime_timings_train)
print("Regular DateTime Train:")
print(reg_train_time)
print("Split DateTime Train:")
print(split_train_time)

Regular DateTime Train:
0.43015408515930176
Split DateTime Train:
0.26256895065307617


In [5]:
reg_analyze_time = sum(regular_datetime_timings_analyze) / len(regular_datetime_timings_analyze)
split_analyze_time = sum(split_datetime_timings_analyze) / len(split_datetime_timings_analyze)
print("Regular DateTime Analyze:")
print(reg_analyze_time)
print("Split DateTime Analyze:")
print(split_analyze_time)

Regular DateTime Analyze:
24.569087028503418
Split DateTime Analyze:
27.78687596321106


In [6]:
reg_react_time = sum(regular_datetime_timings_react) / len(regular_datetime_timings_react)
split_react_time = sum(split_datetime_timings_react) / len(split_datetime_timings_react)
print("Regular DateTime React:")
print(reg_react_time)
print("Split DateTime React:")
print(split_react_time)

Regular DateTime React:
1.7284278869628906
Split DateTime React:
2.1252570152282715


In [7]:
reg_total_time = sum(regular_datetime_timings_total) / len(regular_datetime_timings_total)
split_total_time = sum(split_datetime_timings_total) / len(split_datetime_timings_total)
print("Regular DateTime Total:")
print(reg_total_time)
print("Split DateTime Total:")
print(split_total_time)

Regular DateTime Total:
26.72767210006714
Split DateTime Total:
30.174705982208252


In [8]:
# Initialize dictionaries to store sums and counts
sum_dict = {key: 0 for key in regular_datetime_stats[0].keys()}
count_dict = {key: 0 for key in regular_datetime_stats[0].keys()}

# Loop through the list of dictionaries
for d in regular_datetime_stats:
    for key, value in d.items():
        if not np.isnan(value):  # Check if the value is not NaN
            sum_dict[key] += value  # Add to the sum
            count_dict[key] += 1    # Increase the count

# Calculate the average for each key
regular_stats = {key: (sum_dict[key] / count_dict[key]) if count_dict[key] > 0 else np.nan for key in sum_dict}
regular_stats

{'mcc': nan,
 'recall': nan,
 'accuracy': nan,
 'mae': 625.1540958689989,
 'precision': nan,
 'confusion_matrix': nan,
 'spearman_coeff': 0.8868636468636468,
 'rmse': 785.2056839692877,
 'r2': 0.769676505577357,
 'smape': 0.1682012970804315,
 'adjusted_smape': 0.16820129707774542}

In [9]:
# Initialize dictionaries to store sums and counts
sum_dict = {key: 0 for key in split_datetime_stats[0].keys()}
count_dict = {key: 0 for key in split_datetime_stats[0].keys()}

# Loop through the list of dictionaries
for d in split_datetime_stats:
    for key, value in d.items():
        if not np.isnan(value):  # Check if the value is not NaN
            sum_dict[key] += value  # Add to the sum
            count_dict[key] += 1    # Increase the count

# Calculate the average for each key
split_stats = {key: (sum_dict[key] / count_dict[key]) if count_dict[key] > 0 else np.nan for key in sum_dict}
split_stats

{'mcc': nan,
 'recall': nan,
 'accuracy': nan,
 'mae': 624.141068657656,
 'precision': nan,
 'confusion_matrix': nan,
 'spearman_coeff': 0.8888361488361488,
 'rmse': 772.7372662200979,
 'r2': 0.7777721400083731,
 'smape': 0.16788841476800043,
 'adjusted_smape': 0.16788841476532032}

In [10]:
from howso.synthesizer import Synthesizer


In [11]:
with Synthesizer() as s:
    s.load_trainee(t.id)
    basic_synth = s.synthesize_cases(n_samples=len(df), generate_new_cases="always")
    display(basic_synth)

    with Validator(s.trainee, basic_synth) as val:
        reg_results = val.run_metrics()

The following parameters from your configuration will override the default Amalgam parameters: {'trace'}


Unnamed: 0,datetime_feature_year,datetime_feature_day_of_week,datetime_feature_minute,datetime_feature_day,datetime_feature_quarter,Random_4,datetime_feature_second,Random_3,datetime_feature_month,Random_0,Random_1,datetime_feature_hour,Random_2,target
0,2036,0,0,9,4,2,0,1.173021,11,-1.181948,0.505485,0,3.969175,371569.340176
1,2024,6,0,20,1,3,0,1.133165,7,0.644017,0.286136,0,5.875147,372435.896179
2,2039,5,0,26,1,7,0,2.792654,3,-0.286605,-0.602591,0,2.804219,371813.888008
3,2041,2,0,4,2,4,0,0.211770,9,-0.576089,0.437306,0,5.807871,368768.543886
4,2037,1,0,3,4,7,0,0.378827,12,0.986583,0.658922,0,7.649050,371671.290534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2034,2,0,25,4,5,0,3.251606,10,2.075022,1.888982,0,9.871844,371869.023614
9996,2026,0,0,26,4,7,0,0.310757,7,0.616522,1.192925,0,5.723490,370670.869018
9997,2050,5,0,4,2,2,0,0.792506,4,0.963004,1.030563,0,7.993495,373139.490870
9998,2049,5,0,21,1,5,0,0.457153,5,-0.406086,-0.763920,0,9.062500,375125.582247


Validator Enterprise is enabled. Running without parallelism.
AnonymityPreservation       : Beginning run
AnonymityPreservation       : Run completed with desirability=5.0
DescriptiveStatistics       : Beginning run
DescriptiveStatistics       : Run completed with desirability=4.433
JointProbability            : Beginning run
JointProbability            : Run completed with desirability=4.839
ModelComparison             : Beginning run
ModelComparison             : Run completed with desirability=1.949
Run finished with overall_desirability=3.802


In [12]:
for f_name in features_split.get_names(types=("nominal", "ordinal")):
    features_split[f_name]["non_sensitive"] = True

with Synthesizer() as s:
    s.train(df_split, features_split)
    basic_synth = s.synthesize_cases(n_samples=len(df_split), generate_new_cases="always")
    display(basic_synth)

    # with Validator(s.trainee, basic_synth) as val:
    #     split_results = val.run_metrics()

The following parameters from your configuration will override the default Amalgam parameters: {'trace'}


Unnamed: 0,target,Random_0,Random_1,Random_2,Random_3,Random_4,datetime_feature_day,datetime_feature_month,datetime_feature_year,datetime_feature_day_of_week,datetime_feature_quarter
0,369396.341722,-0.782715,0.498898,4.845478,4.481181,7,,,2023,,
1,373391.917710,-0.131385,0.106813,5.779613,0.217230,7,,,2044,,
2,371060.657616,0.326764,0.143287,7.740790,1.777948,5,,,2031,,
3,373104.454647,-1.911772,-0.509979,5.278448,0.682120,7,,,2033,,
4,370193.830097,-1.263588,-1.812259,5.917059,0.658609,4,,,2033,,
...,...,...,...,...,...,...,...,...,...,...,...
9995,370952.684814,0.909280,1.403733,9.697954,0.225193,6,,,2024,,
9996,374698.760805,0.608994,0.976527,3.987860,2.469936,5,,,2050,,
9997,369898.018358,0.377227,0.532110,8.260108,0.470390,4,,,2028,,
9998,368705.572738,0.242782,0.000975,5.366263,0.534464,7,,,2048,,
