In [None]:
import pandas as pd
import statistics
import numpy as np
import sys
from z3 import *
from sklearn.preprocessing import MinMaxScaler
from Supplements import *

# Experiment folders: results of run k are read from the sub-directory named str(k).
folders = list(range(1, 11))
data_path = 'datasets/family_income_24122020.csv'
# Drop the first CSV column (presumably a saved index -- TODO confirm) and keep
# only the first 1000 rows.
data = pd.read_csv(data_path, header=0).iloc[:, 1:]
data = data.iloc[:1000, :]
print(data.columns)

# NOTE(review): five target columns are split off below, yet n_features is
# computed as "all columns minus one". Left unchanged because other cells may
# rely on this value -- confirm the intended definition.
n_features = data.shape[1] - 1
n_instances = len(data)

# The last five columns are the targets; everything before them is a feature.
y = data.iloc[:, -5:]
# Take an explicit copy: X is modified below, and writing into a slice of
# `data` is chained assignment (SettingWithCopyWarning, ambiguous semantics).
X = data.iloc[:, 0:data.shape[1] - 5].copy()
# Binary encoding: 'Male' -> 1, anything else -> 0.
X['Household Head Sex'] = (X['Household Head Sex'] == 'Male').astype(int)

# Min-max scale every feature; rebuild the frame so all columns are float.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# One plain list of values per target column.
y = [list(y.iloc[:, i]) for i in range(y.shape[1])]
# MinMaxScaler computes X_scaled = X * scale_ + min_, so K = [min_, scale_] of
# the LAST feature is what is needed to map that feature back to original
# units: original = (scaled - K[0]) / K[1].
K = [scaler.min_[-1], scaler.scale_[-1]]
print(X.head())

# Accumulator for plot summaries; later cells append rows to it.
to_plot = pd.DataFrame()

In [None]:
# Count the number of violations in the data. count_violations comes from the
# Supplements module; only element 0 of its result is shown as the cell output.

violation_report = count_violations(X, y, K)
violation_report[0]

In [None]:
# average pairwise distance

from sklearn.metrics import pairwise_distances

def infy_pairwise_distance(a, b):
    """Return the L-infinity distance between two vectors.

    This is the largest absolute coordinate-wise difference between ``a`` and
    ``b``. Both arguments must support element-wise subtraction (e.g. NumPy
    arrays of equal length).
    """
    coordinate_gaps = abs(a - b)
    return max(coordinate_gaps)

# Mean L-infinity (Chebyshev) distance over the full n x n distance matrix of
# X against itself (self-pairs included). The built-in 'chebyshev' metric gives
# the same max(|a - b|) values as infy_pairwise_distance, but without one
# Python-level function call per pair of rows.
pairwise_distances(X, X, metric='chebyshev').mean()

In [None]:
# adversity index

def sumproduct(X, W):
    """Evaluate the affine form ``W[0] + sum(X[i] * W[i + 1])``.

    ``W`` holds the bias term first, followed by one weight per entry of ``X``
    (so it needs at least ``len(X) + 1`` elements). Only ``+`` and ``*`` are
    used, so the entries of ``X`` may be plain numbers or symbolic terms.
    """
    total = W[0]
    for position, value in enumerate(X):
        total = total + value * W[position + 1]
    return total

def adversity_index(X, parameters=None, K=None, num=10,
                    errors=(0.1, 0.01, 0.001),
                    discrete_columns=(0, 1, 2, 5, 6, 7, 8, 9, 11),
                    continuous_columns=(3, 4, 10, 12)):
    """Search for counter-examples in a small neighbourhood of every row of ``X``.

    For each tolerance in ``errors`` and each row of ``X``, a Z3 solver is asked
    for a point that

    * lies strictly within ``tolerance`` of the row in every coordinate,
    * takes, in every discrete column, one of the values observed in ``X``,
    * stays, in every continuous column, within the observed [min, max] range,
    * and (only when ``K`` is given) satisfies
      ``sum_j sumproduct(point, parameters[j]) > (point[-1] - K[0]) / K[1]``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; must contain every positional column listed in
        ``discrete_columns`` and ``continuous_columns``.
    parameters : list of sequences
        One weight vector per output, each laid out as ``[bias, w_1, ..., w_d]``
        as expected by ``sumproduct``. Required.
    K : sequence of two numbers, optional
        ``[offset, scale]`` used to rescale the last feature in the violation
        constraint. When ``None`` no violation constraint is asserted, so any
        row whose neighbourhood is compatible with the domain constraints is
        reported.
    num : int
        Currently unused; kept for backward compatibility.
    errors : sequence of float
        Neighbourhood radii to test (one output frame per entry).
    discrete_columns, continuous_columns : sequences of int
        Positional column indices treated as discrete / continuous.

    Returns
    -------
    list of pandas.DataFrame
        One frame per tolerance. Each row is ``[1-based row number of X,
        solution values...]`` with columns ``['original_data_instance'] +
        list(X.columns)``. A tolerance with no solution yields an empty frame.

    Raises
    ------
    ValueError
        If ``parameters`` is ``None``.
    """
    if parameters is None:
        # Raise instead of sys.exit(): exiting the interpreter from a helper is
        # hostile in a notebook and hides the cause from callers.
        raise ValueError('please provide a valid set of parameters')

    feature_names = list(X.columns)
    n_rows, n_cols = X.shape
    _X = RealVector('x', n_cols)

    # The domain constraints depend neither on the row nor on the tolerance,
    # so they are built once and re-asserted after every solver reset.
    domain_constraints = []
    for k in discrete_columns:
        observed_values = np.unique(list(X.iloc[:, k]))
        domain_constraints.append(Or([_X[k] == v for v in observed_values]))
    for k in continuous_columns:
        column_values = list(X.iloc[:, k])
        domain_constraints.append(And(_X[k] <= max(column_values),
                                      _X[k] >= min(column_values)))

    # The violation constraint is row-independent as well.
    violation_constraint = None
    if K is not None:
        violation_constraint = Sum([sumproduct(_X, parameters[j])
                                    for j in range(len(parameters))]) > (_X[-1] - K[0]) / K[1]

    output = []
    for error in errors:

        print('running for error:', error)
        s = Solver()
        # Collect rows in a list and build the frame once: DataFrame.append was
        # removed in pandas 2.0 and was quadratic when used in a loop.
        found_rows = []

        for i in range(n_rows):

            if i % 500 == 0:
                print('tested for {} instances'.format(i))

            instance = list(X.iloc[i, :])

            s.add(domain_constraints)
            # Open box of radius `error` around the current row.
            s.add([And(_X[j] > instance[j] - error,
                       _X[j] < instance[j] + error) for j in range(n_cols)])
            if violation_constraint is not None:
                s.add(violation_constraint)

            if s.check() == sat:
                model = s.model()
                solution = [model[x].numerator_as_long() / model[x].denominator_as_long()
                            for x in _X]
                found_rows.append([i + 1] + solution)
            # reset() drops every assertion, hence the re-adding above.
            s.reset()

        if found_rows:
            output.append(pd.DataFrame(found_rows,
                                       columns=['original_data_instance'] + feature_names))
        else:
            output.append(pd.DataFrame())

    return output

In [None]:
def _mean_and_stdev(values):
    """Return ``(mean, sample standard deviation)`` of an iterable of numbers."""
    values = list(values)
    return statistics.mean(values), statistics.stdev(values)


# Rows are collected in plain lists and turned into DataFrames once:
# DataFrame.append was removed in pandas 2.0 and gave every appended row index 0.
out_b_rows = []
for f in folders:
    print('Running for Experiment:', f)
    baseline_results = pd.read_csv(str(f) + '/all_baseline_outputs.csv').iloc[:, 1:]
    baseline_results = baseline_results[baseline_results['margin/alpha'] != 'PP']
    # If the file contained 'PP' rows the column was read as strings, and a
    # comparison with the integer alphas below would never match. Compare
    # against a numeric view instead (non-numeric labels become NaN and simply
    # never match).
    alpha_values = pd.to_numeric(baseline_results['margin/alpha'], errors='coerce')

    alphas = [0, 1, 2, 5, 10, 50, 100, 1000, 10000, 100000, 1000000, 10000000]
    processed_rows = []

    for a in alphas:
        baseline_results_sub = baseline_results[alpha_values == a]
        mean_accuracy_test, std_accuracy_test = _mean_and_stdev(
            baseline_results_sub['total_mse_test'])
        mean_corrected_accuracy_test, std_corrected_accuracy_test = _mean_and_stdev(
            baseline_results_sub['custom_metric_test_2'])

        mean_val_violations, std_val_violations = _mean_and_stdev(
            baseline_results_sub['violations_val_prediction'])
        mean_test_violations, std_test_violations = _mean_and_stdev(
            baseline_results_sub['violations_test_prediction'])
        mean_counter_examples = statistics.mean(list(baseline_results_sub['counter_examples_found']))
        mean_runtime, std_runtime = _mean_and_stdev(baseline_results_sub['runtime'])

        processed_rows.append([a, mean_accuracy_test, std_accuracy_test,
                               mean_corrected_accuracy_test, std_corrected_accuracy_test,
                               mean_val_violations, std_val_violations,
                               mean_test_violations, std_test_violations,
                               mean_counter_examples,
                               mean_runtime, std_runtime])

    processed_baseline_results = pd.DataFrame(
        processed_rows,
        columns=['alpha', 'average_total_mse', 'std_total_mse',
                 'average_corrected_mse', 'std_corrected_mse',
                 'mean val violations', 'stddev val violations',
                 'mean test violations', 'stddev test violations', 'mean_CE',
                 'mean_runtime', 'std_runtime'])
    # Model selection: the alpha with the fewest mean validation violations
    # (first one wins on ties).
    best_row_label = processed_baseline_results['mean val violations'].idxmin()
    selected_alpha = processed_baseline_results.loc[best_row_label, 'alpha']
    print(processed_baseline_results)
    print(selected_alpha)

    # Calculate the adversity index only for the selected alpha.
    baseline_results_sub = baseline_results[alpha_values == selected_alpha].reset_index(drop=True)
    adversity_index_values = [[], [], []]
    print(baseline_results_sub)

    for i in range(baseline_results_sub.shape[0]):
        print('Running for Fold:', i + 1)
        # Positional column 7 holds the serialized weights of the fold.
        weights = baseline_results_sub.iloc[i, 7]
        # SECURITY NOTE: eval() executes whatever the CSV cell contains. Only
        # run this on result files you produced yourself; if the cell is a
        # plain nested list literal, ast.literal_eval is the safe replacement.
        weights = [[float(v) for v in w] for w in eval(weights)]
        counter_examples = adversity_index(X, weights, K=K)
        # Fraction of instances with a counter-example, per tolerance level.
        for level in range(len(adversity_index_values)):
            adversity_index_values[level].append(counter_examples[level].shape[0] / X.shape[0])

    print(adversity_index_values)
    selected_row = processed_baseline_results[
        processed_baseline_results['alpha'] == selected_alpha].iloc[0]
    out_b_rows.append([f, selected_alpha,
                       selected_row['average_total_mse'],
                       selected_row['std_total_mse'],
                       selected_row['average_corrected_mse'],
                       selected_row['std_corrected_mse'],
                       selected_row['mean_runtime'],
                       selected_row['std_runtime'],
                       statistics.mean(adversity_index_values[0]),
                       statistics.stdev(adversity_index_values[0]),
                       statistics.mean(adversity_index_values[1]),
                       statistics.stdev(adversity_index_values[1]),
                       statistics.mean(adversity_index_values[2]),
                       statistics.stdev(adversity_index_values[2])])

out_b = pd.DataFrame(
    out_b_rows,
    columns=['experiment', 'selected_alpha', 'average_total_mse', 'std_total_mse',
             'average_corrected_mse', 'std_corrected_mse',
             'mean_runtime', 'std_runtime',
             'average_adversity_index_0.1', 'std_adversity_index_0.1',
             'average_adversity_index_0.01', 'std_adversity_index_0.01',
             'average_adversity_index_0.001', 'std_adversity_index_0.001'])
print(out_b)

# Averages over all experiments, printed one value per line in this order.
for column in ['average_total_mse', 'std_total_mse',
               'average_corrected_mse', 'std_corrected_mse',
               'average_adversity_index_0.001', 'std_adversity_index_0.001',
               'average_adversity_index_0.01', 'std_adversity_index_0.01',
               'average_adversity_index_0.1', 'std_adversity_index_0.1',
               'mean_runtime', 'std_runtime']:
    print(statistics.mean(out_b[column]))

# NOTE: re-running this cell adds another 'b' row to to_plot.
to_plot = pd.concat([to_plot,
                     pd.DataFrame([['b', statistics.mean(out_b['average_total_mse']),
                                    statistics.mean(out_b['std_total_mse']),
                                    statistics.mean(out_b['mean_runtime']),
                                    statistics.mean(out_b['std_runtime'])]])])

In [None]:
# Per-experiment summary of the SaDe-L runs. Rows are collected in a list and
# the frame is built once: DataFrame.append was removed in pandas 2.0.
out_l_rows = []
for f in folders:
    results = pd.read_csv(str(f) + '/all_outputs_sade_l.csv').iloc[:, 1:]
    # Rows whose learning_rate is the placeholder '-' are excluded.
    results = results[~(results['learning_rate'] == '-')]
    # Values are cast to float explicitly because the columns may have been
    # read as strings.
    runtimes = [float(r) for r in results['runtime']]
    mean_runtime = statistics.mean(runtimes)
    std_runtime = statistics.stdev(runtimes)
    total_mse = [float(value) for value in results['total_mse_test']]
    total_corrected_mse = [float(value) for value in results['custom_metric_test_2']]
    out_l_rows.append([f, statistics.mean(total_mse),
                       statistics.stdev(total_mse),
                       statistics.mean(total_corrected_mse),
                       statistics.stdev(total_corrected_mse),
                       mean_runtime, std_runtime])

out_l = pd.DataFrame(
    out_l_rows,
    columns=['experiment', 'average_total_mse', 'stdev_total_mse',
             'average_corrected_mse', 'stdev_corrected_mse',
             'mean_runtime', 'std_runtime'])
print(out_l)

# Averages over all experiments, printed one value per line in this order.
for column in ['average_total_mse', 'stdev_total_mse',
               'average_corrected_mse', 'stdev_corrected_mse',
               'mean_runtime', 'std_runtime']:
    print(statistics.mean(out_l[column]))

### Plots

In [None]:
import pickle
import matplotlib.pyplot as plt

# Global matplotlib styling. It does not depend on the experiment, so it is
# set once instead of on every loop iteration.
plt.rcParams["font.weight"] = "bold"
plt.rcParams["axes.labelweight"] = "bold"
plt.rcParams["axes.titleweight"] = "bold"
plt.rcParams['font.size'] = 14
plt.rc('figure', figsize=(10, 5))

# Number of loss curves plotted per experiment; the pickle must contain at
# least this many entries.
folds = 5

for f in folders:
    # SECURITY NOTE: pickle.load can execute arbitrary code; only load files
    # produced by your own experiment runs.
    with open(str(f) + "/all_test_losses_sade_l.pkl", "rb") as openfile:
        out = pickle.load(openfile)
    # Presumably one sequence of per-iteration test losses per fold -- TODO confirm.
    out = [[float(value) for value in losses] for losses in out]

    for e in range(folds):
        fig, ax = plt.subplots()
        ax.plot(out[e])
        ax.set_xlabel('Iterations')
        ax.set_ylabel('Loss')
        ax.set_title('Expense Prediction (experiment = {}, fold = {})'.format(f, e + 1))
        name = str(f) + '/expenses_data_{}.jpeg'.format(e + 1)
        # Save before show(): the inline backend may clear the figure on show.
        fig.savefig(name)
        plt.show()
        # Close explicitly so 50 figures do not accumulate in memory.
        plt.close(fig)