In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, mean_squared_error, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB




# Basic preprocessing
def preprocess(df):
    df_new = df.copy()
    
    # Discard identifiers, style information, timestamps
    df_new = df_new[df_new.columns.difference(['image', 'style', 'reviewTime', 
                                               'reviewerID', 'asin', 'reviewerName', 'unixReviewTime'])]

    # Turn category into binary features
    for cat in df_new.category.unique():
        df_new[cat] = df_new['category'] == cat

    # Drop category column
    df_new.drop(columns=['category'], inplace=True)

    # NaN vote is 0 users found helpful
    df_new.vote.fillna(0, inplace=True)
    
    # Turn vote into binary feature
    df_new.vote = df_new.vote > 0
    # df_new.vote.clip(0, 10)
    # df_new.vote = df_new.vote / 10

    # NaN summary is empty summary
    df_new.summary.fillna('', inplace=True)

    # Turn Booleans into binary variables
    df_new.replace({False: 0, True: 1}, inplace=True)
    
    return df_new

# Remove 'overall' column and add cutoff column applying cutoff
def apply_cutoff(df, cutoff):
    df_new = df.copy()
    
    # Apply cutoff
    cut = df['overall'] > cutoff
    df_new['cutoff'] = cut

    # Drop overall and category
    df_new.drop(columns=['overall'], inplace=True)
    
    # Turn Booleans into binary variables
    df_new.replace({False: 0, True: 1}, inplace=True)
    
    return df_new
def apply_tfidf(df, review_vectorizer, summary_vectorizer):
    review_matrix = pd.DataFrame(data=review_vectorizer.transform(df.reviewText).toarray(), columns='R_' + review_vectorizer.get_feature_names_out())
    summary_matrix = pd.DataFrame(data=summary_vectorizer.transform(df.summary).toarray(), columns='S_' + summary_vectorizer.get_feature_names_out())
    df_new = pd.concat([df, review_matrix, summary_matrix], axis=1)
    df_new.drop(columns=['summary', 'reviewText'], inplace=True)
    return df_new


# Processing the data - I
# Preprocessing of training data
def load_and_process_data():
    training_df = pd.read_csv('../data/Training.csv')
    test_df = pd.read_csv('../data/Test.csv')

    proc_training_df = apply_cutoff(preprocess(training_df), 1)

    # Set cutoff to be the label; define data_x and y accordingly
    data_x = proc_training_df.drop('cutoff', axis=1)
    data_y = proc_training_df['cutoff']

    # Fit TF-IDF vectorizer for 'reviewText' and 'summary' features, creating max. 11500 features.
    r_vectorizer = TfidfVectorizer(max_features=11500, stop_words='english', ngram_range=(1, 3))
    s_vectorizer = TfidfVectorizer(max_features=11500, stop_words='english', ngram_range=(1, 3))
    r_vectorizer.fit(data_x.reviewText)
    s_vectorizer.fit(data_x.summary)

    # Apply TF-IDF vectorization 
    data_x = apply_tfidf(data_x, r_vectorizer, s_vectorizer)

    # Apply robust scaling
    scaler = preprocessing.RobustScaler()
    data_x = pd.DataFrame(scaler.fit_transform(data_x), columns=data_x.columns, index=data_x.index)

    # Let us reduce the number of features by eliminating the statistically least correlated ones.
    relcols = data_x.columns[abs(data_x.corrwith(data_y)) > 0.01]


    # We will go with these columns.
    data_x = data_x[relcols]

    return data_x, data_y

def create_splits(data_x, data_y, n_splits):
    # 5-fold cross validation
    kf = KFold(n_splits=n_splits, shuffle=True)
    splits = []
    for train_idx, val_idx in kf.split(data_x, data_y):
        # Apply split
        x_train, x_val = data_x.iloc[train_idx], data_x.iloc[val_idx]
        y_train, y_val = data_y.iloc[train_idx], data_y.iloc[val_idx]
        
        # Reset indices
        x_train.reset_index(drop=True, inplace=True)
        y_train.reset_index(drop=True, inplace=True)
        x_val.reset_index(drop=True, inplace=True)
        y_val.reset_index(drop=True, inplace=True)
        splits.append((x_train, x_val, y_train, y_val))
    return splits



In [None]:

# X_DATA, Y_DATA = load_and_process_data()

# test_splits = create_splits(X_DATA, Y_DATA, 5)

# print(len(test_splits[0][1]), len(X_DATA))

# Define and create the population data

In [None]:
Q_MEAN = 0.5
EPOCH = 2
POPULATION_SIZE = 500000
TRAIN_DIM = 100
STEP = 1

In [None]:
import numpy as np

def initialize_with_str_seed(init_str):
    """
    Initializes random number generator with seed corresponding to given input string init_str.
    :param init_str: Initialization string according to which seed will be computed. Seed is the sum of the ASCII
                     values of each character in init_str.
    """
    rnd_val = 0
    if init_str:
        for c in init_str:
            rnd_val += ord(c)
    np.random.seed(rnd_val)

def gen_data(n, d, seed = None):
    if seed:
        initialize_with_str_seed(seed)
    p = (1.0 + np.sqrt(max(2 * Q_MEAN - 1, 1 - 2 * Q_MEAN))) / 2 
    data = np.random.choice([-1, 1], (n, d), p=[1 -p, p])
    data_y = np.random.choice([0, 1], n, p=[1 -p, p])
    return data, data_y

def gen_valid(n, d, seed = None):
    if seed:
        initialize_with_str_seed(seed)
    
    n = int(n/10)
    
    p = (1.0 + np.sqrt(max(2 * Q_MEAN - 1, 1 - 2 * Q_MEAN))) / 2 
    data = np.random.choice([-1, 1], (n, d), p=[1 -p, p])
    data_y = np.random.choice([0, 1], n, p=[1 -p, p])
    return data, data_y

In [None]:
x_population, y_population = gen_data(POPULATION_SIZE, TRAIN_DIM)
x_valid, y_valid = gen_data(int(POPULATION_SIZE/500), TRAIN_DIM)

# Define the evaluation functions

In [None]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [None]:
import sys
sys.path.append("..")
from mechanism.mechanized_models import Mechanism
from mechanism.mechanized_models import MechanizedGaussianNB, MechanizedLogisticRegression, MechanizedOneVSRest, MechanizedDecisionTree
from mechanism.mechanized_models import MechanizedGridSearchCV

def hyper_parameter(splits):
    x_train, x_val, y_train, y_val = splits[0]
    estimator = MechanizedLogisticRegression(max_iter=1500)
    estimator.choose_mechanism(Mechanism.GAUSSIAN)
    gs_cls = MechanizedOneVSRest(estimator = estimator)
    gs_cls.choose_mechanism(Mechanism.GAUSSIAN)

    params_LR = {'estimator__C': np.logspace(-0.2, 0.7, num = 10)}
    gs_LR = GridSearchCV(estimator=gs_cls, param_grid=params_LR, cv = 2, verbose=2, scoring='f1_macro')
    gs_LR.fit(x_train, y_train)


BEST_C = 1.5848931924611134
'''
C=1.5848931924611134, max_iter=1500
'''

def generalization_error(true_val, pred_val):
    return np.sqrt(abs(mean_squared_error(true_val, pred_val) - Q_MEAN))



def eval_multiple_rounds(stepped_rounds, mechanism, non_adaptive_num):
    generalization_error_list = []
    for r in stepped_rounds:
        estimator = MechanizedLogisticRegression(C = BEST_C, max_iter = r, mechanism = mechanism, solver = 'sag')
        generalization_error_list.append(eval(non_adaptive_num, estimator, mechanism))
    return generalization_error_list



# Evaluation the O(1) adaptivity program

In [None]:
def eval_const(round, train_size, mechanism):
    # f1_scores, acc_scores, models = [], [], [], [], []
    x_train, y_train = x_population[:train_size], y_population[:train_size]    
    model = MechanizedLogisticRegression(C = BEST_C, max_iter = round, mechanism = mechanism)

    model.fit(x_train, y_train)
    # Predict
    y_pred = model.predict(x_valid)

    return generalization_error(y_valid, y_pred)

def eval_const_rounds(round, mechanism, stepped_non_adaptive_num):
    generalization_error_list = []
    for train_size in stepped_non_adaptive_num:
        generalization_error_list.append(eval_const(round, train_size, mechanism))
    return generalization_error_list

In [None]:
stepped_non_adaptive_num = range(1000, 100000, 20)
round = 5
stepped_non_adaptive_num = range(1000, 1010, 10)


In [None]:

baseline_generalization_error_list = eval_const_rounds(round, Mechanism(mechanism_type = Mechanism.MechanismType.NONE), stepped_non_adaptive_num)
# baseline_generalization_error_list = [0.47190818773552584, 0.473107228502912, 0.473792394655704, 0.47070914696813976, 0.4659129838985954, 0.4720794792737239, 0.47447756080849607, 0.473107228502912, 0.4775608084960603, 0.47619047619047616, 0.4712230215827338, 0.47447756080849607, 0.47567660157588215, 0.473792394655704, 0.47533401849948614]
print(np.mean(baseline_generalization_error_list))

In [None]:
gaussian_generalization_error_list = eval_const_rounds(round, Mechanism(mechanism_type = Mechanism.MechanismType.GAUSSIAN, sigma = 0.5), stepped_non_adaptive_num)
# gaussian_generalization_error_list = [0.3984241178485783, 0.4035628639945187, 0.39482699554642003, 0.3977389516957862, 0.3977389516957862, 0.3917437478588558, 0.3975676601575882, 0.39585474477560806, 0.38866050017129156, 0.3970537855429942, 0.39568345323741005, 0.3857485440219253, 0.38506337786913325, 0.39157245632065774, 0.3871188763275094]

print(gaussian_generalization_error_list)


In [None]:
threshold_generalization_error_list = eval_const_rounds(round, Mechanism(mechanism_type = Mechanism.MechanismType.THRESHOLD, sigma = 0.1, hold_frac = 0.7, threshold = 0.8), stepped_non_adaptive_num)
# gaussian_generalization_error_list = [0.3984241178485783, 0.4035628639945187, 0.39482699554642003, 0.3977389516957862, 0.3977389516957862, 0.3917437478588558, 0.3975676601575882, 0.39585474477560806, 0.38866050017129156, 0.3970537855429942, 0.39568345323741005, 0.3857485440219253, 0.38506337786913325, 0.39157245632065774, 0.3871188763275094]

print(threshold_generalization_error_list)
print(np.mean(threshold_generalization_error_list))

In [None]:
data_split_generalization_error_list = eval_const_rounds(round, Mechanism(mechanism_type = Mechanism.MechanismType.DATASPLIT), stepped_non_adaptive_num)
print(np.mean(data_split_generalization_error_list))

In [None]:
gaussian_generalization_error_list = eval_const_rounds(round, Mechanism(mechanism_type = Mechanism.MechanismType.GAUSSIAN, sigma = 0.5), stepped_non_adaptive_num)
print(np.mean(gaussian_generalization_error_list))

In [24]:

class MechanizedLogisticRegression(LogisticRegression):

    def __init__(self, penalty="l2", *, dual=False, tol=0.0001, C=1, 
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver="lbfgs", max_iter=100, multi_class="auto", 
                 verbose=0, warm_start=False, n_jobs=None, 
                 l1_ratio=None,
                 mechanism = Mechanism(Mechanism.MechanismType.NONE)):
        super(MechanizedLogisticRegression, self).__init__(penalty, dual=dual, 
                                                           tol = tol, C = C, fit_intercept = fit_intercept, 
                                                           intercept_scaling = intercept_scaling,
                                                           class_weight = class_weight, 
                                                           random_state = random_state, 
                                                           solver = solver, 
                                                           max_iter = max_iter, 
                                                           multi_class = multi_class, 
                                                           verbose = verbose, 
                                                           warm_start = warm_start, 
                                                           n_jobs = n_jobs, 
                                                           l1_ratio = l1_ratio)
        self.mechanism = mechanism

    def fit_data_split(self, x_train, y_train):
        size = len(x_train)
        hold_size, train_size = int(size  * (self.mechanism.hold_frac)), int(size  * (1.0 - self.mechanism.hold_frac))
        x_train, y_train, x_hold, y_hold = x_train[hold_size:], y_train[hold_size:], x_train[:hold_size], y_train[:hold_size]
        train_result = super(MechanizedLogisticRegression, self).fit(x_train, y_train)
        train_pred = train_result.predict(x_train)
        hold_result = super(MechanizedLogisticRegression, self).fit(x_hold, y_hold)
        hold_pred = hold_result.predict(x_hold)
        if abs(accuracy_score(train_pred, y_train) - accuracy_score(hold_pred, y_hold)) >= self.mechanism.noisy_thresh + np.random.laplace(0, 4 * self.mechanism.sigma):
            self.mechanism.noisy_thresh = self.mechanism.threshold + np.random.laplace(0, 2 * self.mechanism.sigma)
            x_noise =  np.random.laplace(0, 2 * self.mechanism.sigma, x_hold.shape)
            return super(MechanizedLogisticRegression, self).fit(x_hold + x_noise, y_hold)
        else:
            return train_result

    def fit_threshold(self, x_train, y_train):
        size = len(x_train)
        hold_size, train_size = int(size  * (self.mechanism.hold_frac)), int(size  * (1.0 - self.mechanism.hold_frac))
        x_train, y_train, x_hold, y_hold = x_train[hold_size:], y_train[hold_size:], x_train[:hold_size], y_train[:hold_size]
        train_result = super(MechanizedLogisticRegression, self).fit(x_train, y_train)
        train_pred = train_result.predict(x_train)
        hold_result = super(MechanizedLogisticRegression, self).fit(x_hold, y_hold)
        hold_pred = hold_result.predict(x_hold)
        if abs(accuracy_score(train_pred, y_train) - accuracy_score(hold_pred, y_hold)) >= self.mechanism.noisy_thresh + np.random.laplace(0, 4 * self.mechanism.sigma):
            self.mechanism.noisy_thresh = self.mechanism.threshold + np.random.laplace(0, 2 * self.mechanism.sigma)
            x_noise =  np.random.laplace(0, 2 * self.mechanism.sigma, x_hold.shape)
            return super(MechanizedLogisticRegression, self).fit(x_hold + x_noise, y_hold)
        else:
            return train_result


    def fit_gaussian(self, x_train, y_train):
        x_noise = np.random.normal(0, self.mechanism.sigma, x_train.shape) 
        noised_x = x_train + x_noise        
        
        ################ Gaussian Noise Added to Labels ################
        y_noise = np.random.normal(0, self.mechanism.sigma, y_train.shape) 
        noised_y = y_train + y_noise

        noised_y = (map(lambda x : 0 if x < 0 else 1, noised_y))
        print( y_train + y_noise, noised_y)


        result = super(MechanizedLogisticRegression, self).fit(noised_x, noised_y)
        if isinstance(result, LogisticRegression):
            return self
        else:
            return result



    def fit(self, x_train, y_train):
        if self.mechanism.mechanism_type ==  Mechanism.MechanismType.NONE:
            print("in Baseline Logistic Regression")
            result = super(MechanizedLogisticRegression, self).fit(x_train, y_train)
            if isinstance(result, LogisticRegression):
                return self
            else:
                return result
        elif self.mechanism.mechanism_type ==  Mechanism.MechanismType.GAUSSIAN:
            print("in Gaussian Mechanized Logistic Regression")
            return self.fit_gaussian(x_train, y_train)
        
        elif self.mechanism.mechanism_type ==  Mechanism.MechanismType.THRESHOLD:
            print("in Threshold Mechanized Logistic Regression")
            return self.fit_threshold(x_train, y_train)
        else:
            result = super(MechanizedLogisticRegression, self).fit(x_train, y_train)
            if isinstance(result, LogisticRegression):
                return self
            else:
                return result
           
    def choose_mechanism(self, mech):
        self.mechanism = mech

In [25]:
gaussian_generalization_error_list = [eval_const_rounds(round, Mechanism(mechanism_type = Mechanism.MechanismType.GAUSSIAN, sigma = 0.03), stepped_non_adaptive_num)
                                      for _ in range(100)]
print(np.mean(gaussian_generalization_error_list, axis = 0))


baseline_generalization_error_list = [eval_const_rounds(round, Mechanism(mechanism_type = Mechanism.MechanismType.NONE), stepped_non_adaptive_num)
                                      for _ in range(100)]

print(np.mean(baseline_generalization_error_list, axis = 0))



threshold_generalization_error_list = [eval_const_rounds(round, Mechanism(mechanism_type = Mechanism.MechanismType.THRESHOLD, sigma = 0.1, hold_frac = 0.7, threshold = 0.8), stepped_non_adaptive_num)
                                       for _ in range(100)]

print(threshold_generalization_error_list)
print(np.mean(threshold_generalization_error_list, axis = 0))

data_split_generalization_error_list = [eval_const_rounds(round, Mechanism(mechanism_type = Mechanism.MechanismType.DATASPLIT), stepped_non_adaptive_num)
                                        for _ in range(100)]
print(np.mean(data_split_generalization_error_list, axis = 0))


in Gaussian Mechanized Logistic Regression
[-1.11283209e-02  1.01340991e+00  9.72771926e-01  1.01026929e+00
 -1.77416899e-02 -3.93874303e-02  2.78072718e-02 -9.36532287e-03
  1.88443142e-02 -3.89553733e-03  6.43627306e-03 -4.25852500e-02
 -2.83779061e-02 -3.13361221e-02  2.73668044e-02 -1.75324766e-02
 -3.18778572e-02  9.89953440e-01  9.90800158e-01  1.80311251e-02
  1.04048547e+00  7.33315158e-02  9.74419738e-01  5.09845250e-02
  9.30166951e-01 -1.70295273e-02  1.01546813e+00  1.00572311e+00
  1.03332437e+00  6.60577845e-04  9.93589281e-01 -4.36127381e-03
 -4.77512271e-02  9.28881176e-01 -2.14466495e-04  2.22937136e-02
  9.39982460e-01  1.05556651e+00 -2.48163165e-02  9.93242129e-01
  9.07524176e-03  6.34377248e-02  4.73468270e-02  4.21363059e-02
  1.03576252e+00 -4.82956486e-02  9.76014671e-01  9.59106937e-01
  1.00809923e+00  1.89961825e-02  3.96658885e-02  3.87969197e-02
 -2.92738449e-02 -1.63973819e-02  1.46313034e-02  1.44262571e-02
  1.03968554e+00  9.90703034e-01  9.43636972e-0

ValueError: y should be a 1d array, got an array of shape () instead.

In [None]:
print(np.mean(baseline_generalization_error_list, axis = 0))
print(np.mean(gaussian_generalization_error_list, axis = 0))
print(np.mean(threshold_generalization_error_list, axis = 0))

print(np.mean(data_split_generalization_error_list, axis = 0))

In [None]:

def plot_error(rounds, generalization_error, mechanism, color = None):
    plt.plot(rounds, generalization_error, color, label = mechanism)
    plt.xlabel("Queries")
    plt.ylabel("RMSE (Generalization Error) for adaptive queries")
    plt.legend()
    plt.grid()


In [None]:
import math

plt.figure()
x_range = stepped_non_adaptive_num
plot_error(x_range, baseline_generalization_error_list, "Emripircal", 'g')
plot_error(x_range, gaussian_generalization_error_list, "Gaussian", 'r')
plot_error(x_range, threshold_generalization_error_list, "Threshold - Adaptfun", "y")
plt.savefig("../plots/c_adaptivity.png")
plt.show()


In [None]:
threshold_generalization_error_list = eval_const_rounds(round, Mechanism(mechanism_type = Mechanism.MechanismType.THRESHOLD, sigma = 0.1, hold_frac = 0.7, threshold = 0.8), stepped_non_adaptive_num)
