In [1]:
import numpy as np
import scipy as sp
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor

# YearPredictionMSD Dataset

In [28]:
# reading data
data = pd.read_csv("year_prediction.csv")
data = data.rename(index=str, columns={"label":"year"})

In [29]:
# separate input attributes and output into different dataframes
X = data.iloc[:,1:]
Y = data.iloc[:,0]

# Train set
X_train = X.iloc[0:463715,:]
y_train = Y.iloc[0:463715]

# Validation set
X_test = X.iloc[463715:,:]
y_test = Y.iloc[463715:]

In [30]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

# if we want to standarize the training data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [256]:
num_of_rep = 20

linearReg = Ridge(alpha=46371.5)

train_res = np.zeros(num_of_rep)
test_res = np.zeros(num_of_rep)

for t in range(num_of_rep):
    linearReg.fit(X_train, y_train)
    pred_train = linearReg.predict(X_train)
    total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)
    print(total_cost_train)

    pred_test = linearReg.predict(X_test)
    total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
    print(total_cost_test)
    
    train_res[t] = total_cost_train
    test_res[t] = total_cost_test
    
name_str1 = "./res/ridge/ridge-centralized-train.npy"
name_str2 = "./res/ridge/ridge-centralized-test.npy" 
np.save(name_str1, train_res)
np.save(name_str2, test_res)

91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518


In [182]:
def gd(X, y, lamb = 0.1, eta = 0.1, T = 1000, gamma = 0.99):
    num_data, d = X.shape
    y = y.reshape(-1,1)
    m = np.zeros((d,1))
    res = np.zeros((d,1))
    for t in range(T):
        #print(res)
        if t % 10000 == 0:
            print(t)
        grad = X.T @ X @ res / num_data - X.T @ y / num_data + lamb * res
        m = gamma * m + (1-gamma) * grad
        res = res - eta * m
    return res

In [150]:
res = gd(X_train_std, y_train)

pred_train = (X_train_std @ res).reshape(-1)
total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + np.sum((res.reshape(-1)) ** 2)
print(total_cost_train)

pred_test = (X_test_std @ res).reshape(-1)
total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
print(total_cost_test)

0
100
200
300
400
500
600
700
800
900
3993671.583046587
3993751.2675514827


In [221]:
# construct coreset
def coreset_lr(m, D, y):
    # m is the number of samples, X is the data
    num_of_data, _ = D.shape
    D = np.hstack((D,y.reshape(-1,1)))
    #s = np.sum(D ** 2, axis=1) + 1 / num_of_data
    D1 = D[:,:30]
    D2 = D[:,30:60]
    D3 = D[:,60:]
    q1, _ = np.linalg.qr(D1)
    q2, _ = np.linalg.qr(D2)
    q3, _ = np.linalg.qr(D3)
    Q = np.hstack((q1,q2,q3))
    
    s = np.sum(Q ** 2, axis=1) + 1/num_of_data
    #D = np.hstack((D, y.to_numpy().reshape(-1,1)))
    D = np.hstack((D, (1/s).reshape(-1,1)))
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False, weights=s)
    C = C.to_numpy()
    data = C[:,:-2]
    label = C[:,-2]
    weight = C[:,-1]
    weight = weight / np.sum(weight) * m * 10
    return data, label, weight

def uniform_lr(m,D,y):
    D = np.hstack((D, y.reshape(-1,1)))
    #D = np.hstack((D, y.to_numpy().reshape(-1,1)))
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False)
    C = C.to_numpy()
    data = C[:,:-1]
    label = C[:,-1]
    return data, label

In [250]:
num_of_rep = 20
size_list = [1000,2000,3000,4000,5000,6000]

linearReg = Ridge(alpha=46371.5)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X_train,y_train)
        linearReg.fit(coreset_X, coreset_y, weight)

        #coreset_X, coreset_y, _ = coreset_lr(5000,X_train,y_train)
        #linearReg.fit(coreset_X, coreset_y)

        pred_train = linearReg.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)

        pred_test = linearReg.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
    
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
    
    name_str1 = "./res/ridge/ridge-centralized-c"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge/ridge-centralized-c"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

In [257]:
num_of_rep = 20
size_list = [1200]

linearReg = Ridge(alpha=46371.5)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        uniform_X, uniform_y = uniform_lr(size,X_train,y_train)
        linearReg.fit(uniform_X, uniform_y)

        pred_train = linearReg.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)

        pred_test = linearReg.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
        
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
    
    name_str1 = "./res/ridge/ridge-centralized-u"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge/ridge-centralized-u"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

In [258]:
num_of_rep = 20
size_list = [1000]

new_lr = Ridge(alpha=46371.5,solver='saga',max_iter=10000,tol=1e-4)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    iter_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X_train,y_train)

        new_lr.fit(coreset_X, coreset_y, weight)

        pred_train = new_lr.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)

        pred_test = new_lr.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
        
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
        iter_res[t] = new_lr.n_iter_[0]

    name_str1 = "./res/ridge/ridge-saga-c"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge/ridge-saga-c"+str(size)+"s-test.npy" 
    name_str3 = "./res/ridge/ridge-saga-c"+str(size)+"s-iter.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)
    np.save(name_str3, iter_res)

In [261]:
num_of_rep = 20
size_list = [1000,2000,3000,4000,5000,6000]

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    iter_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):

        uniform_X, uniform_y = uniform_lr(size,X_train,y_train)

        new_lr = Ridge(alpha=46371.5,solver='saga',max_iter=10000,tol=8e-5)
        new_lr.fit(uniform_X, uniform_y)

        pred_train = new_lr.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)

        pred_test = new_lr.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630

        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
        iter_res[t] = new_lr.n_iter_[0]

    name_str1 = "./res/ridge/ridge-saga-u"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge/ridge-saga-u"+str(size)+"s-test.npy" 
    name_str3 = "./res/ridge/ridge-saga-u"+str(size)+"s-iter.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)
    np.save(name_str3, iter_res)



In [286]:
a = 1
for i in range(1000):
    a *= (500000-i) / 500000
print(a)

0.3680022108329327


# Different Regularizer

### Linear regression

In [27]:
import numpy as np
import scipy as sp
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

In [37]:
num_of_rep = 20

linearReg = Ridge(alpha=0)

train_res = np.zeros(num_of_rep)
test_res = np.zeros(num_of_rep)

for t in range(num_of_rep):
    linearReg.fit(X_train, y_train)
    pred_train = linearReg.predict(X_train)
    total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715
    print(total_cost_train)

    pred_test = linearReg.predict(X_test)
    total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
    print(total_cost_test)
    
    train_res[t] = total_cost_train
    test_res[t] = total_cost_test
    
name_str1 = "./res/lr/lr-centralized-train.npy"
name_str2 = "./res/lr/lr-centralized-test.npy" 
np.save(name_str1, train_res)
np.save(name_str2, test_res)

91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051
91.25642665611488
90.44315668226051


In [32]:
# construct coreset
def coreset_lr(m, D, y):
    # m is the number of samples, X is the data
    num_of_data, _ = D.shape
    D = np.hstack((D,y.reshape(-1,1)))
    #s = np.sum(D ** 2, axis=1) + 1 / num_of_data
    D1 = D[:,:30]
    D2 = D[:,30:60]
    D3 = D[:,60:]
    q1, _ = np.linalg.qr(D1)
    q2, _ = np.linalg.qr(D2)
    q3, _ = np.linalg.qr(D3)
    Q = np.hstack((q1,q2,q3))
    
    s = np.sum(Q ** 2, axis=1) + 1/num_of_data
    #D = np.hstack((D, y.to_numpy().reshape(-1,1)))
    D = np.hstack((D, (1/s).reshape(-1,1)))
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False, weights=s)
    C = C.to_numpy()
    data = C[:,:-2]
    label = C[:,-2]
    weight = C[:,-1]
    weight = weight / np.sum(weight) * m * 10
    return data, label, weight

def uniform_lr(m,D,y):
    D = np.hstack((D, y.reshape(-1,1)))
    #D = np.hstack((D, y.to_numpy().reshape(-1,1)))
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False)
    C = C.to_numpy()
    data = C[:,:-1]
    label = C[:,-1]
    return data, label

In [33]:
num_of_rep = 20
size_list = [1000,2000,3000,4000,5000,6000]

linearReg = Ridge(alpha=0)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X_train,y_train)
        linearReg.fit(coreset_X, coreset_y, weight)

        #coreset_X, coreset_y, _ = coreset_lr(5000,X_train,y_train)
        #linearReg.fit(coreset_X, coreset_y)

        pred_train = linearReg.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715

        pred_test = linearReg.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
    
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
    
    name_str1 = "./res/lr/lr-centralized-c"+str(size)+"s-train.npy"
    name_str2 = "./res/lr/lr-centralized-c"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

In [48]:
num_of_rep = 20
size_list = [1200]

linearReg = Ridge(alpha=0)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        uniform_X, uniform_y = uniform_lr(size,X_train,y_train)
        linearReg.fit(uniform_X, uniform_y)

        pred_train = linearReg.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715

        pred_test = linearReg.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
        
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
    
    name_str1 = "./res/lr/lr-centralized-u"+str(size)+"s-train.npy"
    name_str2 = "./res/lr/lr-centralized-u"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

In [35]:
num_of_rep = 20
size_list = [1000,2000,3000,4000,5000,6000]

new_lr = Ridge(alpha=0,solver='saga',max_iter=10000,tol=1e-4)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    iter_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X_train,y_train)

        new_lr.fit(coreset_X, coreset_y, weight)

        pred_train = new_lr.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715

        pred_test = new_lr.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
        
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
        iter_res[t] = new_lr.n_iter_[0]

    name_str1 = "./res/lr/lr-saga-c"+str(size)+"s-train.npy"
    name_str2 = "./res/lr/lr-saga-c"+str(size)+"s-test.npy" 
    name_str3 = "./res/lr/lr-saga-c"+str(size)+"s-iter.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)
    np.save(name_str3, iter_res)

In [46]:
num_of_rep = 20
size_list = [1200]

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    iter_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):

        uniform_X, uniform_y = uniform_lr(size,X_train,y_train)

        new_lr = Ridge(alpha=0,solver='saga',max_iter=10000,tol=8e-5)
        new_lr.fit(uniform_X, uniform_y)

        pred_train = new_lr.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715

        pred_test = new_lr.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630

        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
        iter_res[t] = new_lr.n_iter_[0]

    name_str1 = "./res/lr/lr-saga-u"+str(size)+"s-train.npy"
    name_str2 = "./res/lr/lr-saga-u"+str(size)+"s-test.npy" 
    name_str3 = "./res/lr/lr-saga-u"+str(size)+"s-iter.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)
    np.save(name_str3, iter_res)



### Lasso

In [60]:
num_of_rep = 20

linearReg = Lasso(alpha=1)

train_res = np.zeros(num_of_rep)
test_res = np.zeros(num_of_rep)

for t in range(num_of_rep):
    linearReg.fit(X_train, y_train)
    pred_train = linearReg.predict(X_train)
    total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 2 * np.sum(np.abs(linearReg.coef_))
    print(total_cost_train)

    pred_test = linearReg.predict(X_test)
    total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
    print(total_cost_test)
    
    train_res[t] = total_cost_train
    test_res[t] = total_cost_test
    
name_str1 = "./res/lasso/lasso-centralized-train.npy"
name_str2 = "./res/lasso/lasso-centralized-test.npy" 
np.save(name_str1, train_res)
np.save(name_str2, test_res)

94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575
94.38829104334056
90.84159784296575


In [61]:
num_of_rep = 20
size_list = [1000,2000,3000,4000,5000,6000]

linearReg = Lasso(alpha=1)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X_train,y_train)
        linearReg.fit(coreset_X, coreset_y, weight)

        #coreset_X, coreset_y, _ = coreset_lr(5000,X_train,y_train)
        #linearReg.fit(coreset_X, coreset_y)

        pred_train = linearReg.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 2 * np.sum(np.abs(linearReg.coef_))

        pred_test = linearReg.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
    
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
    
    name_str1 = "./res/lasso/lasso-centralized-c"+str(size)+"s-train.npy"
    name_str2 = "./res/lasso/lasso-centralized-c"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

In [66]:
num_of_rep = 20
size_list = [1200]

linearReg = Lasso(alpha=1)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        uniform_X, uniform_y = uniform_lr(size,X_train,y_train)
        linearReg.fit(uniform_X, uniform_y)

        pred_train = linearReg.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 2 * np.sum(np.abs(linearReg.coef_))

        pred_test = linearReg.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
        
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
    
    name_str1 = "./res/lasso/lasso-centralized-u"+str(size)+"s-train.npy"
    name_str2 = "./res/lasso/lasso-centralized-u"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

### Elastic Net

In [63]:
num_of_rep = 20

linearReg = ElasticNet(alpha=1)

train_res = np.zeros(num_of_rep)
test_res = np.zeros(num_of_rep)

for t in range(num_of_rep):
    linearReg.fit(X_train, y_train)
    pred_train = linearReg.predict(X_train)
    total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 2 * np.sum(np.abs(linearReg.coef_)) \
                                                                    + 1 * np.sum(linearReg.coef_ ** 2)
    print(total_cost_train)

    pred_test = linearReg.predict(X_test)
    total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
    print(total_cost_test)
    
    train_res[t] = total_cost_train
    test_res[t] = total_cost_test
    
name_str1 = "./res/en/en-centralized-train.npy"
name_str2 = "./res/en/en-centralized-test.npy" 
np.save(name_str1, train_res)
np.save(name_str2, test_res)

95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956
95.14443344206146
90.64876773050956


In [64]:
num_of_rep = 20
size_list = [1000,2000,3000,4000,5000,6000]

linearReg = ElasticNet(alpha=1)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X_train,y_train)
        linearReg.fit(coreset_X, coreset_y, weight)

        #coreset_X, coreset_y, _ = coreset_lr(5000,X_train,y_train)
        #linearReg.fit(coreset_X, coreset_y)

        pred_train = linearReg.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 2 * np.sum(np.abs(linearReg.coef_)) \
                                                                    + 1 * np.sum(linearReg.coef_ ** 2)

        pred_test = linearReg.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
    
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
    
    name_str1 = "./res/en/en-centralized-c"+str(size)+"s-train.npy"
    name_str2 = "./res/en/en-centralized-c"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

In [68]:
num_of_rep = 20
size_list = [1200]

linearReg = ElasticNet(alpha=1)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        uniform_X, uniform_y = uniform_lr(size,X_train,y_train)
        linearReg.fit(uniform_X, uniform_y)

        pred_train = linearReg.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715  + 2 * np.sum(np.abs(linearReg.coef_)) \
                                                                    + 1 * np.sum(linearReg.coef_ ** 2)

        pred_test = linearReg.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
        
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
    
    name_str1 = "./res/en/en-centralized-u"+str(size)+"s-train.npy"
    name_str2 = "./res/en/en-centralized-u"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

# Changing the number of parties

In [5]:
# construct coreset
def coreset_lr(m, D, y):
    # m is the number of samples, X is the data
    num_of_data, _ = D.shape
    D = np.hstack((D,y.reshape(-1,1)))
    #s = np.sum(D ** 2, axis=1) + 1 / num_of_data
    D1 = D[:,:18]
    D2 = D[:,18:36]
    D3 = D[:,36:54]
    D4 = D[:,54:72]
    D5 = D[:,72:]
    q1, _ = np.linalg.qr(D1)
    q2, _ = np.linalg.qr(D2)
    q3, _ = np.linalg.qr(D3)
    q4, _ = np.linalg.qr(D4)
    q5, _ = np.linalg.qr(D5)
    Q = np.hstack((q1,q2,q3,q4,q5))
    
    s = np.sum(Q ** 2, axis=1) + 1/num_of_data
    #D = np.hstack((D, y.to_numpy().reshape(-1,1)))
    D = np.hstack((D, (1/s).reshape(-1,1)))
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False, weights=s)
    C = C.to_numpy()
    data = C[:,:-2]
    label = C[:,-2]
    weight = C[:,-1]
    weight = weight / np.sum(weight) * m * 10
    return data, label, weight

def uniform_lr(m,D,y):
    D = np.hstack((D, y.reshape(-1,1)))
    #D = np.hstack((D, y.to_numpy().reshape(-1,1)))
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False)
    C = C.to_numpy()
    data = C[:,:-1]
    label = C[:,-1]
    return data, label

In [8]:
num_of_rep = 20
size_list = [1000,2000,3000,4000,5000,6000]

linearReg = Ridge(alpha=46371.5)

for size in size_list:
    print(size)
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X_train,y_train)
        linearReg.fit(coreset_X, coreset_y, weight)

        #coreset_X, coreset_y, _ = coreset_lr(5000,X_train,y_train)
        #linearReg.fit(coreset_X, coreset_y)

        pred_train = linearReg.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)

        pred_test = linearReg.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
        
        print("train: %f, test: %f" % (total_cost_train, total_cost_test))
    
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
    
    name_str1 = "./res/ridge/ridge-centralized-c"+str(size)+"s5p-train.npy"
    name_str2 = "./res/ridge/ridge-centralized-c"+str(size)+"s5p-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

1000
train: 100.602592, test: 100.168502
train: 102.726290, test: 101.746027
train: 100.190831, test: 100.581683
train: 100.925149, test: 101.519641
train: 98.715489, test: 98.610835
train: 103.211397, test: 103.100215
train: 100.853000, test: 98.939176
train: 99.922587, test: 100.221603
train: 100.125683, test: 99.041462
train: 99.121707, test: 98.863230
train: 99.611240, test: 100.109695
train: 100.799495, test: 100.126592
train: 102.718951, test: 101.844370
train: 101.203837, test: 100.196024
train: 98.621730, test: 97.915289
train: 99.226060, test: 98.161589
train: 100.599269, test: 100.321632
train: 98.939823, test: 99.000881
train: 98.276901, test: 98.203365
train: 102.457646, test: 101.529481
2000
train: 96.181189, test: 95.250864
train: 97.073648, test: 95.965735
train: 94.433816, test: 93.889145
train: 96.153451, test: 94.935709
train: 96.149676, test: 94.856944
train: 95.539058, test: 94.844299
train: 96.135047, test: 95.983307
train: 96.177512, test: 95.148110
train: 96.5107

In [9]:
num_of_rep = 20
size_list = [1000,2000,3000,4000,5000,6000]

new_lr = Ridge(alpha=46371.5,solver='saga',max_iter=10000,tol=1e-4)

for size in size_list:
    print(size)
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    iter_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X_train,y_train)

        new_lr.fit(coreset_X, coreset_y, weight)

        pred_train = new_lr.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)

        pred_test = new_lr.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
        
        print("train: %f, test: %f" % (total_cost_train, total_cost_test))
        
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
        iter_res[t] = new_lr.n_iter_[0]

    name_str1 = "./res/ridge/ridge-saga-c"+str(size)+"s5p-train.npy"
    name_str2 = "./res/ridge/ridge-saga-c"+str(size)+"s5p-test.npy" 
    name_str3 = "./res/ridge/ridge-saga-c"+str(size)+"s5p-iter.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)
    np.save(name_str3, iter_res)

1000
train: 102.895775, test: 101.610343
train: 110.382416, test: 108.785491
train: 104.273001, test: 103.116966
train: 105.263800, test: 104.462326
train: 107.954736, test: 106.198853
train: 105.368100, test: 104.037589
train: 108.937724, test: 108.240414
train: 102.118971, test: 101.552341
train: 108.114013, test: 107.284371
train: 108.954266, test: 107.398835
train: 104.983395, test: 102.859062
train: 106.404377, test: 104.507769
train: 104.494461, test: 103.815747
train: 108.854946, test: 107.712723
train: 107.918470, test: 107.243497
train: 107.559854, test: 106.260797
train: 109.957597, test: 108.672637
train: 104.023270, test: 104.322798
train: 112.069245, test: 111.101509
train: 106.111600, test: 104.517786
2000
train: 101.459941, test: 100.618179
train: 101.534472, test: 100.935160
train: 99.538893, test: 98.629866
train: 98.653921, test: 98.129853
train: 101.763404, test: 101.450749
train: 100.048763, test: 98.355807
train: 99.401724, test: 98.172638
train: 103.842711, test: 

# Ridge Regression on KC House Data

In [2]:
# reading data
data = pd.read_csv("kc_house_data.csv")

data.describe()

y = data.iloc[:,2].to_numpy()
X = data.iloc[:,3:].to_numpy()

num_of_data, _ = X.shape
print(X.shape)

print(y[:10])

(21613, 18)
[ 221900.  538000.  180000.  604000.  510000. 1225000.  257500.  291850.
  229500.  323000.]


In [12]:
num_of_rep = 20

linearReg = Ridge(alpha=0)

train_res = np.zeros(num_of_rep)

for t in range(num_of_rep):
    linearReg.fit(X, y)
    pred_train = linearReg.predict(X)
    total_cost_train = np.sum((pred_train - y) ** 2) / 21613
    print(np.sqrt(total_cost_train))
    
    train_res[t] = total_cost_train
    print(linearReg.coef_)
    
name_str1 = "./res/ridge-kc/ridge-centralized-train.npy"
np.save(name_str1, train_res)

print("saga")

train_res = np.zeros(num_of_rep)
iter_res = np.zeros(num_of_rep)

saga_lr = Ridge(alpha=0,solver='saga',max_iter=10,tol=1e-6)
for t in range(num_of_rep):
    saga_lr.fit(X, y)
    pred_train = linearReg.predict(X)
    total_cost_train = np.sum((pred_train - y) ** 2) / 21613
    print(np.sqrt(total_cost_train))
    
    train_res[t] = total_cost_train
    print(linearReg.coef_)
    
    iter_res[t] = saga_lr.n_iter_[0]
    
name_str1 = "./res/ridge-kc/ridge-saga-train.npy"
name_str2 = "./res/ridge-kc/ridge-saga-iter.npy" 
np.save(name_str1, train_res)
np.save(name_str2, iter_res)

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=

201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.114856



201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.114856



201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.114856



201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.114856



201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]
201163.90222277155
[-3.57665414e+04  4.11442785e+04  2.11485657e+02  1.28597869e-01
  6.68955012e+03  5.82960458e+05  5.28709424e+04  2.63856491e+04
  9.58904452e+04 -3.02575684e+01 -6.13851528e+01 -2.62022321e+03
  1.98125837e+01 -5.82419866e+02  6.02748226e+05 -2.14729828e+05
  2.16814005e+01 -3.82641850e-01]




In [18]:
# construct coreset
def coreset_lr(m, D, y):
    # m is the number of samples, X is the data
    num_of_data, _ = D.shape
    D = np.hstack((D,y.reshape(-1,1)))
    #s = np.sum(D ** 2, axis=1) + 1 / num_of_data
    D1 = D[:,:10]
    D2 = D[:,10:]
    q1, _ = np.linalg.qr(D1)
    q2, _ = np.linalg.qr(D2)
    Q = np.hstack((q1,q2))
    
    s = np.sum(Q ** 2, axis=1) + 1/num_of_data
    #D = np.hstack((D, y.to_numpy().reshape(-1,1)))
    D = np.hstack((D, (1/s).reshape(-1,1)))
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False, weights=s)
    C = C.to_numpy()
    data = C[:,:-2]
    label = C[:,-2]
    weight = C[:,-1]
    weight = weight / np.sum(weight) * m * 10
    return data, label, weight

def uniform_lr(m,D,y):
    D = np.hstack((D, y.reshape(-1,1)))
    #D = np.hstack((D, y.to_numpy().reshape(-1,1)))
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False)
    C = C.to_numpy()
    data = C[:,:-1]
    label = C[:,-1]
    return data, label

In [5]:
num_of_rep = 20
size_list = [100,200,300,400,500,600]

linearReg = Ridge(alpha=0)

for size in size_list:
    print("coreset size %d" % size)
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X,y)
        linearReg.fit(coreset_X, coreset_y, weight)

        #coreset_X, coreset_y, _ = coreset_lr(5000,X_train,y_train)
        #linearReg.fit(coreset_X, coreset_y)

        pred_train = linearReg.predict(X)
        total_cost_train = np.sum((pred_train - y) ** 2) / 21613
        print(np.sqrt(total_cost_train))
    
        train_res[t] = total_cost_train
    
    name_str1 = "./res/ridge-kc/ridge-centralized-c"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge-kc/ridge-centralized-c"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

coreset size 100
221643.44142524916
217103.78929795086
257638.15216622423
213410.36542928076
234552.10976212317
215558.89474536703
364529.74499437265
224237.43873772313
216494.83310207346
337349.75950224046
211562.9123392235
219869.63115757573
226528.9895441757
436760.121672816


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


213788.20559013216
212171.92319213037
218397.44307342975
210635.22294843823
215139.80703984437
214045.9742091725
coreset size 200
217303.61328352802
207393.59496934843
211038.1377542372
210125.50518242441
217528.38447596921
266649.15921961743
223680.42455081202


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


209370.35146286778
628725.2554260502
206955.0826316451
211530.66995386258
206774.19876412934
211113.41431679862
205315.3033831794
218738.00665800212
240609.31327816859
392833.57682881143
208257.6016723063
216482.36314138217
209690.0619763063
coreset size 300
207373.63460605594
205349.5496049335


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


205291.24836460143
235654.2969345817
205663.90370386004
647570.8339766387
292108.11189939047
206864.79714840767
208217.5726376269
230841.47607861506
263132.10089874617
274509.0327148769
204408.66581127926
206232.40958632657
212486.72806228462
204012.96183187416


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


203310.24355836512
241953.77834764565
209311.897834381
219713.10316764796
coreset size 400
208086.67604068122
202914.6309722977
231465.10905511733
219180.11803308662
203878.25081683722
204972.53689955274
208962.87146751722
207577.3954263075
204273.14832967098
204297.9647802523
205492.84544351752
233417.84503849296


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


210770.73862765593
205967.8461561372
218241.8989281865
205499.67510681524
208877.186377993
220124.73880417127
205130.33148619757
208296.32152364572
coreset size 500
203834.14449295634
203536.1174332792
263114.5260207034
202489.04880066894
212568.94802026483
204615.1512295914


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


203749.4841672589
204372.38714693498
204944.45138143413
204318.28109522435
206796.75335269177
203981.3188584048
204318.8273445287
204056.1573645538
204508.52986783572
224107.20531328177
202997.41330925806
203421.2993615102
218034.35375024943
205102.69715853865


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


coreset size 600
211089.50196508298
211200.2502172337
694028.8515175155
203011.98251653646
205940.3184476533
204400.5832619917
202394.0876617639
203511.25957581954
203500.83484170868
203823.6084886296
275111.24130047834
202422.35206173526
244776.38954170467
203782.51264145935
202748.55367349912
203573.50551075517
203359.35376220965
203636.43496479277
203688.72345424732
203199.25876634396


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


In [12]:
num_of_rep = 20
size_list = [100,200,300,400,500,600]

linearReg = Ridge(alpha=0)

for size in size_list:
    print("uniform size %d" % size)
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        uniform_X, uniform_y = uniform_lr(size,X,y)
        linearReg.fit(uniform_X, uniform_y)

        pred_train = linearReg.predict(X)
        total_cost_train = np.sum((pred_train - y) ** 2) / 21613
        print(np.sqrt(total_cost_train))
        
        train_res[t] = total_cost_train
    
    name_str1 = "./res/ridge-kc/ridge-centralized-u"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge-kc/ridge-centralized-u"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

uniform size 100
567709.8982827796
4.3490563406740966e+17
7.805914902076826e+16
1.3110582501880712e+17
265499.4699381084
1.1075525962936413e+17
2.8197699333618636e+16
1.3020884407414656e+17
239042.66637366105
8.569420239960059e+16
1.6972529118035373e+17
3.822847938526699e+17
6.423210593594314e+17
250594.8846000923
1794016.0377770637
1.0991022521738371e+17
224704.96709866132
2.921570963603242e+17
417526.8799298856
230226.57705875763
uniform size 200
288927.82454076817
3.2984960548275604e+16
222994.8677515407
255694.90582906659
337525.4682838815
417516.96555332333
3.039110112267278e+16
219067.49351884032
3.630293737856239e+16
1.1263691665613272e+17
234551.17102216266
217956.09312821712
212996.7533036935
218062.48898356486
868224.5908467099
6.698002728432956e+16
497665.460010742
234593.57054880116
1.4989947291776864e+16
1.6091002715775404e+16
uniform size 300
214805.5453463354
436390.6091616176
208405.04498370204
228559.28583916134
246143.43115364725
220256.67794376702
206978.13926467372


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=

213064.17905999464
242096.2271507037
213359.12541697075
208397.27483995425
210981.4671253441
226483.6311503213
216113.764354039
207513.799337847
210662.37981168253
211593.89663429183
210299.0756500316
211304.46094566907
295033.53167524526
1636502.0686004967
210395.79055341735
213324.97457972536
221701.92754539681
1.2974227077193846e+16
uniform size 500
229276.65865322566
209780.54107739494
210272.7715322208
229943.69814423684
208089.27875072564
205917.52451904013
207930.49356550653
205482.18581484313
206319.9093682521
220312.09701017945
465055.0700867199
205731.6779558191
209529.57944874046
211056.66908453632
211057.50805421476
294881.3344131439
204879.53725716876
313261.9105326377
214581.9674627714
314128.4527535466
uniform size 600
208792.6261797574
208653.77476384322
207475.1650424191
203981.5263391046
221179.5378652096
337156.98757210746
227558.93136924307
208323.75070505906
259949.12654628896
216539.84799633667
209358.72798258901
250386.72603311072
243202.26394557476
231207.306808

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T
  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=

In [25]:
num_of_rep = 20
size_list = [100,200,300,400,500,600]

new_lr = Ridge(alpha=0,solver='saga',max_iter=3000,tol=1e-6)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    iter_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X,y)
        
        new_lr.fit(coreset_X, coreset_y, weight)

        pred_train = new_lr.predict(X)
        total_cost_train = np.sum((pred_train - y) ** 2) / 21613

        train_res[t] = total_cost_train
        iter_res[t] = new_lr.n_iter_[0]


    name_str1 = "./res/ridge-kc/ridge-saga-c"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge-kc/ridge-saga-c"+str(size)+"s-test.npy" 
    name_str3 = "./res/ridge-kc/ridge-saga-c"+str(size)+"s-iter.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)
    np.save(name_str3, iter_res)







In [26]:
num_of_rep = 20
size_list = [100,200,300,400,500,600]

new_lr = Ridge(alpha=0,solver='saga',max_iter=3000,tol=1e-6)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    iter_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        uniform_X, uniform_y = uniform_lr(size,X,y)

        new_lr.fit(uniform_X, uniform_y)

        pred_train = new_lr.predict(X)
        total_cost_train = np.sum((pred_train - y) ** 2) / 21613

        
        train_res[t] = total_cost_train
        iter_res[t] = new_lr.n_iter_[0]

    name_str1 = "./res/ridge-kc/ridge-saga-u"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge-kc/ridge-saga-u"+str(size)+"s-test.npy" 
    name_str3 = "./res/ridge-kc/ridge-saga-u"+str(size)+"s-iter.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)
    np.save(name_str3, iter_res)





