In [107]:
import numpy as np
import scipy as sp
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor

In [108]:
# reading data
data = pd.read_csv("year_prediction.csv")
data = data.rename(index=str, columns={"label":"year"})

In [111]:
# separate input attributes and output into different dataframes
X = data.iloc[:,1:]
Y = data.iloc[:,0]

# Train set
X_train = X.iloc[0:463715,:]
y_train = Y.iloc[0:463715]

# Validation set
X_test = X.iloc[463715:,:]
y_test = Y.iloc[463715:]

In [112]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

# if we want to standarize the training data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [256]:
num_of_rep = 20

linearReg = Ridge(alpha=46371.5)

train_res = np.zeros(num_of_rep)
test_res = np.zeros(num_of_rep)

for t in range(num_of_rep):
    linearReg.fit(X_train, y_train)
    pred_train = linearReg.predict(X_train)
    total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)
    print(total_cost_train)

    pred_test = linearReg.predict(X_test)
    total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
    print(total_cost_test)
    
    train_res[t] = total_cost_train
    test_res[t] = total_cost_test
    
name_str1 = "./res/ridge/ridge-centralized-train.npy"
name_str2 = "./res/ridge/ridge-centralized-test.npy" 
np.save(name_str1, train_res)
np.save(name_str2, test_res)

91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518
91.34207242375507
90.44726168835518


In [182]:
def gd(X, y, lamb = 0.1, eta = 0.1, T = 1000, gamma = 0.99):
    num_data, d = X.shape
    y = y.reshape(-1,1)
    m = np.zeros((d,1))
    res = np.zeros((d,1))
    for t in range(T):
        #print(res)
        if t % 10000 == 0:
            print(t)
        grad = X.T @ X @ res / num_data - X.T @ y / num_data + lamb * res
        m = gamma * m + (1-gamma) * grad
        res = res - eta * m
    return res

In [150]:
res = gd(X_train_std, y_train)

pred_train = (X_train_std @ res).reshape(-1)
total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + np.sum((res.reshape(-1)) ** 2)
print(total_cost_train)

pred_test = (X_test_std @ res).reshape(-1)
total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
print(total_cost_test)

0
100
200
300
400
500
600
700
800
900
3993671.583046587
3993751.2675514827


In [221]:
# construct coreset
def coreset_lr(m, D, y):
    # m is the number of samples, X is the data
    num_of_data, _ = D.shape
    D = np.hstack((D,y.reshape(-1,1)))
    #s = np.sum(D ** 2, axis=1) + 1 / num_of_data
    D1 = D[:,:30]
    D2 = D[:,30:60]
    D3 = D[:,60:]
    q1, _ = np.linalg.qr(D1)
    q2, _ = np.linalg.qr(D2)
    q3, _ = np.linalg.qr(D2)
    Q = np.hstack((q1,q2,q3))
    
    s = np.sum(Q ** 2, axis=1) + 1/num_of_data
    #D = np.hstack((D, y.to_numpy().reshape(-1,1)))
    D = np.hstack((D, (1/s).reshape(-1,1)))
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False, weights=s)
    C = C.to_numpy()
    data = C[:,:-2]
    label = C[:,-2]
    weight = C[:,-1]
    weight = weight / np.sum(weight) * m * 10
    return data, label, weight

def uniform_lr(m,D,y):
    D = np.hstack((D, y.reshape(-1,1)))
    #D = np.hstack((D, y.to_numpy().reshape(-1,1)))
    D_df = pd.DataFrame(D)
    C = D_df.sample(n=m, replace=False)
    C = C.to_numpy()
    data = C[:,:-1]
    label = C[:,-1]
    return data, label

In [250]:
num_of_rep = 20
size_list = [1000,2000,3000,4000,5000,6000]

linearReg = Ridge(alpha=46371.5)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X_train,y_train)
        linearReg.fit(coreset_X, coreset_y, weight)

        #coreset_X, coreset_y, _ = coreset_lr(5000,X_train,y_train)
        #linearReg.fit(coreset_X, coreset_y)

        pred_train = linearReg.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)

        pred_test = linearReg.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
    
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
    
    name_str1 = "./res/ridge/ridge-centralized-c"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge/ridge-centralized-c"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

In [257]:
num_of_rep = 20
size_list = [1200]

linearReg = Ridge(alpha=46371.5)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        uniform_X, uniform_y = uniform_lr(size,X_train,y_train)
        linearReg.fit(uniform_X, uniform_y)

        pred_train = linearReg.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)

        pred_test = linearReg.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
        
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
    
    name_str1 = "./res/ridge/ridge-centralized-u"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge/ridge-centralized-u"+str(size)+"s-test.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)

In [258]:
num_of_rep = 20
size_list = [1000]

new_lr = Ridge(alpha=46371.5,solver='saga',max_iter=10000,tol=1e-4)

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    iter_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):
        coreset_X, coreset_y, weight = coreset_lr(size,X_train,y_train)

        new_lr.fit(coreset_X, coreset_y, weight)

        pred_train = new_lr.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)

        pred_test = new_lr.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630
        
        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
        iter_res[t] = new_lr.n_iter_[0]

    name_str1 = "./res/ridge/ridge-saga-c"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge/ridge-saga-c"+str(size)+"s-test.npy" 
    name_str3 = "./res/ridge/ridge-saga-c"+str(size)+"s-iter.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)
    np.save(name_str3, iter_res)

In [261]:
num_of_rep = 20
size_list = [1000,2000,3000,4000,5000,6000]

for size in size_list:
    train_res = np.zeros(num_of_rep)
    test_res = np.zeros(num_of_rep)
    iter_res = np.zeros(num_of_rep)
    for t in range(num_of_rep):

        uniform_X, uniform_y = uniform_lr(size,X_train,y_train)

        new_lr = Ridge(alpha=46371.5,solver='saga',max_iter=10000,tol=8e-5)
        new_lr.fit(uniform_X, uniform_y)

        pred_train = new_lr.predict(X_train)
        total_cost_train = np.sum((pred_train - y_train) ** 2) / 463715 + 0.1 * np.sum((linearReg.coef_) ** 2)

        pred_test = new_lr.predict(X_test)
        total_cost_test = np.sum((pred_test - y_test) ** 2) / 51630

        train_res[t] = total_cost_train
        test_res[t] = total_cost_test
        iter_res[t] = new_lr.n_iter_[0]

    name_str1 = "./res/ridge/ridge-saga-u"+str(size)+"s-train.npy"
    name_str2 = "./res/ridge/ridge-saga-u"+str(size)+"s-test.npy" 
    name_str3 = "./res/ridge/ridge-saga-u"+str(size)+"s-iter.npy" 
    np.save(name_str1, train_res)
    np.save(name_str2, test_res)
    np.save(name_str3, iter_res)



In [286]:
a = 1
for i in range(1000):
    a *= (500000-i) / 500000
print(a)

0.3680022108329327
