# PCA and polynomial experimentation

In [None]:
# standard libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# own functions
from proj1_helpers import *
import basic_functions as bf
import cross_validation as cv
#import plot_functions as pf 

# constants: paths of the training and test CSV files handed to load_csv_data
train_path = 'train.csv'
test_path = 'test.csv'

In [None]:
#w_ridge, rmse_ridge=      bf.ridge_regression(y_tr, x_tr, lambda_)

#loss_reglogreg, w_reglogreg, losses_reglogreg=    bf.reg_logistic_regression(y_tr, tx_tr, lambda_, initial_w, max_iters, gamma)


#*args = hyperparameters 

def cross_validation(function_to_run, y, x, num_of_k_fold, *args):
    """
    Run k-fold cross-validation for one of the supported fitting functions.

    The function is dispatched on ``function_to_run.__name__``. Supported
    names and the hyperparameters expected in ``*args``:

        reg_logistic_regression: lambda_, initial_w, max_iters, gamma
        ridge_regression:        lambda_
        least_squares:           (none)

    For every fold the model is fitted on the training part of the split
    only, and the prediction score is computed on the held-out part only.

    Parameters:
        function_to_run: the fitting function (e.g. bf.ridge_regression)
        y:               labels
        x:               feature matrix
        num_of_k_fold:   number of folds
        *args:           hyperparameters, see above

    Returns:
        avg_loss:          mean of ``losses``
        losses:            per-fold loss as reported by the fitting function
        avg_acc:           mean of ``pred_acc_percents``
        pred_acc_percents: per-fold first return value of bf.log_pred_acc,
                           evaluated on the held-out fold

    Raises:
        ValueError: if ``function_to_run`` has an unsupported name.
    """
    method = function_to_run.__name__
    supported = ("reg_logistic_regression", "ridge_regression", "least_squares")
    if method not in supported:
        # Fail early with a clear message instead of hitting an unbound
        # `loss` after the first fold.
        raise ValueError(
            "cross_validation does not support '%s'; expected one of %s"
            % (method, ", ".join(supported))
        )

    losses = []
    pred_acc_percents = []

    # Third argument is passed through unchanged (presumably the shuffle seed
    # — TODO confirm against cv.build_k_indices).
    k_indices = cv.build_k_indices(y, num_of_k_fold, 1)

    for k in range(num_of_k_fold):

        x_test, y_test, x_tr, y_tr = cv.split_k(x, y, k_indices, k)

        if method == "reg_logistic_regression":
            lambda_, initial_w, max_iters, gamma = args[:4]
            # The third return value must not be bound to `losses`, otherwise
            # it would replace the per-fold accumulator above.
            loss, weights, _ = function_to_run(
                y_tr, x_tr, lambda_, initial_w, max_iters, gamma
            )
        elif method == "ridge_regression":
            lambda_ = args[0]
            # Fit on the training part of this fold only.
            weights, loss = function_to_run(y_tr, x_tr, lambda_)
        else:  # least_squares
            loss, weights = function_to_run(np.transpose(y_tr), x_tr)

        losses.append(loss)

        # Score on the held-out fold so the result is a validation score,
        # not a score on data the model has already seen.
        pred_y = bf.log_pred(x_test, weights)
        pred_acc_percent, _ = bf.log_pred_acc(y_test, pred_y)
        pred_acc_percents.append(pred_acc_percent)

    avg_loss = sum(losses) / len(losses)
    avg_acc = sum(pred_acc_percents) / len(pred_acc_percents)

    return avg_loss, losses, avg_acc, pred_acc_percents


# Hyperparameters for trying out cross_validation.
# NOTE(review): in file order `x` is first assigned in a later cell (load_csv_data),
# and with all three cross_validation calls below commented out this cell never
# assigns avg_loss / losses / avg_preds / pred_acc_percents. Unless those names
# already exist in the kernel from an earlier run, this cell raises NameError —
# confirm the intended execution order.
lambda_=0.000000000001
initial_w = np.zeros((x.shape[1], 1))
max_iters = 5
gamma = 0.01
k_folds = 5

#avg_loss, losses, avg_preds, pred_acc_percents = cross_validation(bf.ridge_regression, y, phi, k_folds, lambda_)
#avg_loss, losses, avg_preds, pred_acc_percents = cross_validation(bf.least_squares, y, x, k_folds)
#avg_loss, losses, avg_preds, pred_acc_percents = cross_validation(bf.reg_logistic_regression, y, phi, k_folds, lambda_, initial_w, max_iters, gamma)

# Report the averaged and the per-fold results returned by cross_validation.
print("avg loss: ", avg_loss)
print("Losses: ", losses)
print("Average prediction NON-accuracy: ", avg_preds)
print("prediction NON-accuracy percents: ", pred_acc_percents)

In [None]:
def pca(data, pc_count=None):
    """
    Principal component analysis via eigendecomposition of the covariance
    matrix of the standardised data.

    note: this mean-centers and auto-scales ``data`` IN PLACE, so pass a copy
    if the original values are still needed. Because of the in-place
    arithmetic, ``data`` must be a 2-D floating point array.

    Parameters:
        data:     (samples, features) float array; modified in place
        pc_count: number of leading components to keep; None keeps all

    Returns:
        U: data projected onto the kept components, shape (samples, kept)
        E: eigenvalues of the kept components, largest first
        V: matching eigenvectors as columns, shape (features, kept)
    """
    data -= np.mean(data, 0)
    spread = np.std(data, 0)
    # A constant feature has zero spread; dividing by it would turn the whole
    # column (and then the covariance matrix) into NaN. Dividing by 1 instead
    # leaves such a column at exactly zero after centering.
    spread = np.where(spread == 0, 1.0, spread)
    data /= spread

    # Covariance matrix, valid because the data is mean-centered.
    # Normalised by N here (numpy's `cov` would use N-1).
    C = np.dot(data.T, data) / data.shape[0]
    E, V = np.linalg.eigh(C)
    # eigh returns eigenvalues in ascending order; keep the largest first.
    key = np.argsort(E)[::-1][:pc_count]
    E, V = E[key], V[:, key]
    U = np.dot(data, V)  # used to be dot(V.T, data.T).T
    return U, E, V

In [None]:
# Importing data with sub_sample=True for both the training and the test file:
y, x, ids = load_csv_data(train_path, sub_sample=True) #remember to switch off subsample when running it "for real"
pred_y, pred_x, pred_ids = load_csv_data(test_path, sub_sample=True)

In [None]:
# Arguments passed to bf.split_data in the degree/lambda sweep
# (ratio is presumably the training fraction — TODO confirm against bf.split_data).
seed = 1
ratio = 0.90

In [None]:
# Sweep over polynomial degree and ridge penalty, recording train and test RMSE.
# Work on a copy because pca() standardises its argument in place.
data = np.copy(x)
degrees=(1,2,3,4,5,6)#,7,8,9,10,11,12,13,14)
lambdas=np.logspace(-8,3,11)  # 11 values from 1e-8 to 1e3
# One row per degree, one column per lambda.
rmses_tr=np.zeros((len(degrees),len(lambdas)))
rmses_te=np.zeros((len(degrees),len(lambdas)))

# Using PCA on the data before split so that the train and testset get the same eigenvectors
data = pca(data, 30)[0]

for d, degree in enumerate(degrees): 
    
    # building the polynomial feature expansion for this degree
    phi=bf.build_poly(data, degree)
    print(phi.shape)
    
    # splitting data (same ratio and seed for every degree)
    x_tr, x_te, y_tr, y_te = bf.split_data(phi, y, ratio, seed)
    
    #Normalizing data
    #x_tr=bf.normalize(x_tr)
    #x_te=bf.normalize(x_te)
    
    for l, lambda_ in enumerate(lambdas):
        # performing ridge regression; the second return value is stored as the train RMSE
        w, rmse_tr=bf.ridge_regression(y_tr, x_tr, lambda_)
        rmses_tr[d,l]=rmse_tr
        # test RMSE as sqrt(2 * mse) (presumably bf.get_mse carries a 1/2 factor — TODO confirm)
        rmse_te=np.sqrt(2*bf.get_mse(y_te, x_te, w))
        rmses_te[d,l]=rmse_te

In [None]:
# Training RMSE against polynomial degree, one curve for each of the
# lambda columns 0-9 of rmses_tr.
train_curve_colors = ('b', 'r', 'g', 'm', 'y', 'b', 'r', 'g', 'm', 'm')
for curve_idx, curve_color in enumerate(train_curve_colors):
    plt.plot(
        degrees,
        rmses_tr[:, curve_idx],
        color=curve_color,
        marker='*',
        label="Train error " + str(curve_idx + 1),
    )
leg = plt.legend(loc=1, shadow=True)

In [None]:
# Test RMSE against polynomial degree, one curve for each of the
# lambda columns 0-9 of rmses_te.
test_curve_colors = ('b', 'r', 'g', 'm', 'y', 'b', 'r', 'g', 'm', 'm')
for curve_idx, curve_color in enumerate(test_curve_colors):
    plt.plot(
        degrees,
        rmses_te[:, curve_idx],
        color=curve_color,
        marker='*',
        label="Test error " + str(curve_idx + 1),
    )
leg = plt.legend(loc=1, shadow=True)

In [None]:
# Test vs. train RMSE against polynomial degree for the first lambda column only.
plt.plot(degrees,rmses_te[:,0],color='b', marker='*', label="Test error 1")
plt.plot(degrees,rmses_tr[:,0],color='r', marker='*', label="Train error 1")
leg = plt.legend(loc=1, shadow=True)

In [None]:
# Train (all blue) and test (one colour each) RMSE against polynomial degree
# for lambda columns 0-4.
for curve_idx in range(5):
    plt.plot(
        degrees,
        rmses_tr[:, curve_idx],
        color='b',
        marker='*',
        label="Train error " + str(curve_idx + 1),
    )
for curve_idx, curve_color in enumerate(('r', 'y', 'g', 'm', 'black')):
    plt.plot(
        degrees,
        rmses_te[:, curve_idx],
        color=curve_color,
        marker='*',
        label="Test error " + str(curve_idx + 1),
    )
leg = plt.legend(loc=1, shadow=True)

In [None]:
# Train and test RMSE against polynomial degree for lambda columns 5-9.
for curve_idx, curve_color in enumerate(('grey', 'b', 'b', 'b', 'b'), start=5):
    plt.plot(
        degrees,
        rmses_tr[:, curve_idx],
        color=curve_color,
        marker='*',
        label="Train error " + str(curve_idx + 1),
    )
for curve_idx, curve_color in enumerate(('r', 'g', 'y', 'm', 'black'), start=5):
    plt.plot(
        degrees,
        rmses_te[:, curve_idx],
        color=curve_color,
        marker='*',
        label="Test error " + str(curve_idx + 1),
    )
leg = plt.legend(loc=1, shadow=True)

In [None]:
# Fit a single ridge model with lambdas[4] on the whole training set.
# NOTE(review): phi (degree-5 expansion of x) is built here but not used in this
# cell — the fit below runs on the normalised raw x. Confirm whether x_tr was
# meant to be phi.
phi=bf.build_poly(x, 5)
#splitting data
#x_tr, x_te, y_tr, y_te = bf.split_data(phi, y, ratio, seed)
x_tr=x
y_tr=y
#Normalizing data
x_tr=bf.normalize(x_tr)
#x_te=bf.normalize(x_te)
w, rmse_tr=bf.ridge_regression(y_tr, x_tr, lambdas[4])
# keep the train RMSE of the chosen setting under its own name
rmse_tr_cho=rmse_tr
#rmse_te_cho=np.sqrt(2*bf.get_mse(y_te, x_te, w))


In [None]:
# Train RMSE of the single ridge fit with lambdas[4]
print(rmse_tr_cho)

In [None]:
# Predict labels for the data the model was fitted on and compare them with y_tr.
y_pred = predict_labels(w, x_tr)
# show predictions at indices 1..24 (index 0 is not printed)
print(y_pred[1:25])
# p, c: the two values returned by bf.log_pred_acc (the first is treated as a
# percentage elsewhere in this notebook — meaning of the second not visible here)
p,c=bf.log_pred_acc(y_tr,y_pred)
print(p,c)

## Do over for delivery

In [None]:
"""
Doing PCA on all of train.csv and test.csv together, because PCA doesn't need labels.
Note: this does not follow the machine learning honor code, since the test data
influences the features the model is trained on.
"""
# Importing the full data (sub_sample=False) for the final run:
y, x, ids = load_csv_data(train_path, sub_sample=False) # sub-sampling is switched off here
pred_y, pred_x, pred_ids = load_csv_data(test_path, sub_sample=False)

In [None]:
# combining train.csv and test.csv: training rows first, test rows stacked below
ALL_THE_DATA = np.vstack((x, pred_x))
print(x.shape, pred_x.shape)
print(ALL_THE_DATA.shape)

# Using PCA on the data before split so that the train and testset get the same eigenvectors
# (pca() standardises ALL_THE_DATA in place; only the projected data is kept)
data = pca(ALL_THE_DATA, 30)[0]

# building the polynomial feature expansion
degree = 6
phi=bf.build_poly(data, degree)

# Regression to find weights using just the train.csv rows.
# Assumes pca and bf.build_poly keep the row order, so the first `split` rows of phi
# are still the training rows — TODO confirm.
# NOTE(review): lambda_ is not assigned in this cell; it takes whatever value an
# earlier-run cell left in the kernel.
split = x.shape[0]
print(split)
weights, loss = bf.ridge_regression(y, phi[:split], lambda_)
print(weights.shape)



In [None]:
# testing: 5-fold cross-validation of ridge regression on the training rows of phi
# NOTE(review): lambda_ is reassigned here; in file order the `weights` fitted in the
# previous cell were computed before this assignment, i.e. with a different lambda_.
lambda_=0.00000000000001
#initial_w = np.zeros((x.shape[1], 1))
#max_iters = 5
#gamma = 0.01
k_folds = 5

avg_loss, losses, avg_preds, pred_acc_percents = cross_validation(bf.ridge_regression, y, phi[:split], k_folds, lambda_)
#avg_loss, losses, avg_preds, pred_acc_percents = cross_validation(bf.least_squares, y, x, k_folds)
#avg_loss, losses, avg_preds, pred_acc_percents = cross_validation(bf.reg_logistic_regression, y, phi, k_folds, lambda_, initial_w, max_iters, gamma)

# Report the averaged and the per-fold results returned by cross_validation.
print("avg loss: ", avg_loss)
print("Losses: ", losses)
print("Average prediction NON-accuracy: ", avg_preds)
print("prediction NON-accuracy percents: ", pred_acc_percents)

In [None]:
# Predict labels for the test rows of phi (everything after the first `split` rows)
# and print a few sanity checks.
print(phi[:split].shape)
y_pred = predict_labels(weights, phi[split:])
print(y_pred.shape)
print(pred_ids.shape)
# compare the mean predicted label with the mean training label
print(y_pred.mean())
print(y.mean())

In [None]:
# Write the test-set predictions, together with their ids, to the submission file.
name = 'pca_poly2.csv'
create_csv_submission(pred_ids, y_pred, name)