In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [2]:
from proj1_helpers import *
from implementations import *

In [3]:
# Relative path of the training CSV; it is handed to load_csv_data in the next cell.
train_data = 'all/train.csv'

In [4]:
# Load the training set. The helper returns four values, used below as:
#   yb          - labels (passed as y to split_data)
#   input_data  - 2-D feature matrix (indexed by column and by .shape[1] later on)
#   ids         - not used elsewhere in this notebook
#   features    - feature names, one per column of input_data
yb, input_data, ids, features = load_csv_data(train_data)

In [5]:
def remove_undefined_features(input_data, features, threshold, undefined_value=-999):
    """Drop feature columns whose share of undefined entries reaches the threshold.

    A column is kept only when the fraction of its entries equal to
    ``undefined_value`` is strictly below ``threshold``; a column whose
    fraction is exactly ``threshold`` is removed.

    Parameters
    ----------
    input_data : array-like, shape (n_samples, n_features)
        Data matrix with one column per feature.
    features : array-like, length n_features
        Feature names, in the same order as the columns of ``input_data``.
    threshold : float
        Exclusive upper bound on the tolerated undefined ratio of a column.
    undefined_value : scalar, optional
        Sentinel that marks an undefined entry. Defaults to -999.

    Returns
    -------
    new_features : numpy.ndarray
        Names of the columns that were kept.
    new_data : numpy.ndarray
        ``input_data`` restricted to the kept columns (all rows preserved).
    """
    # Accept plain lists as well as arrays; boolean-mask indexing needs ndarrays.
    input_data = np.asarray(input_data)
    features = np.asarray(features)

    # Fraction of undefined entries per column, computed in one vectorized pass.
    undefined_ratio = (input_data == undefined_value).mean(axis=0)

    clean_feature_indices = undefined_ratio < threshold
    new_features = features[clean_feature_indices]
    new_data = input_data[:, clean_feature_indices]

    return new_features, new_data

In [6]:
def remove_undefined_samples(labels, input_data, undefined_value=-999):
    """Drop every sample (row) that has at least one undefined feature.

    Parameters
    ----------
    labels : array-like, length n_samples
        One label per row of ``input_data``.
    input_data : array-like, shape (n_samples, n_features)
        Data matrix with one row per sample.
    undefined_value : scalar, optional
        Sentinel that marks an undefined entry. Defaults to -999.

    Returns
    -------
    new_labels : numpy.ndarray
        Labels of the rows that were kept.
    new_data : numpy.ndarray
        Rows of ``input_data`` that contain no undefined entry.
    """
    # Accept plain lists as well as arrays; boolean-mask indexing needs ndarrays.
    labels = np.asarray(labels)
    input_data = np.asarray(input_data)

    # True for the rows in which every entry differs from the sentinel.
    clean_data_indices = (input_data != undefined_value).all(axis=1)

    return labels[clean_data_indices], input_data[clean_data_indices]

In [7]:
def extend_and_standardize(input_data):
    """Standardize ``input_data`` and pass the result through ``build_model_data``.

    ``standardize`` returns three values; only the first (the transformed
    data) is used, the other two (presumably the mean and the standard
    deviation it computed) are discarded.

    NOTE(review): build_model_data presumably adds a constant (bias) column
    to the data - confirm against its definition in the helper module.
    """
    standardized, _mean, _std = standardize(input_data)
    return build_model_data(standardized)

### No standardization 

In [30]:
# Split the raw (not standardized) features and the labels into train and test parts.
# 0.8 is presumably the fraction assigned to training - confirm against split_data.
x_tr, x_te, y_tr, y_te = split_data(input_data, yb, 0.8)

In [31]:
# Fit least squares on the training part (the helper takes the labels first, then the features).
w_least_squares = least_squares(y_tr, x_tr)

In [32]:
# Score the least-squares weights on the held-out part.
# presumably a misclassification rate - verify against the definition of error().
error(y_te,x_te,w_least_squares)

0.25598

In [33]:
# Settings for the logistic-regression run on the raw features.
initial_w = np.zeros(input_data.shape[1])  # zero start, one weight per feature column
max_iters = 1000  # passed to logistic_regression as its iteration budget
gamma = 0.01  # passed to logistic_regression; presumably the step size - confirm

In [34]:
# Fit logistic regression on the training part; the helper returns a loss and the weights.
# NOTE(review): the recorded output shows loss=0 at every logged iteration, which is not a
# plausible logistic loss. Check the loss computation/printing inside logistic_regression and
# whether it expects labels in {0, 1} - TODO confirm.
loss, w_logistic = logistic_regression(y_tr, x_tr, initial_w, max_iters, gamma)

Current iteration=0, loss=0
Current iteration=100, loss=0
Current iteration=200, loss=0
Current iteration=300, loss=0
Current iteration=400, loss=0
Current iteration=500, loss=0
Current iteration=600, loss=0
Current iteration=700, loss=0
Current iteration=800, loss=0
Current iteration=900, loss=0


In [35]:
# Score the logistic-regression weights on the same held-out part.
error(y_te,x_te,w_logistic)

0.33786

### With standardization and extension 

In [12]:
import datetime

In [8]:
# Standardize all features and pass them through build_model_data (see extend_and_standardize).
tx = extend_and_standardize(input_data)

In [9]:
# Split the standardized, extended data; this rebinds x_tr, x_te, y_tr and y_te.
# 0.8 is presumably the fraction assigned to training - confirm against split_data.
x_tr, x_te, y_tr, y_te = split_data(tx, yb, 0.8)

In [14]:
# Time the least-squares fit with wall-clock timestamps taken right before and after the call.
start_time = datetime.datetime.now()
w_least_squares = least_squares(y_tr, x_tr)
end_time = datetime.datetime.now()
print((end_time-start_time).total_seconds())  # elapsed time in seconds

0.017176


In [39]:
# Score the least-squares weights fitted on the standardized data on the held-out part.
error(y_te,x_te,w_least_squares)

0.25532

In [40]:
# Settings for the logistic-regression run on the standardized, extended data.
initial_w = np.zeros(tx.shape[1])  # zero start, one weight per column of tx
max_iters = 1000  # passed to logistic_regression as its iteration budget
gamma = 0.01  # passed to logistic_regression; presumably the step size - confirm

In [41]:
# Fit logistic regression on the standardized training part.
# NOTE(review): the recorded output again shows loss=0 at every logged iteration - verify the
# loss computation in logistic_regression before trusting this run.
loss, w_logistic = logistic_regression(y_tr, x_tr, initial_w, max_iters, gamma)

Current iteration=0, loss=0
Current iteration=100, loss=0
Current iteration=200, loss=0
Current iteration=300, loss=0
Current iteration=400, loss=0
Current iteration=500, loss=0
Current iteration=600, loss=0
Current iteration=700, loss=0
Current iteration=800, loss=0
Current iteration=900, loss=0


In [42]:
# Score the logistic-regression weights fitted on the standardized data on the held-out part.
error(y_te,x_te,w_logistic)

0.31244

### With Data Cleaning

In [43]:
# Cut-off for remove_undefined_features: a column is kept only if its share of -999 entries
# is strictly below this value.
threshold = 0.7

In [44]:
# Drop the feature columns whose share of undefined (-999) entries is at least `threshold`.
new_features, new_data = remove_undefined_features(input_data,features,threshold)

In [45]:
# Standardize and extend the reduced feature set.
# NOTE(review): the columns that were kept can still contain -999 entries, and these are
# standardized here like ordinary values - confirm this is intended.
cleaned_tx = extend_and_standardize(new_data)

In [46]:
# Split the cleaned data; this rebinds x_tr, x_te, y_tr and y_te.
# 0.8 is presumably the fraction assigned to training - confirm against split_data.
x_tr, x_te, y_tr, y_te = split_data(cleaned_tx, yb, 0.8)

In [47]:
# Fit least squares on the cleaned training part.
w_least_squares = least_squares(y_tr, x_tr)

In [48]:
# Score the least-squares weights fitted on the cleaned data on the held-out part.
error(y_te,x_te,w_least_squares)

0.2659

In [49]:
# Settings for the logistic-regression run on the cleaned data.
initial_w = np.zeros(cleaned_tx.shape[1])  # zero start, one weight per column of cleaned_tx
max_iters = 1000  # passed to logistic_regression as its iteration budget
gamma = 0.01  # passed to logistic_regression; presumably the step size - confirm

In [50]:
# Fit logistic regression on the cleaned training part.
# NOTE(review): the recorded output again shows loss=0 at every logged iteration - verify the
# loss computation in logistic_regression before trusting this run.
loss, w_logistic = logistic_regression(y_tr, x_tr, initial_w, max_iters, gamma)

Current iteration=0, loss=0
Current iteration=100, loss=0
Current iteration=200, loss=0
Current iteration=300, loss=0
Current iteration=400, loss=0
Current iteration=500, loss=0
Current iteration=600, loss=0
Current iteration=700, loss=0
Current iteration=800, loss=0
Current iteration=900, loss=0


In [51]:
# Score the logistic-regression weights fitted on the cleaned data on the held-out part.
error(y_te,x_te,w_logistic)

0.3486

### Ridge Regression

In [16]:
# Build the standardized, extended data from the full feature set (no columns removed).
tx = extend_and_standardize(input_data)

In [17]:
# Split the standardized, extended data; this rebinds x_tr, x_te, y_tr and y_te.
# 0.8 is presumably the fraction assigned to training - confirm against split_data.
x_tr, x_te, y_tr, y_te = split_data(tx, yb, 0.8)

In [22]:
# One cross-validation evaluation at a fixed lambda, using the training part only.
seed = 1  # handed to build_k_indices
k_fold = 4  # handed to build_k_indices; presumably the number of folds
k_indices = build_k_indices(y_tr, k_fold, seed)
lambda_ = 0.001  # the lambda evaluated in this cell
# The literal 0 is presumably the index of the held-out fold - confirm against cross_validation.
loss_tr, loss_te = cross_validation(y_tr,x_tr,k_indices,0,lambda_)

In [23]:
# Show the two values returned by cross_validation (bound to loss_tr and loss_te).
print(loss_tr)
print(loss_te)

0.3389129942252739
0.3400225879407435


In [26]:
# Pick a lambda using the training part only.
# The candidate values and the selection criterion live inside find_optimal_lambda and are
# not visible in this notebook; the recorded output shows 20 "Iteration" lines.
optimal_lambda = find_optimal_lambda(y_tr,x_tr)

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19


In [27]:
# Fit ridge regression on the whole training part with the selected lambda.
w_rr = ridge_regression(y_tr,x_tr,optimal_lambda)

In [28]:
# Score the ridge-regression weights on the held-out part.
error(y_te,x_te,w_rr)

0.25534

In [29]:
# Display the lambda returned by find_optimal_lambda.
optimal_lambda

0.0001

In [None]:
# Number of entries in the feature-name array (this cell has no recorded execution).
features.size