### Download data

We will download the data from NYU Box and extract it to the local disk. This may take a couple of minutes.

In [None]:
# Root directory of the weather data. Later cells build paths by plain string
# concatenation (str_path + "hourly/2015/", str_path + "hourly_temperature_2015.npy"),
# so the trailing slash is required.
str_path = "./weather/"

In [2]:
import matplotlib.pyplot as plt
import numpy as np
from os import listdir
from sklearn import linear_model

# Seed NumPy's global random state so that the np.random.permutation call used
# for the train/validation/test split gives the same split on every run.
np.random.seed(2021)

In [3]:
def extract_temp(file_name,col_ind):
    """Load one station file and return a temperature column with bad readings patched.

    A reading is treated as invalid when it lies outside [-100, 100]. Invalid
    readings are replaced as follows:
      * an invalid reading that follows a valid one takes the value of the
        previous (already patched) reading, i.e. a forward fill;
      * invalid readings at the very start of the file take the value of the
        first valid reading (previously index 0 was patched with data[-1],
        the *last* reading of the file, which could itself be invalid);
      * if the column has no valid reading at all, every entry is set to NaN,
        since there is nothing sensible to copy from.

    Args:
        file_name: path of a numeric text file readable by np.loadtxt; only the
            first 10 columns are read.
        col_ind: index (0-9) of the column to extract.

    Returns:
        1D float array with one entry per row of the file.

    Side effects:
        Prints the file name, the number of invalid readings and their indices.
    """
    # ndmin=2 keeps the array 2D even when the file holds a single row, so the
    # column indexing below works for any number of rows >= 1.
    data_aux = np.loadtxt(file_name, usecols=range(10), ndmin=2)
    data = data_aux[:,col_ind]

    invalid = (data > 100) | (data < -100)
    ind_errs = np.flatnonzero(invalid).tolist()
    err_count = len(ind_errs)

    valid_inds = np.flatnonzero(~invalid)
    if valid_inds.size == 0:
        # No trustworthy reading to borrow from.
        data[:] = np.nan
    else:
        first_valid = int(valid_inds[0])
        # Back-fill the leading run of invalid readings.
        data[:first_valid] = data[first_valid]
        # Forward-fill the rest; ind_errs is ascending, so data[ind - 1] has
        # already been patched when it was invalid too.
        for ind in ind_errs:
            if ind > first_valid:
                data[ind] = data[ind - 1]

    print("File name: " + file_name)
    print("Errors: " + str(err_count) + " Indices: " + str(ind_errs))
    return data

def create_data_matrix(str_path):
    """Build the (time, station) temperature matrix from a directory of station files.

    Files are processed in sorted name order and files whose name starts with
    '.' are skipped, so column j of the result corresponds to the j-th
    non-hidden file name in sorted order.

    Args:
        str_path: directory containing one text file per station (with or
            without a trailing path separator).

    Returns:
        2D array of shape (n_readings, n_stations); the result is 2D even when
        there is only one station.

    Raises:
        ValueError: if the directory contains no non-hidden files (previously
            this surfaced as an AttributeError on an empty list), or if the
            station files do not all have the same number of rows (raised by
            np.vstack).

    Side effects:
        Prints "Station <j>" before each file is read; extract_temp prints
        further per-file diagnostics.
    """
    # Local import: only `listdir` is imported from os at file level.
    from os.path import join as join_path

    col_ind = 8 # 8 = last 5 minutes, 9 = average over the whole hour

    station_files = sorted(name for name in listdir(str_path) if not name.startswith('.'))
    if not station_files:
        raise ValueError("No station files found in " + str(str_path))

    # Collect every station first and stack once, instead of re-copying the
    # whole matrix with np.vstack on each iteration.
    station_rows = []
    for ind, file_name in enumerate(station_files):
        print("Station " + str(ind))
        station_rows.append(extract_temp(join_path(str_path, file_name), col_ind))

    return np.vstack(station_rows).T

In [4]:
# Set load_files = True to rebuild the matrix from the raw station files;
# otherwise the precomputed matrix stored next to them is loaded.
load_files = False
str_path_2015 = str_path + "hourly/2015/"
if load_files:
    data_matrix = create_data_matrix(str_path_2015)
else:
    data_matrix = np.load(str_path + "hourly_temperature_2015.npy")

# Station file names, sorted. Hidden files (e.g. ".DS_Store") are dropped
# because create_data_matrix skips them too; keeping them here would shift the
# list relative to the columns of data_matrix and mislabel the stations.
# NOTE(review): for the cached .npy this assumes it was built from the same
# directory contents — confirm.
file_name_list = sorted(
    name for name in listdir(str_path_2015) if not name.startswith('.')
)

### Prepare data for training

In [5]:
# The station in column `ind_response` is the regression target; all other
# stations (columns) serve as features.
ind_response = 18
print("Response is " + str(file_name_list[ind_response]))

n_stations = data_matrix.shape[1]
ind_X = np.concatenate(
    [np.arange(0, ind_response), np.arange(ind_response + 1, n_stations)]
)

y_raw = data_matrix[:, ind_response]
X_raw = data_matrix[:, ind_X]
n_features = X_raw.shape[1]

Response is CRNH0203-2015-AL_Valley_Head_1_SSW.txt


In [6]:
# Fixed-size held-out sets; every remaining row goes to training.
n_test, n_val = 1000, 100
n_train = len(data_matrix) - (n_test + n_val)

In [7]:
# Shuffle the row indices once, then cut the permutation into three
# consecutive chunks: test, validation, train.
aux_ind = np.random.permutation(range(data_matrix.shape[0]))

val_start = n_test
train_start = n_test + n_val
train_stop = int(train_start + n_train)

ind_test = aux_ind[:val_start]
ind_val = aux_ind[val_start:train_start]
ind_train = aux_ind[train_start:train_stop]

X_test, y_test = X_raw[ind_test, :], y_raw[ind_test]
X_val, y_val = X_raw[ind_val, :], y_raw[ind_val]
X_train, y_train = X_raw[ind_train, :], y_raw[ind_train]

For this problem we will work with features that have zero mean and unit variance. We standardize the training data below. Make sure to standardize the validation and test data using the statistics computed from the training data.

In [8]:
# Standardize with statistics estimated on the training split only.
# center_vec / col_norms (features) and y_train_center / norm_y_train
# (response) are kept as globals so the same shift and scale can be applied
# to other splits.
sqrt_n_train = np.sqrt(n_train)

# Features: subtract the per-column mean, then divide by the per-column
# root-mean-square of the centered values.
center_vec = np.mean(X_train, axis=0)
X_train_centered = X_train - center_vec
col_norms = np.linalg.norm(X_train_centered, axis=0) / sqrt_n_train
X_train_norm = X_train_centered / col_norms

# Response: same treatment with scalar statistics.
y_train_center = np.mean(y_train)
y_train_centered = y_train - y_train_center
norm_y_train = np.linalg.norm(y_train_centered) / sqrt_n_train
y_train_norm = y_train_centered / norm_y_train

#### 4(c)

In [11]:
# print the error value achieved on validation set.

1.2136898851357356

### 4(d) SGD

In [1]:
def stochastic_grad_descent(X, y, alpha=0.05, num_epoch=1000):
    """
    Template for stochastic gradient descent (SGD). The body is not implemented
    yet, so calling this function currently does nothing and returns None.

    Args:
        X - the feature matrix, 2D numpy array of size (num_instances, num_features)
        y - the label vector, 1D numpy array of size (num_instances)
        alpha - string or float, step size used in the SGD updates
                NOTE: In SGD, a fixed step size is usually not a good idea;
                a decaying step size such as 1/sqrt(t) or 1/t is typical.
                if alpha is a float, that float is the step size at every step.
                if alpha == "1/sqrt(t)", the step size is 1/sqrt(t).
                if alpha == "1/t", the step size is 1/t.
                (t is presumably the update-step counter - TODO confirm what it
                counts and where it starts.)
        num_epoch - number of epochs, i.e. passes through the whole training set
    """

In [2]:
# Step sizes to compare: three fixed values plus the two decaying schedules
# that the stochastic_grad_descent docstring lists for its `alpha` argument.
learning_rates = [0.0005, 0.005, 0.05, "1/sqrt(t)", "1/t"]

In [3]:
# graphs of validation and training error with epoch for different learning rates

### 4(e) 

In [4]:
# compare the test loss for SGD and 4(c)