# LEAST SQUARES POLYNOMIAL IMPLEMENTATION

# Load data

In [1]:
import sys
# NOTE(review): absolute, machine-specific Windows path -- the notebook cannot
# run on another machine without editing this line.
my_path = r'C:\Users\utente\Documents\GitHub\Project1_ML'
# Put the project's shared code folder first on the import path
# (presumably where proj1_helpers, implementations and standard live).
sys.path.insert(0,my_path + r'\code\COMMON')

# import external modules
import numpy as np
import matplotlib.pyplot as plt

# import internal modules
from proj1_helpers import *
from implementations import *
from standard import standardize

In [2]:
# Read the full training set (no sub-sampling) and report its dimensions.
yb, input_data, ids = load_csv_data(my_path + r'\data\train.csv', sub_sample=False)

print("Data loaded! Shape: ", input_data.shape, sep="\n")

Data loaded! Shape: 
(250000, 30)


# Remove -999 entries or substitute them with the mean

In [3]:
X = input_data

# Number of -999 placeholders found in each feature column.
nine_count = [
    int(np.count_nonzero(column == -999))
    for column in input_data.T
]

print(nine_count)

[38114, 0, 0, 0, 177457, 177457, 177457, 0, 0, 0, 0, 0, 177457, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 99913, 99913, 99913, 177457, 177457, 177457, 0]


In [4]:
# Strategy switch for the -999 placeholder values:
#   DELETE   -> drop every row that contains at least one -999
#   MEAN_SUB -> keep all rows and replace each -999 by its column mean
DELETE = 1
MEAN_SUB = 0

delete_method = DELETE

# Locate the placeholders once. Everything below is derived from the raw
# `input_data`, which is never modified, so this cell can be re-run with
# either strategy and always starts from the same data.
missing = (input_data == -999)
complete_rows = ~missing.any(axis=1)

if delete_method:
    # Keep only the rows (and their labels) without any placeholder.
    X = input_data[complete_rows]
    Y = np.asarray(yb)[complete_rows]
    print("-999 Deleted! Shape:")

else:
    # Column means are taken over the placeholder-free rows only, then written
    # into a fresh array instead of overwriting `input_data` in place.
    means = np.mean(input_data[complete_rows], axis=0)
    X = np.where(missing, means, input_data)
    Y = yb

    print("-999 Substituted with the mean! Shape")

print(np.shape(X))

-999 Deleted! Shape:
(68114, 30)


In [5]:
# Re-count the -999 placeholders per column of the cleaned matrix;
# every entry is expected to be zero now.
nine_count = (X == -999).sum(axis=0).tolist()

print(nine_count)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# Standardization

In [6]:
# Scale the cleaned features with the project's `standardize` helper.
# NOTE(review): presumably mean_X / std_X are the per-column mean and standard
# deviation used for the scaling -- confirm in standard.py.
Xstd, mean_X, std_X = standardize(X)    
# Not the last expression of the cell, so this line displays nothing.
Xstd.shape

# From here on X refers to the standardized features.
X = Xstd

# Build_poly

In [7]:
def build_poly(x, degree):
    """Expand each feature of ``x`` into its powers 1..degree behind one bias column.

    Parameters
    ----------
    x : array_like, shape (n_samples, n_features) or (n_samples,)
        Input features. A 1-D input is treated as a single feature column.
    degree : int
        Highest power to generate; must be >= 0.

    Returns
    -------
    numpy.ndarray, shape (n_samples, 1 + n_features * degree)
        Each row is ``[1, x0, x0**2, ..., x0**degree, x1, ..., x1**degree, ...]``.
        There are no cross terms between different features.

    Raises
    ------
    ValueError
        If ``degree`` is negative or ``x`` is neither 1-D nor 2-D.
    """
    if degree < 0:
        raise ValueError("degree must be non-negative")

    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, np.newaxis]
    elif x.ndim != 2:
        raise ValueError("x must be a 1-D or 2-D array")

    # np.vander(..., increasing=True) gives [1, v, v**2, ...] per sample; its
    # leading column of ones is dropped so the result has a single bias column.
    powers = [
        np.vander(x[:, j], degree + 1, increasing=True)[:, 1:]
        for j in range(x.shape[1])
    ]

    bias_dtype = powers[0].dtype if powers else x.dtype
    bias = np.ones((x.shape[0], 1), dtype=bias_dtype)

    # Single concatenate instead of re-allocating the growing matrix per feature.
    return np.concatenate([bias] + powers, axis=1)

# Build_poly testing

In [8]:
# Small integer matrix (2 rows, 3 columns) used to eyeball build_poly's output.
A = np.array([[2, 3, 4], [5, 6, 7]])
# `b` is only printed below; it is not passed to build_poly.
b = np.array([[2,3]])

print(b)

# Last expression of the cell, so the degree-3 expansion of A is displayed.
build_poly(A, 3)


[[2 3]]


array([[  1,   2,   4,   8,   3,   9,  27,   4,  16,  64],
       [  1,   5,  25, 125,   6,  36, 216,   7,  49, 343]])

# Least Squares

In [9]:
# Fit on the (un-expanded) feature matrix X; the project helper presumably
# returns the weight vector and the loss of the fit.
w, loss = least_squares(Y, X)

In [10]:
# Display the fitted weights.
w

array([ -1.17640522e-01,  -5.52123243e-01,  -3.66659340e-01,
         4.28408519e-01,  -9.39054796e-01,   6.92131844e-02,
         4.44975426e-01,   4.94896674e+01,  -2.95168677e-01,
         3.40608586e+01,  -2.32999113e+01,   1.64630980e+01,
         5.24609139e+01,  -3.34666729e+01,  -5.23000787e-01,
         3.45788920e-02,  -3.30479148e+01,  -6.94110475e-01,
         2.78263357e-01,   1.68820197e-01,   4.16889440e-01,
        -1.20236231e-01,  -2.64256662e+01,  -3.99506855e-01,
         4.40256269e-01,   4.73198246e-01,   5.66094975e-02,
         3.38788428e-01,  -2.30033136e-01,  -3.39704798e+01])

In [11]:
# Display the loss value returned by least_squares.
loss

0.36788093286707663

In [12]:
# Predicted label = sign of the linear score for every row of X.
scores = np.dot(X, w)
y_model = np.sign(scores)

In [13]:
# Sanity check: one prediction per row of X is expected.
np.shape(y_model)

(68114,)

In [14]:
# Count rows where the predicted sign agrees / disagrees with the label.
matches = np.ravel(Y) == np.ravel(y_model)
pos = int(np.count_nonzero(matches))
neg = len(Y) - pos
print(pos)
print(neg)

49439
18675


In [15]:
# Fraction of rows predicted correctly, measured on the same rows that were
# used for fitting (so this is a training accuracy, not a validation score).
success_rate = pos/(pos+neg)
print(success_rate)

0.7258272895439998


# Least Squares Poly

In [9]:
# Expand the feature matrix into the degree-3 polynomial basis.
# NOTE: X is overwritten with its own expansion, so re-running this cell
# expands the already-expanded matrix again, and any cell that needs the
# un-expanded X has to be run before this one.
X = build_poly(X, 3)
print(X)

[[  1.00000000e+00   5.65377932e-01   3.19652207e-01 ...,   3.79160453e-01
    1.43762649e-01   5.45091112e-02]
 [  1.00000000e+00   2.02040212e-01   4.08202471e-02 ...,   9.76916101e-01
    9.54365069e-01   9.32334603e-01]
 [  1.00000000e+00   6.42063175e-01   4.12245121e-01 ...,   8.74139682e-01
    7.64120184e-01   6.67947775e-01]
 ..., 
 [  1.00000000e+00   4.73518199e-01   2.24219485e-01 ...,   9.72732863e-01
    9.46209223e-01   9.20408807e-01]
 [  1.00000000e+00   5.02778495e-01   2.52786216e-01 ...,   3.60472041e+00
    1.29940093e+01   4.68397704e+01]
 [  1.00000000e+00   1.15110584e+00   1.32504465e+00 ...,   8.31628736e-01
    6.91606355e-01   5.75159719e-01]]


In [10]:
# Refit on the polynomial features; this replaces the `w` and `loss` obtained
# from the earlier fit on the un-expanded features.
w, loss = least_squares(Y, X)

In [11]:
# Number of fitted weights (expected: one per column of the expanded X).
np.shape(w)

(91,)

In [12]:
# Sign of the polynomial model's score for every training row.
y_model = np.sign(np.dot(X, w))
y_model.shape

(68114,)

In [13]:
# Tally agreements between labels and predictions in one vectorized comparison.
agree = np.ravel(Y) == np.ravel(y_model)
pos = int(agree.sum())
neg = len(Y) - pos
print(pos)
print(neg)

# Share of rows classified correctly.
success_rate = pos/(pos+neg)
print(success_rate)

53700
14414
0.78838417946384


# Load data for the test

In [14]:
# Read the full test set. The first returned value is not used afterwards, and
# `ids` now holds the test ids (the training ids are replaced).
nope, test_data, ids = load_csv_data(my_path + r'\data\test.csv', sub_sample=False)

print("Data loaded! Shape: ", test_data.shape, sep="\n")

Data loaded! Shape: 
(568238, 30)


In [15]:
# The weights `w` were fitted on standardized features that contained no -999
# placeholders, so the raw test matrix must go through the same preprocessing
# before it is expanded; feeding it in unprocessed puts the features on a
# completely different scale from the one the model was trained on.
# NOTE(review): assumes `standardize` returned the per-column mean / std of the
# training features and that Xstd == (X - mean_X) / std_X -- confirm in standard.py.

# Test rows cannot be dropped (every id needs a prediction), so each -999 is
# replaced by the training mean of its column, which becomes 0 after scaling.
test_filled = np.where(test_data == -999, mean_X, test_data)
test_std = (test_filled - mean_X) / std_X

test_X = build_poly(test_std, 3)

y_predictions = np.sign(test_X.dot(w))
np.shape(y_predictions)

(568238,)

In [16]:
# Peek at the predicted test labels.
y_predictions

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [17]:
# Hand the test ids and predicted labels to the project's submission helper
# (presumably writes a CSV named after the last argument -- confirm in proj1_helpers).
# `ids` are the ones returned when test.csv was loaded.
create_csv_submission(ids, y_predictions, "third_sub")