In [158]:
#import stuff here
import math
import numpy as np
import matplotlib.pyplot as plt

In [159]:
#The file "Stand_Data.csv" generated from "Standardization.ipynb" is required
file_dir = "Stand_Data.csv"
data = np.loadtxt(file_dir, delimiter=",")
print(np.shape(data))

(39644, 58)


# PART A:
Start with a basic linear model (the features are only standardized and are used as they are
without any transformations) regardless if you are doing the least-squares data fitting (Chapter
13) or the least-squares classification (Chapter 14) task. Evaluate the initial model using cross-
validation and report the RMS error. Make sure to save the model parameters for each fold of
the cross-validation.

In [160]:
#The file 'OnlineNewsPopularity.csv' from https://archive.ics.uci.edu/dataset/332/online+news+popularity is required
file_dir2 = 'OnlineNewsPopularity.csv'
columns = [60]
target = np.loadtxt(file_dir2, delimiter = ',', skiprows=1, usecols = columns)
print(np.shape(target))

(39644,)


##Linear Model

In [161]:
#Add a column of 1's to the right side of A
A = data
A = np.vstack((A.T, np.ones(39644))).T
print(np.shape(A))

#solve least squares problem
params = np.linalg.lstsq(A, target, rcond=None)[0]

(39644, 59)


##Cross Validation

###Calculate RMS

In [162]:
def RME(x_test, y_test, params):
  preds = []
  data_size = x_test.shape[0]
  for i in range(data_size):
    pred = np.dot(x_test[i], params[:58]) + params[58]
    preds.append(pred)
  return math.sqrt((np.sum(np.square(np.subtract(y_test, preds))))/data_size)

#print(RME(data,target,params))

###Cross Validation Folds

In [163]:
#Train on 5 folds, indexes: [0-7928, 7929-15857, 15858-23786, 23787-31715, 31716-39643]
#fold 1
x_train1 = A[7929:,:]
x_test1 = data[:7979,:]
y_train1 = target[7929:]
y_test1 = target[:7979]
params1 = np.linalg.lstsq(x_train1, y_train1, rcond=None)[0]
print(f"Fold 1 RMS Error: {RME(x_test1, y_test1, params1)}")

#fold 2
x_train2 = np.vstack((A[:7929,:], A[15858:,:]))
x_test2 = data[7929:15858,:]
y_train2 = np.concatenate((target[:7929], target[15858:]))
y_test2 = target[7929:15858]
params2 = np.linalg.lstsq(x_train2, y_train2, rcond=None)[0]
print(f"Fold 2 RMS Error: {RME(x_test2, y_test2, params2)}")

#fold 3
x_train3 = np.vstack((A[:15858,:], A[23787:,:]))
x_test3 = data[15858:23787,:]
y_train3 = np.concatenate((target[:15858], target[23787:]))
y_test3 = target[15858:23787]
params3 = np.linalg.lstsq(x_train3, y_train3, rcond=None)[0]
print(f"Fold 3 RMS Error: {RME(x_test3, y_test3, params3)}")

#fold 4
x_train4 = np.vstack((A[:23787,:], A[31716:,:]))
x_test4 = data[23787:31716,:]
y_train4 = np.concatenate((target[:23787], target[31716:]))
y_test4 = target[23787:31716]
params4 = np.linalg.lstsq(x_train4, y_train4, rcond=None)[0]
print(f"Fold 4 RMS Error: {RME(x_test4, y_test4, params4)}")

#fold 5
x_train5 = A[:31716,:]
x_test5 = data[31716:,:]
y_train5 = target[:31716]
y_test5 = target[31716:]
params5 = np.linalg.lstsq(x_train5, y_train5, rcond=None)[0]
print(f"Fold 5 RMS Error: {RME(x_test5, y_test5, params5)}")

#take the average parameter values of the five folds
params_mean = np.zeros(59)
for i in range(59):
  params_mean[i] = (params1[i] + params2[i] + params3[i] + params4[i] + params5[i])/5
print(f"Cross Validated RMS Error: {RME(data, target, params_mean)}")
print(f"Normal Linear Regression RMS Error: {RME(data,target,params)}")

Fold 1 RMS Error: 12953.320171324256
Fold 2 RMS Error: 12239.76141359509
Fold 3 RMS Error: 15904.998478589163
Fold 4 RMS Error: 6448.062377989548
Fold 5 RMS Error: 8065.5797471211845
Cross Validated RMS Error: 11496.127994612103
Normal Linear Regression RMS Error: 11491.736924948345


#PART B:
Perform feature engineering. Come up with more interesting feature mappings or basis functions
for the linear least-squares data fitting or the linear least-squares classifier. For example, try out
a stratified model (Chapter 13.3.2) using the results of the k-means clustering you did earlier.
Choose the best model using cross-validation and report the RMS error. Make sure to save the
model parameters for each fold of the cross-validation. If you are doing classification, report the
confusion matrix for the best model you found.

##Feature Engineering - Product Interactions

###Determining 5 least significant factors

In [164]:
#Sort the parameters by magnitude and retrieve the five features that contribute the least
params_abs = abs(np.array(params[:58])) #take magnitude of feature parameters
#attach the correlating index to teh parameter value
indexes = list(range(0,58))
stack = np.vstack((indexes, params_abs))
#sort by parameter magnitude
stack = stack.T
sorted = stack[stack[:,1].argsort()]
print(sorted[:5])
worst_indexes = sorted[:5, 0] #list of indexes of least significant features
worst_indexes = worst_indexes.astype(np.int64)

[[31.          0.435544  ]
 [35.         10.36733501]
 [53.         17.43081268]
 [45.         22.70689072]
 [ 8.         23.18702348]]


###Performing Feature Engineering
We will be taking the 5 least significant features and multipling them with all other features to create a new feature to see if it is only important when considering another feature.

In [165]:
#For each of the 5 least significant features, create a new feature by multiplying it with the 57 other features (creates a total of 285 new features)
#B represents a modified A matrix
B=data.T
print(np.shape(B))
i = 0 #i keeps track of which bad index we are on
j = 0 #j represents all other features
for i in range(5):
  bad_index = worst_indexes[i]
  for j in range(58):
    feature_added = []
    if i==j:
      continue #irrelevent to consider a non significant feature with itself
    else:
      for k in range(39644): #k represents all the data points
        feature_added.append(B[bad_index,k]*B[j,k])
      B = np.vstack((B,feature_added))
#newdata represents a modified data matrix
newdata = B.T
B = np.vstack((B,np.ones(39644)))
B = B.T
"""
print(np.shape(newdata))
print(np.shape(B))
print("done")
"""

(58, 39644)


'\nprint(np.shape(newdata))\nprint(np.shape(B))\nprint("done")\n'

##Evaluate New Features

In [166]:
def newRME(x_test, y_test, params):
  preds = []
  data_size = x_test.shape[0]
  for i in range(data_size):
    pred = np.dot(x_test[i], params[:343]) + params[343]
    preds.append(pred)
  return math.sqrt((np.sum(np.square(np.subtract(y_test, preds))))/data_size)

In [167]:
newparams = np.linalg.lstsq(B, target, rcond=None)[0]


#5 folds, indexes: [0-7928, 7929-15857, 15858-23786, 23787-31715, 31716-39643]
#fold 1
newx_train1 = B[7929:,:]
newx_test1 = newdata[:7979,:]
newy_train1 = target[7929:]
newy_test1 = target[:7979]
newparams1 = np.linalg.lstsq(newx_train1, newy_train1, rcond=None)[0]
print(f"New Fold 1 RMS Error: {newRME(newx_test1, newy_test1, newparams1)}")


#fold 2
newx_train2 = np.vstack((B[:7929,:], B[15858:,:]))
newx_test2 = newdata[7929:15858,:]
newy_train2 = np.concatenate((target[:7929], target[15858:]))
newy_test2 = target[7929:15858]
newparams2 = np.linalg.lstsq(newx_train2, newy_train2, rcond=None)[0]
print(f"New Fold 2 RMS Error: {newRME(newx_test2, newy_test2, newparams2)}")


#fold 3
newx_train3 = np.vstack((B[:15858,:], B[23787:,:]))
newx_test3 = newdata[15858:23787,:]
newy_train3 = np.concatenate((target[:15858], target[23787:]))
newy_test3 = target[15858:23787]
newparams3 = np.linalg.lstsq(newx_train3, newy_train3, rcond=None)[0]
print(f"New Fold 3 RMS Error: {newRME(newx_test3, newy_test3, newparams3)}")


#fold 4
newx_train4 = np.vstack((B[:23787,:], B[31716:,:]))
newx_test4 = newdata[23787:31716,:]
newy_train4 = np.concatenate((target[:23787], target[31716:]))
newy_test4 = target[23787:31716]
newparams4 = np.linalg.lstsq(newx_train4, newy_train4, rcond=None)[0]
print(f"New Fold 4 RMS Error: {newRME(newx_test4, newy_test4, newparams4)}")


#fold 5
newx_train5 = B[:31716,:]
newx_test5 = newdata[31716:,:]
newy_train5 = target[:31716]
newy_test5 = target[31716:]
newparams5 = np.linalg.lstsq(newx_train5, newy_train5, rcond=None)[0]
print(f"New Fold 5 RMS Error: {newRME(newx_test5, newy_test5, newparams5)}")

#take the average parameter value of the five folds
newparams_mean = np.zeros(344)
for i in range(344):
 newparams_mean[i] = (newparams1[i] + newparams2[i] + newparams3[i] + newparams4[i] + newparams5[i])/5
print(f"New Cross Validated RMS Error: {newRME(newdata, target, newparams_mean)}")
print(f"New Linear Regression RMS Error: {newRME(newdata,target,newparams)}")

New Fold 1 RMS Error: 15793.41339679203
New Fold 2 RMS Error: 12375.805800802822
New Fold 3 RMS Error: 15960.51636812314
New Fold 4 RMS Error: 108183.84277710892
New Fold 5 RMS Error: 8171.10214671604
New Cross Validated RMS Error: 14987.738100057584
New Linear Regression RMS Error: 11428.975821063592


In [168]:
#regularizing linear model
test_set = A[:19822]
training_set = A[19822:]

test_outcome = target[:19822]
training_outcome = target[19822:]


reg_param_values = [.001,1,10,20,100,200,300,400,600,1000,1200,1300,1500,1800,2000,2300,2600,3000,4000,5000,6000,7000,8000,9000,10000]
fit_RMS = []
param_norms = []



for reg_param in reg_param_values:
    dim = A.shape[1]
    ident = np.identity(dim)
    ident *= reg_param
    ident[dim-1][dim-1] = 0
    mat = np.vstack((training_set, ident))

    

    outcome = np.concatenate((training_outcome, np.zeros(dim)))
    params = np.linalg.lstsq(mat, outcome, rcond=None)[0]
    param_norm = np.linalg.norm(np.delete(params,dim-1)) 
    param_norms = np.append(param_norms, param_norm)


    preds = []
    for i in range(test_set.shape[0]):
        pred = np.dot(test_set[i],params) 
        preds.append(pred)

    dif = preds - test_outcome
    norm_ = np.linalg.norm(dif)
    fit_RMS.append(norm_/np.sqrt(dif.shape[0]))


    
lowest_RMS = np.argmin(fit_RMS)

print(reg_param_values[lowest_RMS])
print(fit_RMS[lowest_RMS])
print(param_norms[lowest_RMS])




20
13821.7649707273
2386.922700781445


In [169]:
#regularizing linear model
test_set = B[:19822]
training_set = B[19822:]

test_outcome = target[:19822]
training_outcome = target[19822:]


reg_param_values = [.001,1,10,20,100,200,300,400,600,1000,1200,1300,1500,1800,2000,2300,2600,3000,4000,5000,6000,7000,8000,9000,10000]
fit_RMS = []
param_norm = []



for reg_param in reg_param_values:
    dim = B.shape[1]
    ident = np.identity(dim)
    ident *= reg_param
    ident[dim-1][dim-1] = 0
    mat = np.vstack((training_set, ident))

    

    outcome = np.concatenate((training_outcome, np.zeros(dim)))
    params = np.linalg.lstsq(mat, outcome, rcond=None)[0]
    param_norm = np.linalg.norm(np.delete(params,dim-1)) 
    param_norms = np.append(param_norms, param_norm)


    preds = []
    for i in range(test_set.shape[0]):
        pred = np.dot(test_set[i],params) 
        preds.append(pred)

    dif = preds - test_outcome
    norm_ = np.linalg.norm(dif)
    fit_RMS.append(norm_/np.sqrt(dif.shape[0]))
    
lowest_RMS = np.argmin(fit_RMS)

print(reg_param_values[lowest_RMS])
print(fit_RMS[lowest_RMS])
print(param_norms[lowest_RMS])



100
13834.784062169654
847.583449965849


In [170]:
#New Feature Engineering

new_matrix = A

print(A.shape)

for i in range(A.shape[0]):
    if A[i][7] > 0:
        A[i][7] = 1
    else:
        A[i][7] = 0

    if A[i][8] > 0:
        A[i][8] = 1
    else:
        A[i][8] = 0

    if A[i][26] > 0:
        A[i][26] = np.log(A[i][26])

    if A[i][27] > 0:
        A[i][27] = np.log(A[i][27])

    if A[i][28] > 0:
        A[i][28] = np.log(A[i][28])
    

#regularizing linear model
test_set = new_matrix[:19822]
training_set = new_matrix[19822:]

test_outcome = target[:19822]
training_outcome = target[19822:]


reg_param_values = [.001,1,10,20,100,200,300,400,600,1000,1200,1300,1500,1800,2000,2300,2600,3000,4000,5000,6000,7000,8000,9000,10000]
fit_RMS = []
param_norms = []



for reg_param in reg_param_values:
    dim = A.shape[1]
    ident = np.identity(dim)
    ident *= reg_param
    ident[dim-1][dim-1] = 0
    mat = np.vstack((training_set, ident))

    

    outcome = np.concatenate((training_outcome, np.zeros(dim)))
    params = np.linalg.lstsq(mat, outcome, rcond=None)[0]
    param_norm = np.linalg.norm(np.delete(params,dim-1)) 
    param_norms = np.append(param_norms, param_norm)


    preds = []
    for i in range(test_set.shape[0]):
        pred = np.dot(test_set[i],params) 
        preds.append(pred)

    dif = preds - test_outcome
    norm_ = np.linalg.norm(dif)
    fit_RMS.append(norm_/np.sqrt(dif.shape[0]))


    
lowest_RMS = np.argmin(fit_RMS)

print(reg_param_values[lowest_RMS])
print(fit_RMS[lowest_RMS])
print(param_norms[lowest_RMS])



(39644, 59)
20
13828.735274427861
2379.171643289668
