In [28]:
#import stuff here
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [29]:
#The file "Stand_Data.csv" generated from "Standardization.ipynb" is required
file_dir = "Stand_Data.csv"
data = np.loadtxt(file_dir, delimiter=",")
print(np.shape(data))

(39644, 58)


# PART A:
Start with a basic linear model (the features are only standardized and are used as they are
without any transformations) regardless if you are doing the least-squares data fitting (Chapter
13) or the least-squares classification (Chapter 14) task. Evaluate the initial model using cross-
validation and report the RMS error. Make sure to save the model parameters for each fold of
the cross-validation.

In [30]:
#The file 'OnlineNewsPopularity.csv' from https://archive.ics.uci.edu/dataset/332/online+news+popularity is required
file_dir2 = 'OnlineNewsPopularity.csv'
columns = [60]
target = np.loadtxt(file_dir2, delimiter = ',', skiprows=1, usecols = columns)
print(np.shape(target))

(39644,)


##Linear Model

In [31]:
#Add a column of 1's to A
A = data
A = np.vstack((A.T, np.ones(39644))).T
print(np.shape(A))

#solve least squares problem
params = np.linalg.lstsq(A, target, rcond=None)[0]
print(A)
print(params.shape)
print(params)

(39644, 59)
[[ 0.75744723 -0.69521045  0.03277187 ... -1.81071884  0.13891975
   1.        ]
 [-0.66165665 -0.61879381  0.01605588 ...  0.83774863 -0.68965812
   1.        ]
 [-0.66165665 -0.71219192  0.00764453 ...  0.83774863 -0.68965812
   1.        ]
 ...
 [-0.18862202 -0.2218518  -0.00904959 ... -1.56994907 -0.08705603
   1.        ]
 [-2.08076053  0.28759248 -0.00247749 ...  0.83774863 -0.68965812
   1.        ]
 [-0.18862202 -0.82681689  0.04367671 ... -0.92789635  0.41511238
   1.        ]]
(59,)
[ 1.89962795e+02  2.79654286e+02  1.40310024e+04 -7.76117453e+03
 -5.35589085e+03  3.00762239e+02 -2.22221887e+02  9.88583127e+01
  2.31870235e+01 -4.95430281e+02  9.44888758e+01 -2.35128600e+02
 -4.51564914e+02 -2.92529788e+02 -1.41611488e+02 -2.14064273e+02
 -1.97638480e+02  1.53805975e+02  3.36320289e+02 -2.15280309e+02
 -1.19857701e+02 -1.10524449e+02 -9.70837737e+01 -4.16232920e+02
 -1.23578078e+03  2.19137426e+03  5.16251880e+02  2.36391271e+02
 -1.39912987e+02  1.38961826e+02 -6

In [32]:
reg = LinearRegression().fit(data, target)
print(reg.score(data, target))
print(reg.coef_)
print(reg.intercept_)

0.02310029913793976
[ 1.89885342e+02  2.75578056e+02  1.40646547e+04 -8.08461306e+03
 -5.41118165e+03  3.01387813e+02 -2.22590892e+02  9.79674269e+01
  2.30460863e+01 -4.96275273e+02  9.53800618e+01 -2.31034113e+02
 -4.48680984e+02 -2.93608624e+02 -1.44016900e+02 -2.12867014e+02
 -1.97321928e+02  1.55145966e+02  3.33133295e+02 -2.11385383e+02
 -1.17317254e+02 -1.08609674e+02 -9.74203267e+01 -4.13097209e+02
 -1.23548504e+03  2.18920527e+03  5.18631343e+02  2.32458455e+02
 -1.37913286e+02 -4.76350833e+15 -4.96165022e+15 -4.97326088e+15
 -4.92955842e+15 -4.47056864e+15 -1.13935991e+16 -1.19890564e+16
  1.16538264e+16  2.82459375e+04  2.34067500e+04  2.99490625e+04
  3.15763750e+04  3.09303750e+04  2.89500000e+02  6.22500000e+01
 -2.26875000e+02  2.12500000e+01  4.18750000e+02  3.29062500e+02
 -1.68812500e+02 -1.45875000e+02  8.45000000e+01 -2.18375000e+02
  3.90000000e+01 -1.90000000e+01 -3.48125000e+01  5.75000000e+01
  1.17750000e+02  1.36250000e+02]
3403.895166794159


##Cross Validation

###Calculate RMS

In [33]:
def RME(x_test, y_test, params):
  preds = []
  data_size = x_test.shape[0]
  for i in range(data_size):
    pred = np.dot(x_test[i], params[:58]) + params[58]
    preds.append(pred)
  return math.sqrt((np.sum(np.square(np.subtract(y_test, preds))))/data_size)

print(RME(data,target,params))
coef = np.ndarray.tolist(reg.coef_)
coef.append(reg.intercept_)
print(RME(data,target,coef))

11491.736924948345
11491.750929050146


###Cross Validation Folds

In [34]:
#5 folds, indexes: [0-7928, 7929-15857, 15858-23786, 23787-31715, 31716-39643]
#fold 1
x_train1 = A[7929:,:]
x_test1 = data[:7979,:]
y_train1 = target[7929:]
y_test1 = target[:7979]
params1 = np.linalg.lstsq(x_train1, y_train1, rcond=None)[0]
print(RME(x_test1, y_test1, params1))

#fold 2
x_train2 = np.vstack((A[:7929,:], A[15858:,:]))
x_test2 = data[7929:15858,:]
y_train2 = np.concatenate((target[:7929], target[15858:]))
y_test2 = target[7929:15858]
params2 = np.linalg.lstsq(x_train2, y_train2, rcond=None)[0]
print(RME(x_test2, y_test2, params2))

#fold 3
x_train3 = np.vstack((A[:15858,:], A[23787:,:]))
x_test3 = data[15858:23787,:]
y_train3 = np.concatenate((target[:15858], target[23787:]))
y_test3 = target[15858:23787]
params3 = np.linalg.lstsq(x_train3, y_train3, rcond=None)[0]
print(RME(x_test3, y_test3, params3))

#fold 4
x_train4 = np.vstack((A[:23787,:], A[31716:,:]))
x_test4 = data[23787:31716,:]
y_train4 = np.concatenate((target[:23787], target[31716:]))
y_test4 = target[23787:31716]
params4 = np.linalg.lstsq(x_train4, y_train4, rcond=None)[0]
print(RME(x_test4, y_test4, params4))

#fold 5
x_train5 = A[:31716,:]
x_test5 = data[31716:,:]
y_train5 = target[:31716]
y_test5 = target[31716:]
params5 = np.linalg.lstsq(x_train5, y_train5, rcond=None)[0]
print(RME(x_test5, y_test5, params5))

params_mean = np.zeros(59)
for i in range(59):
  params_mean[i] = (params1[i] + params2[i] + params3[i] + params4[i] + params5[i])/5
print(params_mean)
print(RME(data, target, params_mean))

12953.320171324256
12239.76141359509
15904.998478589163
6448.062377989548
8065.5797471211845
[ 1.93183271e+02  2.79467470e+02  1.39594358e+04 -7.75906090e+03
 -5.37010981e+03  2.99101043e+02 -2.24465857e+02  9.88806733e+01
  2.28850195e+01 -4.90113394e+02  9.42644785e+01 -2.34176560e+02
 -4.46304487e+02 -2.92432699e+02 -1.41167300e+02 -2.10694411e+02
 -1.96441232e+02  5.14385344e+02  3.21578383e+02 -2.08153748e+02
 -1.21176208e+02 -3.60169696e+01 -9.01767456e+01 -4.13899817e+02
 -1.23643857e+03  2.18716540e+03  5.47126462e+02  2.60357990e+02
 -1.74665573e+02  1.38838178e+02 -6.55808564e+01 -3.95252998e-01
 -6.91592810e+01 -4.85235654e+01  8.37596961e+01 -1.02961957e+01
  5.20886382e+01  4.46662382e+04  3.71285319e+04  4.75702330e+04
  5.00068734e+04  4.89914829e+04  2.87023955e+02  6.52627604e+01
 -2.35642812e+02  2.39569710e+01  4.07141766e+02  3.13425982e+02
 -1.70016049e+02 -1.38700370e+02  8.49504088e+01 -2.16056166e+02
  3.49133616e+01 -1.81779533e+01 -3.25026155e+01  5.59411916e+

#PART B:
Perform feature engineering. Come up with more interesting feature mappings or basis functions
for the linear least-squares data fitting or the linear least-squares classifier. For example, try out
a stratified model (Chapter 13.3.2) using the results of the k-means clustering you did earlier.
Choose the best model using cross-validation and report the RMS error. Make sure to save the
model parameters for each fold of the cross-validation. If you are doing classification, report the
confusion matrix for the best model you found.

##Product Interactions

###Determining 5 least significant factors

In [35]:
params_abs = abs(np.array(params[:58]))
indexes = list(range(0,58))
stack = np.vstack((indexes, params_abs))
stack = stack.T
sorted = stack[stack[:,1].argsort()]
print(sorted[:5])
worst_indexes = [31, 35, 53, 45, 8]


[[31.          0.435544  ]
 [35.         10.36733501]
 [53.         17.43081268]
 [45.         22.70689072]
 [ 8.         23.18702348]]


#Performing Feature Engineering

In [36]:
B=data.T
print(np.shape(B))
i = 0 #
j = 0
for i in range(5):
  bad_index = worst_indexes[i]
  for j in range(58):
    feature_added = []
    if i==j:
      continue
    else:
      for k in range(39644):
        feature_added.append(B[bad_index,k]*B[j,k])
      B = np.vstack((B,feature_added))
newdata = B.T
B = np.vstack((B,np.ones(39644)))
B = B.T
print(np.shape(newdata))
print(np.shape(B))
print("done")

(58, 39644)
(39644, 343)
(39644, 344)
done


In [37]:
def newRME(x_test, y_test, params):
  preds = []
  data_size = x_test.shape[0]
  for i in range(data_size):
    pred = np.dot(x_test[i], params[:343]) + params[343]
    preds.append(pred)
  return math.sqrt((np.sum(np.square(np.subtract(y_test, preds))))/data_size)

In [38]:
newparams = np.linalg.lstsq(B, target, rcond=None)[0]
print(newRME(newdata,target,newparams))

#5 folds, indexes: [0-7928, 7929-15857, 15858-23786, 23787-31715, 31716-39643]
#fold 1
newx_train1 = B[7929:,:]
newx_test1 = newdata[:7979,:]
newy_train1 = target[7929:]
newy_test1 = target[:7979]
newparams1 = np.linalg.lstsq(newx_train1, newy_train1, rcond=None)[0]
print(newRME(newx_test1, newy_test1, newparams1))


#fold 2
newx_train2 = np.vstack((B[:7929,:], B[15858:,:]))
newx_test2 = newdata[7929:15858,:]
newy_train2 = np.concatenate((target[:7929], target[15858:]))
newy_test2 = target[7929:15858]
newparams2 = np.linalg.lstsq(newx_train2, newy_train2, rcond=None)[0]
print(newRME(newx_test2, newy_test2, newparams2))


#fold 3
newx_train3 = np.vstack((B[:15858,:], B[23787:,:]))
newx_test3 = newdata[15858:23787,:]
newy_train3 = np.concatenate((target[:15858], target[23787:]))
newy_test3 = target[15858:23787]
newparams3 = np.linalg.lstsq(newx_train3, newy_train3, rcond=None)[0]
print(newRME(newx_test3, newy_test3, newparams3))


#fold 4
newx_train4 = np.vstack((B[:23787,:], B[31716:,:]))
newx_test4 = newdata[23787:31716,:]
newy_train4 = np.concatenate((target[:23787], target[31716:]))
newy_test4 = target[23787:31716]
newparams4 = np.linalg.lstsq(newx_train4, newy_train4, rcond=None)[0]
print(newRME(newx_test4, newy_test4, newparams4))


#fold 5
newx_train5 = B[:31716,:]
newx_test5 = newdata[31716:,:]
newy_train5 = target[:31716]
newy_test5 = target[31716:]
newparams5 = np.linalg.lstsq(newx_train5, newy_train5, rcond=None)[0]
print(newRME(newx_test5, newy_test5, newparams5))


newparams_mean = np.zeros(344)
for i in range(344):
 newparams_mean[i] = (newparams1[i] + newparams2[i] + newparams3[i] + newparams4[i] + newparams5[i])/5
print(newparams_mean)
print(newRME(newdata, target, newparams_mean))

11428.975821063592
15793.41339679203
12375.805800802822
15960.51636812314
108183.84277710892
8171.10214671604
[ 1.95258919e+02  9.33575939e+01  2.92549892e+03 -2.61191776e+11
 -1.30584334e+03  3.17093135e+02 -2.16700280e+02  1.43966068e+02
  8.15521482e+02 -2.07730545e+02  6.67422164e+01 -2.08774264e+02
 -4.63958289e+02 -2.31174672e+02 -1.82990823e+02 -2.34236361e+02
 -2.29608217e+02  5.36653855e+02  2.82626163e+02 -1.62371757e+02
 -1.20223987e+02  2.40165954e+01 -1.46354399e+02 -3.92366686e+02
 -1.14247042e+03  2.07478506e+03  5.53039468e+02  1.52651052e+02
 -5.50406807e+01  1.43496747e+08  1.50794632e+08 -5.58749238e+08
  1.49580400e+08  1.33207316e+08  1.24378808e+08 -1.13726259e+08
  3.36271461e+06  9.88050001e+05  8.25290329e+05  1.05969214e+06
  1.10893798e+06  1.08644371e+06  1.88532864e+02  7.28684967e+01
 -4.15269039e+02  3.03490374e+08  1.31805422e+09  1.08209789e+09
 -1.53947538e+02 -1.44153398e+02  9.51504960e+01 -9.13939083e+01
 -9.60573555e+01 -2.81702082e+08 -2.98531700e

In [41]:
#regularizing linear model
test_set = A[:19822]
training_set = A[19822:]

test_outcome = target[:19822]
training_outcome = target[19822:]


reg_param_values = [.001,1,10,20,100,200,300,400,600,1000,1200,1300,1500,1800,2000,2300,2600,3000,4000,5000,6000,7000,8000,9000,10000]
param_values_list = np.array([])
fit_RMS = []
param_norms = []



for reg_param in reg_param_values:
    dim = A.shape[1]
    ident = np.identity(dim)
    ident *= reg_param
    ident[dim-1][dim-1] = 0
    mat = np.vstack((training_set, ident))

    

    outcome = np.concatenate((training_outcome, np.zeros(dim)))
    params = np.linalg.lstsq(mat, outcome, rcond=None)[0]
    param_values_list = np.append(param_values_list, params)
    param_norm = np.linalg.norm(np.delete(params,dim-1)) 
    param_norms = np.append(param_norms, param_norm)


    preds = []
    for i in range(test_set.shape[0]):
        pred = np.dot(test_set[i],params) 
        preds.append(pred)

    dif = preds - test_outcome
    norm_ = np.linalg.norm(dif)
    fit_RMS.append(norm_/np.sqrt(dif.shape[0]))
    
lowest_RMS = np.argmin(fit_RMS)

print(reg_param_values[lowest_RMS])
print(fit_RMS[lowest_RMS])
print(param_norms[lowest_RMS])
print(param_values_list[lowest_RMS])




20
13821.7649707273
2386.922700781445
-626.5310495963595


In [42]:
#regularizing linear model
test_set = B[:19822]
training_set = B[19822:]

test_outcome = target[:19822]
training_outcome = target[19822:]


reg_param_values = [.001,1,10,20,100,200,300,400,600,1000,1200,1300,1500,1800,2000,2300,2600,3000,4000,5000,6000,7000,8000,9000,10000]
param_values_list = np.array([])
fit_RMS = []
param_norm = []



for reg_param in reg_param_values:
    dim = B.shape[1]
    ident = np.identity(dim)
    ident *= reg_param
    ident[dim-1][dim-1] = 0
    mat = np.vstack((training_set, ident))

    

    outcome = np.concatenate((training_outcome, np.zeros(dim)))
    params = np.linalg.lstsq(mat, outcome, rcond=None)[0]
    param_values_list = np.concatenate((param_values_list, params))
    param_norm = np.linalg.norm(np.delete(params,dim-1)) 
    param_norms = np.append(param_norms, param_norm)


    preds = []
    for i in range(test_set.shape[0]):
        pred = np.dot(test_set[i],params) 
        preds.append(pred)

    dif = preds - test_outcome
    norm_ = np.linalg.norm(dif)
    fit_RMS.append(norm_/np.sqrt(dif.shape[0]))
    
lowest_RMS = np.argmin(fit_RMS)

print(reg_param_values[lowest_RMS])
print(fit_RMS[lowest_RMS])
print(param_norms[lowest_RMS])
print(param_values_list[lowest_RMS])



100
13834.784062169654
847.583449965849
-19853.203659197385
