In [19]:
#import stuff here
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [20]:
#The file "Stand_Data.csv" generated from "Standardization.ipynb" is required
file_dir = "Stand_Data.csv"
data = np.loadtxt(file_dir, delimiter=",")
print(np.shape(data))

(39644, 58)


# PART A:
Start with a basic linear model (the features are only standardized and are used as they are
without any transformations) regardless if you are doing the least-squares data fitting (Chapter
13) or the least-squares classification (Chapter 14) task. Evaluate the initial model using cross-
validation and report the RMS error. Make sure to save the model parameters for each fold of
the cross-validation.

In [21]:
#The file 'OnlineNewsPopularity.csv' from https://archive.ics.uci.edu/dataset/332/online+news+popularity is required
file_dir2 = 'OnlineNewsPopularity.csv'
columns = [60]
target = np.loadtxt(file_dir2, delimiter = ',', skiprows=1, usecols = columns)
print(np.shape(target))

(39644,)


##Linear Model

In [129]:
#Add a column of 1's to A
A = np.vstack((A.T, np.ones(39644))).T
print(np.shape(A))

#solve least squares problem
params = np.linalg.lstsq(A, target, rcond=None)[0]

(39644, 62)


In [23]:
reg = LinearRegression().fit(data, target)
print(reg.score(data, target))
print(reg.coef_)
print(reg.intercept_)

0.023098604445738635
[ 1.89845866e+02  2.79199628e+02  1.40354092e+04 -7.74717824e+03
 -5.36557893e+03  3.01105309e+02 -2.22768900e+02  9.90186708e+01
  2.31069703e+01 -4.96068872e+02  9.44345146e+01 -2.35605401e+02
 -4.51192773e+02 -2.92704010e+02 -1.41723834e+02 -2.14108343e+02
 -1.97433062e+02  1.53620715e+02  3.36329426e+02 -2.15662788e+02
 -1.20042659e+02 -1.10894755e+02 -9.74003017e+01 -4.15916996e+02
 -1.23586038e+03  2.19189149e+03  5.16418761e+02  2.36146814e+02
 -1.39797971e+02  8.72869378e+14  9.09177069e+14  9.11304618e+14
  9.03296539e+14  8.19190857e+14 -5.00166667e+14 -5.26306598e+14
  1.48772302e+15  4.67859219e+04  3.88980000e+04  4.98392812e+04
  5.23865312e+04  5.13192188e+04  2.88812500e+02  6.42187500e+01
 -2.33359375e+02  2.15625000e+01  4.02687500e+02  3.12796875e+02
 -1.67828125e+02 -1.40914062e+02  8.65000000e+01 -2.19671875e+02
  3.74375000e+01 -1.76250000e+01 -3.22343750e+01  5.61250000e+01
  1.21375000e+02  1.38031250e+02]
3397.329624130047


##Cross Validation

###Calculate RMS

In [24]:
def RME(x_test, y_test, params):
  preds = []
  data_size = x_test.shape[0]
  for i in range(data_size):
    pred = np.dot(x_test[i], params[:58]) + params[58]
    preds.append(pred)
  return math.sqrt((np.sum(np.square(np.subtract(y_test, preds))))/data_size)

print(RME(data,target,params))
coef = np.ndarray.tolist(reg.coef_)
coef.append(reg.intercept_)
print(RME(data,target,coef))

11491.736924948345
11491.738500150483


###Cross Validation Folds

In [45]:
#5 folds, indexes: [0-7928, 7929-15857, 15858-23786, 23787-31715, 31716-39643]
#fold 1
x_train1 = A[7929:,:]
x_test1 = data[:7979,:]
y_train1 = target[7929:]
y_test1 = target[:7979]
params1 = np.linalg.lstsq(x_train1, y_train1, rcond=None)[0]
print(RME(x_test1, y_test1, params1))

#fold 2
x_train2 = np.vstack((A[:7929,:], A[15858:,:]))
x_test2 = data[7929:15858,:]
y_train2 = np.concatenate((target[:7929], target[15858:]))
y_test2 = target[7929:15858]
params2 = np.linalg.lstsq(x_train2, y_train2, rcond=None)[0]
print(RME(x_test2, y_test2, params2))

#fold 3
x_train3 = np.vstack((A[:15858,:], A[23787:,:]))
x_test3 = data[15858:23787,:]
y_train3 = np.concatenate((target[:15858], target[23787:]))
y_test3 = target[15858:23787]
params3 = np.linalg.lstsq(x_train3, y_train3, rcond=None)[0]
print(RME(x_test3, y_test3, params3))

#fold 4
x_train4 = np.vstack((A[:23787,:], A[31716:,:]))
x_test4 = data[23787:31716,:]
y_train4 = np.concatenate((target[:23787], target[31716:]))
y_test4 = target[23787:31716]
params4 = np.linalg.lstsq(x_train4, y_train4, rcond=None)[0]
print(RME(x_test4, y_test4, params4))

#fold 5
x_train5 = A[:31716,:]
x_test5 = data[31716:,:]
y_train5 = target[:31716]
y_test5 = target[31716:]
params5 = np.linalg.lstsq(x_train5, y_train5, rcond=None)[0]
print(RME(x_test5, y_test5, params5))

params_mean = np.zeros(59)
for i in range(59):
  params_mean[i] = (params1[i] + params2[i] + params3[i] + params4[i] + params5[i])/5
print(params_mean)
print(RME(data, target, params_mean))

12953.320171324669
12239.761413595115
15904.998478589145
6448.062377989492
8065.5797471211845
[ 1.93183271e+02  2.79467470e+02  1.39594358e+04 -7.75906090e+03
 -5.37010981e+03  2.99101043e+02 -2.24465857e+02  9.88806733e+01
  2.28850195e+01 -4.90113394e+02  9.42644785e+01 -2.34176560e+02
 -4.46304487e+02 -2.92432699e+02 -1.41167300e+02 -2.10694411e+02
 -1.96441232e+02  5.14385344e+02  3.21578383e+02 -2.08153748e+02
 -1.21176208e+02 -3.60169696e+01 -9.01767456e+01 -4.13899817e+02
 -1.23643857e+03  2.18716540e+03  5.47126462e+02  2.60357990e+02
 -1.74665573e+02  1.38838178e+02 -6.55808563e+01 -3.95252864e-01
 -6.91592809e+01 -4.85235653e+01  8.37596964e+01 -1.02961954e+01
  5.20886380e+01  4.46662382e+04  3.71285319e+04  4.75702330e+04
  5.00068734e+04  4.89914829e+04  2.87023955e+02  6.52627604e+01
 -2.35642812e+02  2.39569710e+01  4.07141766e+02  3.13425982e+02
 -1.70016049e+02 -1.38700370e+02  8.49504088e+01 -2.16056166e+02
  3.49133616e+01 -1.81779533e+01 -3.25026155e+01  5.59411916e

#PART B:
Perform feature engineering. Come up with more interesting feature mappings or basis functions
for the linear least-squares data fitting or the linear least-squares classifier. For example, try out
a stratified model (Chapter 13.3.2) using the results of the k-means clustering you did earlier.
Choose the best model using cross-validation and report the RMS error. Make sure to save the
model parameters for each fold of the cross-validation. If you are doing classification, report the
confusion matrix for the best model you found.

##Product Interactions

###Determining 5 least significant factors

In [109]:
params_abs = abs(np.array(params[:58]))
indexes = list(range(0,58))
stack = np.vstack((indexes, params_abs))
stack = stack.T
sorted = stack[stack[:,1].argsort()]
print(sorted[:5])
worst_indexes = [31, 35, 53, 45, 8]


[[31.          0.43554399]
 [35.         10.367335  ]
 [53.         17.43081268]
 [45.         22.70689072]
 [ 8.         23.18702348]]


#Performing Feature Engineering

In [131]:
B=data.T
print(np.shape(B))
i = 0 #
j = 0
for i in range(5):
  bad_index = worst_indexes[i]
  for j in range(58):
    feature_added = []
    if i==j:
      continue
    else:
      for k in range(39644):
        feature_added.append(B[bad_index,k]*B[j,k])
      B = np.vstack((B,feature_added))
newdata = B.T
B = np.vstack((B,np.ones(39644)))
B = B.T
print(np.shape(newdata))
print(np.shape(B))
print("done")

(58, 39644)
(39644, 343)
(39644, 344)
done


In [135]:
def newRME(x_test, y_test, params):
  preds = []
  data_size = x_test.shape[0]
  for i in range(data_size):
    pred = np.dot(x_test[i], params[:343]) + params[343]
    preds.append(pred)
  return math.sqrt((np.sum(np.square(np.subtract(y_test, preds))))/data_size)

In [139]:
newparams = np.linalg.lstsq(B, target, rcond=None)[0]
print(newRME(newdata,target,newparams))

#5 folds, indexes: [0-7928, 7929-15857, 15858-23786, 23787-31715, 31716-39643]
#fold 1
newx_train1 = B[7929:,:]
newx_test1 = newdata[:7979,:]
newy_train1 = target[7929:]
newy_test1 = target[:7979]
newparams1 = np.linalg.lstsq(newx_train1, newy_train1, rcond=None)[0]
print(newRME(newx_test1, newy_test1, newparams1))


#fold 2
newx_train2 = np.vstack((B[:7929,:], B[15858:,:]))
newx_test2 = newdata[7929:15858,:]
newy_train2 = np.concatenate((target[:7929], target[15858:]))
newy_test2 = target[7929:15858]
newparams2 = np.linalg.lstsq(newx_train2, newy_train2, rcond=None)[0]
print(newRME(newx_test2, newy_test2, newparams2))


#fold 3
newx_train3 = np.vstack((B[:15858,:], B[23787:,:]))
newx_test3 = newdata[15858:23787,:]
newy_train3 = np.concatenate((target[:15858], target[23787:]))
newy_test3 = target[15858:23787]
newparams3 = np.linalg.lstsq(newx_train3, newy_train3, rcond=None)[0]
print(newRME(newx_test3, newy_test3, newparams3))


#fold 4
newx_train4 = np.vstack((B[:23787,:], B[31716:,:]))
newx_test4 = newdata[23787:31716,:]
newy_train4 = np.concatenate((target[:23787], target[31716:]))
newy_test4 = target[23787:31716]
newparams4 = np.linalg.lstsq(newx_train4, newy_train4, rcond=None)[0]
print(newRME(newx_test4, newy_test4, newparams4))


#fold 5
newx_train5 = B[:31716,:]
newx_test5 = newdata[31716:,:]
newy_train5 = target[:31716]
newy_test5 = target[31716:]
newparams5 = np.linalg.lstsq(newx_train5, newy_train5, rcond=None)[0]
print(newRME(newx_test5, newy_test5, newparams5))


newparams_mean = np.zeros(344)
for i in range(344):
 newparams_mean[i] = (newparams1[i] + newparams2[i] + newparams3[i] + newparams4[i] + newparams5[i])/5
print(newparams_mean)
print(newRME(newdata, target, newparams_mean))

11428.975821072636
15793.413283047772
12375.805830382373
15960.516358128973
112515.07525027993
8171.10216403552
[ 1.95258984e+02  9.33575605e+01  2.92553128e+03 -2.61191699e+11
 -1.30587235e+03  3.17092988e+02 -2.16700019e+02  1.43966008e+02
  8.15112957e+02 -2.07730129e+02  6.67421104e+01 -2.08774217e+02
 -4.63958465e+02 -2.31174625e+02 -1.82990814e+02 -2.34236207e+02
 -2.29608133e+02  5.36653552e+02  2.82625995e+02 -1.62371625e+02
 -1.20224005e+02  2.40165811e+01 -1.46354478e+02 -3.92366746e+02
 -1.14247044e+03  2.07478522e+03  5.53039458e+02  1.52650982e+02
 -5.50407039e+01  1.43499009e+08  1.50804923e+08 -5.58756148e+08
  1.49576584e+08  1.33168943e+08  1.24382140e+08 -1.13732549e+08
  3.38004629e+06  1.00234327e+06  8.37231922e+05  1.07502737e+06
  1.12498227e+06  1.10216149e+06  1.88532895e+02  7.28684788e+01
 -4.15269033e+02  3.03487687e+08  1.31804194e+09  1.08208780e+09
 -1.53947421e+02 -1.44153373e+02  9.51504992e+01 -9.13938278e+01
 -9.60573496e+01 -2.81707123e+08 -2.9853145