In [30]:
import pandas as pd
import numpy as np

In [85]:
data_raw = pd.read_csv("fish.csv")
data = np.array(data_raw)
data = np.array(data[:, 1:], dtype=np.float64) # Exclude species dimension
np.random.shuffle(data)

training_data = data[:int(len(data) * 0.8)]
test_data = data[-int(len(data) * 0.2):]

X,y = training_data[:, 1:], training_data[:,0]
X_test, y_test = test_data[:, 1:], test_data[:,0]

In [86]:
class LinearRegression():
    """
    Linear regression implementation.
    """

    def __init__(self):
        
        pass
            
    def fit(self, X, y):
        """
        Fits the linear regression model.

        Parameters
        ----------
        X : Array of shape [n_samples, n_features]
        y : Array of shape [n_samples, 1]
        """        

        # make sure that we have multidimensional numpy arrays
        X = np.array(X).reshape((X.shape[0], -1))
        # IMPORTANT: Make sure that we have a column vector! 
        y = np.array(y).reshape((len(y), 1))
        
        # prepend a column of ones
        ones = np.ones((X.shape[0], 1))
        # augmented data matrix!
        X = np.concatenate((ones, X), axis=1)
        X.astype(np.float64)
        y.astype(np.float64)
        # compute weights
        XTX = np.dot(X.T, X)
        XTy = np.dot(X.T, y)

        # Option 1
        # matrix multiplication in numpy: numpy.dot(X.T, X) 
        XTXinv = np.linalg.inv(XTX)
        self._w = np.dot(XTXinv, XTy)
        
        # Option 2
        #self._w = np.linalg.solve(XTX, XTy)
                
        # Option 3
        #XTXpinv = numpy.linalg.pinv(XTX)
        #self._w = numpy.dot(XTXpinv, XTy)
        
        # Option 4 
        # ...
                
    def predict(self, X):
        """
        Computes predictions for a new set of points.

        Parameters
        ----------
        X : Array of shape [n_samples, n_features]

        Returns
        -------
        predictions : Array of shape [n_samples, 1]
        """                     

        # make sure that we have multidimensional numpy arrays
        X = np.array(X).reshape((X.shape[0], -1))

        # prepend a column of ones
        ones = np.ones((X.shape[0], 1))
        X = np.concatenate((ones, X), axis=1)           

        # compute predictions
        predictions = np.dot(X, self._w)

        return predictions

In [87]:
model = LinearRegression()
model.fit(X, y)

In [88]:
predictions_train = model.predict(X)
predictions_test = model.predict(X_test)

array([489509.40310087])

In [102]:
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test, predictions_test)
MSE

15790.625906479747

In [140]:
data_raw2 = data_raw
data2 = np.array(data_raw2)
mapping = {'Bream' : 0, 'Roach' : 1, 'Whitefish': 2, 'Parkki': 3, 'Perch': 4, 'Pike': 5, 'Smelt': 6}
for row in data2:
    row[0] = mapping[row[0]]
data2 = np.array(data2, dtype=np.float64)
np.random.shuffle(data2)

In [141]:
training_data2 = data2[:int(len(data2) * 0.8)]
test_data2 = data2[-int(len(data2) * 0.2):]

X2,y2 = training_data2[:,0:], training_data2[:, 2]
X_test2, y_test2 = test_data2[:, 0:], test_data2[:,2]

In [142]:
model = LinearRegression()
model.fit(X2, y2)

In [143]:
predictions_train2 = model.predict(X2)
predictions_test2 = model.predict(X_test2)
MSE2 = mean_squared_error(y_test2, predictions_test2)
MSE2

2.593282478057522e-20