In [2]:
from os.path import exists
import requests
from typing import *

import numpy as np
import pandas as pd
from tqdm.notebook import trange, tqdm

# Homework 1: Linear Algebra and Linear Regression

1. Answer each question.
2. Feel free to leave in testing code and other visualization code.

## Problem 1 (30 pts)

Determine if each column of a $(n x m)$ matrix $X$ is a basis for $R^n$ where $n \geq 1$ and $m \geq 1$.

Example 1:
$$
A = \begin{pmatrix}
0.5 & 0 \\
0 & -0.2 
\end{pmatrix}
$$

```
check_basis(A) = true
```

Example 2:
$$
B = \begin{pmatrix}
0.5 \\
0 
\end{pmatrix}
$$

```
check_basis(B) = false
```

Example 3:
$$
C = \begin{pmatrix}
0.5 & 0 \\
0.2 & -0.2 \\
0.3 & -0.4 
\end{pmatrix}
$$

```
check_basis(C) = false
```

Example 4:
$$
D = \begin{pmatrix}
0.5 & 0 & 1 \\
0.2 & -0.2 & 1 \\
\end{pmatrix}
$$

```
check_basis(D) = true
```

In [24]:
def check_basis(X: np.ndarray) -> bool:
    shape = X.shape
# if the number of columns is less than rows, we cannot have a spanning set.
# must have at least n vectors to span a R**n space
    if shape[1] < shape[0]:
        return False
    b = np.zeros(X.shape)
    for col in range(shape[1]-1):
        curcol = [X[:,col]]
        curcol = np.transpose(curcol)
#Test linear independece of each vector
        for nextcol in range(col+1, shape[1]):
            nextcol = [X[:, nextcol]]
            nextcol = np.transpose(nextcol)
            A = np.concatenate((curcol, nextcol), axis =1)
            # If any vector is not linearly independent, return false
            try:
                if (np.sum(np.linalg.solve(A, b)) != 0):
                    return False
            except:
                return False
    return True
A = np.array([[0.5, 0], [0, -0.2]])
print("A: ",check_basis(A))

B = np.array([[0.5],[0]])
print("B: ",check_basis(B))

C = np.array([[0.5, 0], [0.2, -0.2],[0.3,-0.4]])
print("C: ", check_basis(C))

D = np.array([[0.5, 0,1], [0.2, -0.2,1]])
print("D: ",check_basis(D))


# TEST
# E = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# print("E: ",check_basis(E))

            
        

A:  True
B:  False
C:  False
D:  True


In [3]:
a =np.array([[0.5,0,1],[0.2,-0.2,1]])
a

array([[ 0.5,  0. ,  1. ],
       [ 0.2, -0.2,  1. ]])

In [4]:
x = [a[:,2]]
y = [a[:,1]]
xt = np.transpose(x)
yt = np.transpose(y)
print(np.transpose(x))
print(np.transpose(y))


[[1.]
 [1.]]
[[ 0. ]
 [-0.2]]


In [5]:
z = np.concatenate((xt,yt),axis=1)
z

array([[ 1. ,  0. ],
       [ 1. , -0.2]])

## Problem 2 (30 pts)

We saw that we could solve
$$
Ax = b
$$
where $A$ is a $n \times n$ matrix, $x$ is a $n \times 1$ vector, and $b$ is a $n \times 1$ vector.

### Problem 2a (20 pts)

Write a function that solves the matrix equation for $n \times n$ $X$
$$
AX = B
$$
where $A$ is a $n \times n$ matrix and $B$ is a $n \times n$ matrix.

In [6]:
a = np.array([[2,0],[0,1]])
b = np.array([[4],[2]])
def solve_matrix(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    x = np.dot(np.linalg.inv(A), B)
    
    return x
    pass
solve_matrix(a,b)

array([[2.],
       [2.]])

### Problem 2b (10 pts)

Write a function that solves the matrix equation for $n \times n$ $X$
$$
A_1 A_2 X = B
$$
where $A_1$, $A_2$, and $B$ are $n \times n$ matrices.

In [7]:
def solve_matrix2(A1: np.ndarray, A2: np.ndarray, B: np.ndarray) -> np.ndarray:
    x = np.dot(np.dot(np.linalg.inv(A1), np.linalg.inv(A2)),B)
    return x
    pass
print(solve_matrix2(a,a,b))

[[1.]
 [2.]]


## Problem 3 (40 pts)

Now we'll get to practice using linear regression on a dataset.

In [8]:
if not exists('casp.csv'):
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00265/CASP.csv"
    response = requests.get(url, stream=True)

    with open("casp.csv", "wb") as f:
        for data in tqdm(response.iter_content()):
            f.write(data)

In [9]:
df = pd.read_csv('casp.csv')
df

Unnamed: 0,RMSD,F1,F2,F3,F4,F5,F6,F7,F8,F9
0,17.284,13558.30,4305.35,0.31754,162.1730,1.872791e+06,215.3590,4287.87,102,27.0302
1,6.021,6191.96,1623.16,0.26213,53.3894,8.034467e+05,87.2024,3328.91,39,38.5468
2,9.275,7725.98,1726.28,0.22343,67.2887,1.075648e+06,81.7913,2981.04,29,38.8119
3,15.851,8424.58,2368.25,0.28111,67.8325,1.210472e+06,109.4390,3248.22,70,39.0651
4,7.962,7460.84,1736.94,0.23280,52.4123,1.021020e+06,94.5234,2814.42,41,39.9147
...,...,...,...,...,...,...,...,...,...,...
45725,3.762,8037.12,2777.68,0.34560,64.3390,1.105797e+06,112.7460,3384.21,84,36.8036
45726,6.521,7978.76,2508.57,0.31440,75.8654,1.116725e+06,102.2770,3974.52,54,36.0470
45727,10.356,7726.65,2489.58,0.32220,70.9903,1.076560e+06,103.6780,3290.46,46,37.4718
45728,9.791,8878.93,3055.78,0.34416,94.0314,1.242266e+06,115.1950,3421.79,41,35.6045


### Problem 3a (10 pts)

Write a function that splits a data matrix X into two halves according to train which gives the percentage of the dataset to put in the first half. Each row of the data matrix corresponds to a point and each column corresponds to a feature.

1. A number of 50 means 50% in the first half and 50% in the second half.
2. A number of 20 means 20% in the first half and 80% in the second half.

In [10]:
a =np.array([[0.5,0,1],[0.2,-0.2,1],[0.5,0,1],[0.2,-0.2,1],[0.5,0,1],[0.2,-0.2,1],[0.5,0,1],[0.2,-0.2,1],[0.5,0,1],[0.2,-0.2,1]])

In [11]:
def split_dataset(X: np.ndarray, train: int) -> (np.ndarray, np.ndarray):
    if 0 > train or train > 100:
        raise ValueError("Train should be between 0 and 100")
    row = int((train * X.shape[0])/100)
    return ([X[0:row,:]], [X[row:,:]])
split_dataset(a, 30)

([array([[ 0.5,  0. ,  1. ],
         [ 0.2, -0.2,  1. ],
         [ 0.5,  0. ,  1. ]])],
 [array([[ 0.2, -0.2,  1. ],
         [ 0.5,  0. ,  1. ],
         [ 0.2, -0.2,  1. ],
         [ 0.5,  0. ,  1. ],
         [ 0.2, -0.2,  1. ],
         [ 0.5,  0. ,  1. ],
         [ 0.2, -0.2,  1. ]])])

### Problem 3b (10 pts)

Write a function that takes a Pandas dataframe `df`, a list of features `feature_keys` which are columns in a Pandas dataframe, and a Pandas key `y_key` that is a column corresponding to the regression value, and produces the weights of the linear regression.
1. The constant weight should be the $0$-th index.
2. The weights should be given in the order of the features.

In [12]:
A = np.concatenate([np.array([[1],[2],[3]]), np.ones((3,1))],axis=1)
A

array([[1., 1.],
       [2., 1.],
       [3., 1.]])

In [13]:
X= np.array([[1,2,3],[4,5,6]])
A = np.concatenate([X,np.ones((X.shape[0],1))],axis=1)
A

array([[1., 2., 3., 1.],
       [4., 5., 6., 1.]])

In [14]:
# def fit_regression(X, y):
#     A = np.concatenate([X, np.ones((X.shape[0],1))], axis=1)
#     return np.linalg.solve(A.transpose() @ A, A.transpose() @ y)

In [24]:
def fit_regression(df: pd.DataFrame, feature_keys: List[str], y_key: str) -> np.ndarray:
    X = df[feature_keys]
    y = df[y_key]
    A = np.concatenate([X, np.ones((X.shape[0],1))], axis=1)
    return np.linalg.solve(A.transpose() @ A, A.transpose() @ y)

In [31]:
feature_keys = ["F1","F2"]
y_key = "F9"
print(fit_regression(df, ["F1","F2"], "F9"))

[-1.53485667e-03  6.45846770e-04  4.77263933e+01]


### Problem 3c (10 pts)

Write a prediction function that takes the weights of a linear regression, a data point, and produces a prediction.

In [26]:
np.array([[13558.3]])[0]

array([13558.3])

In [27]:
np.transpose(np.array([df["F1"]]))

array([[13558.3 ],
       [ 6191.96],
       [ 7725.98],
       ...,
       [ 7726.65],
       [ 8878.93],
       [12732.4 ]])

In [28]:
def prediction(weights: np.ndarray, x: np.ndarray) -> float:
    prediction = weights[0]
    weightIdx = 1
    for dataPoint in x:
        prediction += weights[weightIdx]
        weightIdx += 1
    return prediction
    pass
prediction(fit_regression(df, ["F1","F2"], "F9"), np.array([[13558.3],[122]]))

47.72550431847543

### Problem 3d (10 pts)

Write a function that computes the root mean square error (rmse) on a data matrix $X$ with rows $X_1, \dots X_n$, and corresponding values $y = (y_1 \dots y_n)$ given a prediction function $f$. The rmse is defined as
$$
\sqrt{\frac{1}{n}\sum_{i=1}^n \lVert f(X_i) - y_i \rVert^2}
$$

In [29]:
def rmse(prediction: Callable[[np.ndarray, np.ndarray], float], X: np.ndarray, y: np.ndarray) -> float:
#     fit_regression(df: pd.DataFrame, feature_keys: List[str], y_key: str) 
    A = np.concatenate([X, np.ones((X.shape[0],1))], axis=1)
    weights = np.linalg.solve(A.transpose() @ A, A.transpose() @ y)
    predictions = []
    n = y.shape[0]
    qsum = 0
    yIdx = 0
    for data in X.values:
        qsum += abs(prediction(weights,data) - y[yIdx])**2 / n
    return np.sqrt(qsum)

In [32]:
print(rmse(prediction, df[feature_keys], df[y_key]))

20.695304318482957
