In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=2)

### Projection onto a column space
- 서로 다른 그룹(그룹1, 그룹2)의 데이터로 matrix A, B를 만든다.
- matrix A, B의 column space에 대한 projection을 수행할 수 있는 matrix를 각각 만든다.
- 임의의 데이터를 C(A), C(B)로 projection을 진행해보고, projection의 결과 및 error vector를 살펴본다. 
- Orthogonal projection을 통해 임의의 데이터가 어떠한 column space에 보다 가까운 지를 이해한다. 

In [2]:
df = pd.read_csv('hufs_sleep_anon.csv')
df = df.iloc[:,[2,3,4,5,6,7,8]]
df

Unnamed: 0,식사,평균수면시간,평균취침시각,평균기상시각,평균깨는횟수,평균꿈빈도,이름
0,3,6,26,8,3,1,A01
1,2,6,25,9,4,1,A02
2,2,6,25,8,1,1,A03
3,2,6,27,9,4,1,A04
4,3,6,25,9,1,1,A05
5,1,6,26,8,2,1,A06
6,3,6,27,10,4,1,A07
7,3,7,25,8,2,1,A08
8,3,7,26,9,1,1,A09
9,2,7,26,9,2,1,A10


In [3]:
#G1 = df.sample(5)
G1 = df.iloc[:5,:]
G1

Unnamed: 0,식사,평균수면시간,평균취침시각,평균기상시각,평균깨는횟수,평균꿈빈도,이름
0,3,6,26,8,3,1,A01
1,2,6,25,9,4,1,A02
2,2,6,25,8,1,1,A03
3,2,6,27,9,4,1,A04
4,3,6,25,9,1,1,A05


In [4]:
#G2 = df.sample(5)
G2 = df.iloc[-5:,:]
G2

Unnamed: 0,식사,평균수면시간,평균취침시각,평균기상시각,평균깨는횟수,평균꿈빈도,이름
43,2,6,26,8,3,5,A44
44,2,6,26,10,3,5,A45
45,2,6,27,9,2,5,A46
46,2,7,25,9,3,5,A47
47,2,8,27,10,1,5,A48


In [5]:
A=G1.iloc[:,:-1].to_numpy().T
A_name = G1.iloc[:,-1].to_numpy().T
print(A.shape)
print(A_name)
A

(6, 5)
['A01' 'A02' 'A03' 'A04' 'A05']


array([[ 3,  2,  2,  2,  3],
       [ 6,  6,  6,  6,  6],
       [26, 25, 25, 27, 25],
       [ 8,  9,  8,  9,  9],
       [ 3,  4,  1,  4,  1],
       [ 1,  1,  1,  1,  1]])

In [6]:
B = G2.iloc[:,:-1].to_numpy().T
B_name = G2.iloc[:,-1].to_numpy().T
print(B.shape)
print(B_name)
B

(6, 5)
['A44' 'A45' 'A46' 'A47' 'A48']


array([[ 2,  2,  2,  2,  2],
       [ 6,  6,  6,  7,  8],
       [26, 26, 27, 25, 27],
       [ 8, 10,  9,  9, 10],
       [ 3,  3,  2,  3,  1],
       [ 5,  5,  5,  5,  5]])

In [7]:
print(f"rank of A: {np.linalg.matrix_rank(A)}")
print(f"rank of B: {np.linalg.matrix_rank(B)}")

rank of A: 5
rank of B: 5


In [8]:
b = A[:,1]

xa = (np.dot(np.dot(np.linalg.inv(np.dot(A.T,A)),A.T),b))
print(A_name)
print(xa)

xb = (np.dot(np.dot(np.linalg.inv(np.dot(B.T,B)),B.T),b))
print(B_name)
print(xb)


['A01' 'A02' 'A03' 'A04' 'A05']
[-2.89e-13  1.00e+00  7.34e-13 -1.40e-12 -5.85e-13]
['A44' 'A45' 'A46' 'A47' 'A48']
[ 44.79  26.86 -51.34 -44.14  24.14]


In [9]:

Pa = np.dot(A,np.dot(np.linalg.inv(np.dot(A.T,A)),A.T))
Pb = np.dot(B,np.dot(np.linalg.inv(np.dot(B.T,B)),B.T))

pa = (np.dot(Pa,b))
pb = (np.dot(Pb,b))

pa_norm = np.linalg.norm(pa)
pb_norm = np.linalg.norm(pb)

ea = b - pa
eb = b - pb

ea_norm = np.linalg.norm(b - pa)
eb_norm = np.linalg.norm(b - pb)

print(f"vector b: {b}")
print(f"vector b's projection onto C(A) : {pa}, norm of projection: {pa_norm:.2f}, norm of error : {ea_norm:.2f} ")
print(f"vector b's projection onto C(B) : {pb}, norm of projection: {pb_norm:.2f}, norm of error : {eb_norm:.2f} ")

vector b: [ 2  6 25  9  4  1]
vector b's projection onto C(A) : [ 2.  6. 25.  9.  4.  1.], norm of projection: 27.62, norm of error : 0.00 
vector b's projection onto C(B) : [ 0.62  6.   25.    9.    4.    1.55], norm of projection: 27.58, norm of error : 1.49 


In [10]:
G3 = df.sample(4)
#G3 = df.iloc[[10,20,30,40],:]
G3

Unnamed: 0,식사,평균수면시간,평균취침시각,평균기상시각,평균깨는횟수,평균꿈빈도,이름
43,2,6,26,8,3,5,A44
47,2,8,27,10,1,5,A48
36,2,7,27,10,2,3,A37
3,2,6,27,9,4,1,A04


In [11]:
C = G3.iloc[:,:-1].to_numpy().T
C_name = G3.iloc[:,-1].to_numpy().T
print(C.shape)
print(C_name)
C

(6, 4)
['A44' 'A48' 'A37' 'A04']


array([[ 2,  2,  2,  2],
       [ 6,  8,  7,  6],
       [26, 27, 27, 27],
       [ 8, 10, 10,  9],
       [ 3,  1,  2,  4],
       [ 5,  5,  3,  1]])

In [12]:
for n in range(len(C_name)):
    
    print(C_name[n])
    b = C[:,n]

    pa = (np.dot(Pa,b))
    pb = (np.dot(Pb,b))

    pa_norm = np.linalg.norm(pa)
    pb_norm = np.linalg.norm(pb)

    ea = b - pa
    eb = b - pb

    ea_norm = np.linalg.norm(b - pa)
    eb_norm = np.linalg.norm(b - pb)

    print(f"vector b: {b}")
    print(f"vector b's projection onto C(A) : {pa}, norm of projection: {pa_norm:.2f}, norm of error : {ea_norm:.2f} ")
    print(f"vector b's projection onto C(B) : {pb}, norm of projection: {pb_norm:.2f}, norm of error : {eb_norm:.2f} ")
    if ea_norm<eb_norm:
        print(f"{C_name[n]}은 group 1과 조금 더 가깝습니다.")
    else:
        print(f"{C_name[n]}은 group 2와 좀 더 가깝습니다.")
    print("\n")



A44
vector b: [ 2  6 26  8  3  5]
vector b's projection onto C(A) : [ 2.    6.65 26.    8.    3.    1.11], norm of projection: 28.26, norm of error : 3.95 
vector b's projection onto C(B) : [ 2.  6. 26.  8.  3.  5.], norm of projection: 28.53, norm of error : 0.00 
A44은 group 2와 좀 더 가깝습니다.


A48
vector b: [ 2  8 27 10  1  5]
vector b's projection onto C(A) : [ 2.    8.59 27.   10.    1.    1.43], norm of projection: 30.16, norm of error : 3.62 
vector b's projection onto C(B) : [ 2.  8. 27. 10.  1.  5.], norm of projection: 30.38, norm of error : 0.00 
A48은 group 2와 좀 더 가깝습니다.


A37
vector b: [ 2  7 27 10  2  3]
vector b's projection onto C(A) : [ 2.    7.3  27.   10.    2.    1.22], norm of projection: 29.86, norm of error : 1.81 
vector b's projection onto C(B) : [ 1.31  7.   27.   10.    2.    3.28], norm of projection: 29.91, norm of error : 0.74 
A37은 group 2와 좀 더 가깝습니다.


A04
vector b: [ 2  6 27  9  4  1]
vector b's projection onto C(A) : [ 2.  6. 27.  9.  4.  1.], norm of projec