In [1]:
# %load my_init.py
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.datasets import load_digits
from matplotlib import font_manager
from matplotlib import gridspec
from scipy import misc
from sklearn.datasets import fetch_olivetti_faces



# import warnings
# warnings.filterwarnings('ignore')

# Configure matplotlib to use the font stored in malgun.ttf
# (presumably so that Korean labels render correctly -- confirm).
font_fname = 'C:/Windows/Fonts/malgun.ttf'
try:
    font_family = font_manager.FontProperties(fname=font_fname).get_name()
except (OSError, RuntimeError):
    # The font file cannot be read (e.g. on a non-Windows machine): keep
    # matplotlib's default font family instead of aborting the setup cell.
    font_family = None

if font_family is not None:
    plt.rcParams["font.family"] = font_family
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False


%matplotlib inline

In [2]:
# NumPy's functions are much faster.
# Build the integers -4..4 and lay them out as a 3x3 matrix.
A = np.arange(-4, 5).reshape(3, 3)
A

array([[-4, -3, -2],
       [-1,  0,  1],
       [ 2,  3,  4]])

In [3]:
# norm: for a 2-D array with no extra arguments this is the Frobenius norm,
# i.e. the square ROOT of the sum of the squared elements (here sqrt(60)).
np.linalg.norm(A)

7.745966692414834

In [4]:
# Trace: the sum of the elements on the main diagonal.
np.trace(A)

0

In [5]:
# Trace of the 3x3 identity matrix: three ones on the diagonal, so 3.0.
np.trace(np.eye(3))

3.0

In [6]:
# 3x3 matrix holding 0..8. Its rows are linearly dependent, so the
# determinant is zero (the matrix is singular).
A = np.arange(0, 9).reshape(3, -1)
np.linalg.det(A)

0.0

In [7]:
# An invertible 3x3 matrix: print its determinant, then display the matrix.
A = np.array([
    [1, 1, 0],
    [0, 1, 1],
    [1, 1, 1],
])
print(np.linalg.det(A))
A

1.0


array([[1, 1, 0],
       [0, 1, 1],
       [1, 1, 1]])

In [8]:
# Inverse of A. (The name is spelled 'Ainiv'; a later cell uses the same
# spelling, so it must stay in sync if it is ever renamed.)
Ainiv = np.linalg.inv(A)
Ainiv

array([[ 0., -1.,  1.],
       [ 1.,  1., -1.],
       [-1.,  0.,  1.]])

In [9]:
# Right-hand side of the linear system, as a 3x1 column vector.
B = np.array([2, 2, 3]).reshape(-1, 1)
B

array([[2],
       [2],
       [3]])

In [10]:
# Solve A x = B by multiplying B with the inverse of A.
x = Ainiv @ B
x

array([[1.],
       [1.],
       [1.]])

In [11]:
# Verify the solution: A @ x - B is all zeros when x solves the system.
A@x - B

array([[0.],
       [0.],
       [0.]])

In [12]:
# lstsq: computes the least-squares (approximate) solution of A x = B.
# It returns (solution, sum of squared residuals, rank of A, singular values
# of A); only the solution is kept here.
# rcond=None is passed explicitly to select the current default cutoff and to
# avoid the FutureWarning that older NumPy versions emit when it is omitted.
x = np.linalg.lstsq(A, B, rcond=None)[0]
x

  x= np.linalg.lstsq(A, B)[0]    # X의 해(근사값), 잔차제곱합, 축의정보, 특이값


array([[1.],
       [1.],
       [1.]])

### 근사값을 구하는 이유: 미지수의 수보다 방정식이 더 많으면 일반적으로 정확한 해가 없기 때문이다.
### 이 경우 행렬 A는 정방행렬이 아니므로 역행렬을 구할 수 없다.
### 대신 전치행렬을 곱해 정방행렬 A.T @ A 를 만들고, 의사역행렬 (A.T @ A)^-1 @ A.T 를 사용한다.

In [13]:
# Non-square matrix (4 rows, 3 columns): the data used in the cells below.
A = np.array([
    [1, 1, 0],
    [0, 1, 1],
    [1, 1, 1],
    [1, 1, 2],
])
A

array([[1, 1, 0],
       [0, 1, 1],
       [1, 1, 1],
       [1, 1, 2]])

In [14]:
# Observed values as a 4x1 column vector.
b = np.array([2, 2, 3, 4.1]).reshape(-1, 1)
b

array([[2. ],
       [2. ],
       [3. ],
       [4.1]])

In [15]:
# Pseudo-inverse of A, computed as inv(A.T @ A) @ A.T.
# np.linalg.inv computes the matrix inverse; it is not the same operation as
# the transpose (.T). A.T @ A is square, so it can be passed to inv.
Apinv = np.linalg.inv(A.T @ A) @ A.T
Apinv

array([[ 0.33333333, -1.        ,  0.33333333,  0.33333333],
       [ 0.5       ,  1.        ,  0.        , -0.5       ],
       [-0.5       ,  0.        ,  0.        ,  0.5       ]])

In [16]:
# Approximate solution of A x = b obtained from the pseudo-inverse.
x = Apinv @ b
x

array([[1.03333333],
       [0.95      ],
       [1.05      ]])

In [17]:
# Compare the actual values b with the fitted values A @ x:
# there is a small error between them.
print(b)
A @ x

[[2. ]
 [2. ]
 [3. ]
 [4.1]]


array([[1.98333333],
       [2.        ],
       [3.03333333],
       [4.08333333]])

In [18]:
# lstsq returns: the (approximate) solution x, the sum of squared residuals,
# the rank of A, and the singular values of A.
# rcond=None is passed explicitly to select the current default cutoff and to
# avoid the FutureWarning that older NumPy versions emit when it is omitted.
x, resid, rank, s = np.linalg.lstsq(A, b, rcond=None)
x

  x, resid, rank, s = np.linalg.lstsq(A, b)


array([[1.03333333],
       [0.95      ],
       [1.05      ]])

In [19]:
# resid is the sum of squared residuals, i.e. the squared norm of the residual
# vector A @ x - b. It quantifies the error of the approximate solution, so
# both printed values should agree.
print(resid)
print(np.linalg.norm(A @ x - b) ** 2)

[0.00166667]
0.001666666666666655


## CRIM 범죄율
### per capita crime rate by town

ZN - proportion of residential land zoned for lots over 25,000 sq.ft.

INDUS - proportion of non-retail business acres per town.  

## CHAS 강변근처 
### Charles River dummy variable (1 if tract bounds river; 0 otherwise)

NOX - nitric oxides concentration (parts per 10 million)

## RM 주택의 방의 수
### average number of rooms per dwelling

## AGE 오래된 주택의 비율 (1940년 이전 건축)
### proportion of owner-occupied units built prior to 1940

DIS - weighted distances to five Boston employment centres

RAD - index of accessibility to radial highways

TAX - full-value property-tax rate per $10,000

PTRATIO - pupil-teacher ratio by town

B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town

LSTAT - % lower status of the population

MEDV - Median value of owner-occupied homes in $1000's

In [20]:
# Load the housing data from BostonHousing.csv (path is relative to the
# current working directory) and display it.
housing_df = pd.read_csv('BostonHousing.csv')
housing_df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
housing_df.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [22]:
# Variable overview: for every variable (including the response / dependent
# variable) show its name, non-null count and data type.
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


### 행렬*가중치 = 근사값 ~= 실제값  가중치는 학습을 통해 만든다
### Xw = y.hat ~= y
### 오차 = 실제값 - 근사값
### e = y - y.hat

In [23]:
# Feature matrix: every column except the last one.
X_df = housing_df.iloc[:, :-1]
X_df.head(3)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03


In [24]:
# Target series: the last column of the frame (medv).
y_ser = housing_df.iloc[:,-1]
y_ser

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: medv, Length: 506, dtype: float64

In [25]:
# Convert the feature frame and the target series to NumPy arrays so they can
# be passed to the np.linalg routines.
X = X_df.to_numpy()
y = y_ser.to_numpy()
X

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [26]:
# Least-squares fit of X w ~= y: keep the weights and the sum of squared
# residuals, discard the rank and singular values.
# rcond=None is passed explicitly to select the current default cutoff and to
# avoid the FutureWarning that older NumPy versions emit when it is omitted.
w, resid, *_ = np.linalg.lstsq(X, y, rcond=None)

  w, resid, *_ = np.linalg.lstsq(X, y)


In [27]:
# Fitted weight vector: one coefficient per column of X.
w

array([-9.28965170e-02,  4.87149552e-02, -4.05997958e-03,  2.85399882e+00,
       -2.86843637e+00,  5.92814778e+00, -7.26933458e-03, -9.68514157e-01,
        1.71151128e-01, -9.39621540e-03, -3.92190926e-01,  1.49056102e-02,
       -4.16304471e-01])

In [28]:
# Sum of squared residuals of the least-squares fit.
resid

array([12228.04626104])

In [29]:
# Feature names, in the same order as the entries of w.
X_df.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat'],
      dtype='object')

#### 예측 주택가격 = -0.0929 x crim + 0.0487 x zn + ... : crim의 가중치가 음수이므로 범죄율이 높을수록 예측 주택가격이 낮아진다는 뜻. 등등
#### 가중치가 +이면 주택가격의 상승 요인, -이면 감소 요인
#### 가중치의 크기에 따라 영향의 정도가 다르다. (단, 변수마다 단위가 다르므로 크기를 그대로 비교할 때는 주의)
#### Xw -> 모든 X(변수) 값과 모든 w(가중치)의 곱의 합을 행렬 곱으로 간단히 정리한 것.