# 집값 예측 (linear regression)
---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import myutils as my

## 데이터 준비

In [2]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
type(housing)

sklearn.utils.Bunch

In [3]:
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [34]:
df = pd.DataFrame(housing['data'], columns=housing['feature_names'])
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [9]:
df.shape

(20640, 8)

In [10]:
# housing.DESCR

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [14]:
df.duplicated().sum()
# 중복치

0

In [35]:
df['target'] = housing['target']

In [36]:
## X, y
X = df[['MedInc', 'HouseAge', 'AveRooms']]
y = df['target']

1.
- 테스트 데이터 분리
- 스케일링 x
- 학습
- RMSE (X_test)

2.
- 스케일링
- 학습
- RMSE (X_test_

In [65]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=2022)
X_train.shape, X_test.shape

((16512, 3), (4128, 3))

## 스케일링

In [67]:
## 스케일링 되어있는 데이터 사용
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
y_train = y_train.values
y_test = y_test.values

X_train.shape, X_test.shape

((16512, 3), (4128, 3))

In [68]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [69]:
y_pred = lr.predict(X_train)
y_pred[:5]

array([1.15405379, 1.03787517, 1.33731768, 2.04956268, 1.02018779])

In [70]:
X_train = np.array(X_train)

In [71]:
X_train.shape

(16512, 3)

In [72]:
sns.scatterplot(x=X_train.reshape(-1), y=y_train)
plt.plot(X_train.reshape(-1), y_pred, 'r')
plt.show

ValueError: All arrays must be of the same length

In [73]:
from sklearn.model_selection import cross_val_score

mse = cross_val_score(lr, X_test, y_test, 
                scoring='neg_mean_squared_error',
                cv=3)
mse
# mse를 matrix로 사용할 때, 높을 수록 좋게 만드려면 neg(네거티브)를 붙여야 함

array([-0.63879692, -0.6664322 , -0.66737045])

In [74]:
np.mean(np.sqrt(-mse))

0.81084271924912

### 분류
score -> 정확도
- ex) svm.score(X_train, y_train)

### 회귀
score -> R^2 ... 0~1
- ex) lr.score(X_train, y_train)

## 결정트리

In [75]:
from sklearn.tree import DecisionTreeRegressor

dtree = DecisionTreeRegressor()
dtree.fit(X_train, y_train)

DecisionTreeRegressor()

In [82]:
dtree.score(X_train, y_train)

1.0

In [83]:
y_pred = dtree.predict(X_test)

In [86]:
dtree.score(X_test, y_test)

0.12564752576555283

## RandomForest

In [76]:
from sklearn.ensemble import RandomForestRegressor

In [77]:
ranfor = RandomForestRegressor()
ranfor.fit(X_train, y_train)

RandomForestRegressor()

In [87]:
ranfor.score(X_train, y_train)

0.9384379967345158

In [88]:
y_pred = ranfor.predict(X_test)

In [89]:
dtree.score(X_test, y_test)

0.12564752576555283

## Support Vector Machine

In [60]:
from sklearn.svm import SVR

In [None]:
# sklearn.linear_model.Lasso
# sklearn.linear_model.Ridge

In [81]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

names = ['linear', 'poly', 'rbf']

models = [
    SVR(kernel='linear', C=1),
    SVR(kernel='poly', degree=3),
    SVR(kernel='rbf', C=1, gamma=0.7)
]

scores = {}

for name, model in zip(names, models):
    model.fit(X_train, y_train)
    s = model.score(X_train, y_train)
    print(name, s)
    scores[name] = s

linear 0.49202727573849303
poly 0.19061224041512959
rbf 0.5970387846665973


In [90]:
gs.best_params_

NameError: name 'gs' is not defined

In [None]:
clf = SVC(C=10, gamma=1, kernel='rbf', random_state=2022)
clf.fit(X_train, y_train)