# SKlearn

## sklearn 기본 구조 

In [1]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# XOR dataset: the label is 1 exactly when the two inputs differ.
train_data = [[0, 0], [0, 1], [1, 0], [1, 1]]
train_label = [0, 1, 1, 0]
# The test inputs repeat the training inputs, so train_label doubles as the
# ground truth when the predictions are scored below.
test_data = [[0, 0], [0, 1], [1, 0], [1, 1]]

# Select the learning algorithm and train it; fit() hands back the estimator.
# C: inverse of the error tolerance, gamma: dependence on the training data.
clf = svm.SVC(C=10, gamma=0.1).fit(train_data, train_label)

# Predict a label for every test input.
test_label = clf.predict(test_data)

# Report the data and the resulting accuracy.
print(f"train_data = {train_data}, train_label = {train_label}")
print(f"test_data = {test_data}")
print(f"accuracy = {accuracy_score(train_label, test_label)}")

train_data = [[0, 0], [0, 1], [1, 0], [1, 1]], train_label = [0, 1, 1, 0]
test_data = [[0, 0], [0, 1], [1, 0], [1, 1]]
accuracy = 1.0


---

##  Preprocessing 모듈 

### 1. StandardScaler

#### 평균을 제거하고 데이터를 단위 분산으로 조정

In [2]:
from sklearn import preprocessing
import numpy as np

# 3x3 input; the statistics printed below are taken per column (axis=0).
data = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
print(data)
print()

# Fit the scaler on the data, then apply the learned scaling to the same data.
scaler = preprocessing.StandardScaler().fit(data)
scaled_data = scaler.transform(data)

# Show the column-wise mean and standard deviation of the scaled data.
print(f"mean={scaled_data.mean(axis=0)}, std={scaled_data.std(axis=0)}")
print()
print(scaled_data)
print()

[[ 1. -1.  2.]
 [ 2.  0.  0.]
 [ 0.  1. -1.]]

mean=[0. 0. 0.], std=[1. 1. 1.]

[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]



### 2. MinMaxScaler 

#### 모든 feature 값이 0~1사이에 있도록 데이터를 재조정

In [3]:
from sklearn import preprocessing
import numpy as np

# 3x3 input; the statistics printed below are taken per column (axis=0).
data = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
print(data)
print()

# Fit the scaler on the data, then apply the learned scaling to the same data.
scaler = preprocessing.MinMaxScaler().fit(data)
scaled_data = scaler.transform(data)

# Show the column-wise mean and standard deviation of the scaled data.
print(f"mean = {scaled_data.mean(axis=0)}, std = {scaled_data.std(axis=0)}")
print()

print(scaled_data)
print()

[[ 1. -1.  2.]
 [ 2.  0.  0.]
 [ 0.  1. -1.]]

mean = [0.5        0.5        0.44444444], std = [0.40824829 0.40824829 0.41573971]

[[0.5        0.         1.        ]
 [1.         0.5        0.33333333]
 [0.         1.         0.        ]]



### 3. RobustScaler

#### 아웃라이어의 영향을 최소화한 기법

In [4]:
from sklearn import preprocessing
import numpy as np

# 3x3 input; the statistics printed below are taken per column (axis=0).
data = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
print(data)
print()

# Fit the scaler on the data, then apply the learned scaling to the same data.
scaler = preprocessing.RobustScaler().fit(data)
scaled_data = scaler.transform(data)

# Show the column-wise mean and standard deviation of the scaled data.
print(f"mean = {scaled_data.mean(axis=0)}, std = {scaled_data.std(axis=0)}")
print()

print(scaled_data)
print()

[[ 1. -1.  2.]
 [ 2.  0.  0.]
 [ 0.  1. -1.]]

mean = [0.         0.         0.22222222], std = [0.81649658 0.81649658 0.83147942]

[[ 0.         -1.          1.33333333]
 [ 1.          0.          0.        ]
 [-1.          1.         -0.66666667]]



---

## metrics 모듈 

### 1. r2_score

#### 일반적으로 R²로 표시되는 결정계수(즉, 모델이 설명하는 y의 분산 비율)를 계산

In [5]:
from sklearn.metrics import r2_score

# The two middle predictions are swapped relative to the ground truth.
y_true = [0, 1, 2, 3]
y_pred = [0, 2, 1, 3]

# Coefficient of determination of the predictions against the true values.
score = r2_score(y_true=y_true, y_pred=y_pred)

print(score)  # 0.6


0.6


### 2. explained_variance_score

#### 일반적으로 분산의 차이를 계산(r2_score와 비슷)

In [6]:
from sklearn.metrics import explained_variance_score

# The two middle predictions are swapped relative to the ground truth.
y_true = [0, 1, 2, 3]
y_pred = [0, 2, 1, 3]

# Explained-variance score of the predictions against the true values.
score = explained_variance_score(y_true=y_true, y_pred=y_pred)
print(score)  # 0.6

0.6


### 3. accuracy_score

####  분류의 정확도를 계산, 예측된 값 y_pred가 y_true와 정확히 일치하는 개수의 비율 계산

In [7]:
from sklearn.metrics import accuracy_score

# Two of the four predictions match the ground truth exactly.
y_true = [0, 1, 2, 3]
y_pred = [0, 2, 1, 3]

# Default call: fraction of predictions that equal the true label.
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
print(accuracy)  # 0.5

# normalize=False: number of exact matches instead of the fraction.
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred, normalize=False)
print(accuracy)  # 2

0.5
2


---

## pipeline 모듈 

### 1. Pipeline(steps)

#### steps: (key, value)를 요소로 하는 리스트를 지정


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

# Each step is an explicit (name, estimator) pair, listed in execution order.
pipe = Pipeline(
    steps=[
        ("reduce_dim", PCA()),
        ("clf", SVC()),
    ]
)
print(pipe)

Pipeline(steps=[('reduce_dim', PCA()), ('clf', SVC())])


### 2. make_pipeline(steps)

####  steps: 각 단계를 수행하는 객체

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

# Only the estimators are passed; the step names shown in the printed
# pipeline ('pca', 'svc') are generated automatically.
pipe = make_pipeline(
    PCA(),
    SVC(),
)
print(pipe)

Pipeline(steps=[('pca', PCA()), ('svc', SVC())])
