# Library

In [26]:
# 시각화 패키지
!pip install graphviz




In [50]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingClassifier, BaggingRegressor
import graphviz
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Tree

## Decision Tree

<img src="https://scikit-learn.org/stable/_images/sphx_glr_plot_iris_dtc_002.png" width="600" height="400"/>

데이터의 특성을 기반으로 의사 결정을 시각적으로 표현하는 트리 구조의 알고리즘 <br>
-> 데이터의 특징을 가지치기를 통해 분류 또는 회귀 <br>
노드가 나무 가지처럼 연결된 **비선형** 계층적 자료구조 <br>  
트리 내 다른 하위 트리가 있고 그 하위 트리에는 또 다른 하위 트리가 존재하는 **재귀적 구조** <br>

### 용어

<img src="https://velog.velcdn.com/images/kwontae1313/post/017e5691-6908-4684-ab5c-85dee4638720/image.png" width="600" height="300"/>

- 노드(Node): 트리를 구성하는 기본 요소
- 간선(Edge): 노드 간 연결선
- 루트 노드(Root Node): 부모가 없는 최상위 노드
- 부모 노드(Parent Node): 자식 노드를 가진 노드
- 자식 노드(Child Node): 부모 노드의 하위 노드
- 형제 노드(Siblingg Node): 같은 부모를 가지는 노드
- 리프 노드(Leaf Node): 자식 노드가 없는 노드
- 깊이(Depth): 루트 노드에서 특정 노드까지의 간선 수
- 높이(Height): 어떤 로드에서 리프 노드까지 가장 긴 경로의 간선 수
- 경로(Path): 한 노드에서 다른 노드로 가는 길 사이의 순서

### 학습 방법

#### Entropy

classification에서 사용. <br>
무질서를 나타내는 정도로 entropy 값이 높으면 무질서 정도가 높고 값이 낮으면 무질서 정도가 낮음. <br>
엔트로피가 최소화 되는 방향으로 모델 학습. <br>

<img src="https://miro.medium.com/v2/resize:fit:1400/format:webp/1*zMu0UClotNXljrjqmyRIHA.png" width="600" height="400"/>


#### regression

클래스를 잘 분류할 수 있는 조건(선)을 그어가며 학습 <br>
회귀의 경우도 특정 조건 하에서 어떤 값이 나올 수 있도록 학습 <br>

<img src="https://tensorflow.blog/wp-content/uploads/2017/06/2-24.png" width="600" height="300"/>

<img src="https://tensorflow.blog/wp-content/uploads/2017/06/2-25.png" width="600" height="300"/>

<img src="https://tensorflow.blog/wp-content/uploads/2017/06/2-26.png" width="600" height="300"/>

<img src="https://scikit-learn.org/stable/_images/sphx_glr_plot_tree_regression_001.png" width="600" height="300"/>

### 장단점

<font style="font-size:20px"> 장점 </font>
- 직관적이고 해석이 쉬움: 데이터의 분할 과정을 쉽게 추적하고 설명 가능
- 비선형 데이터 처리: 데이터가 복잡한 비선형 관계를 가지더라도 적절한 특성 분할을 통해 모델링이 가능
- 특성 스케일링 필요 없음
- 다양한 데이터 유형 지원: 연속형 데이터와 범주형 데이터 모두 지원
- 자동 특성 선택: 중요한 특성을 자동으로 선택하고, 덜 중요한 특성은 제외

<br>

<font style="font-size:20px"> 단점 </font>
- 과적합(Overfitting): 훈련 데이터에 지나치게 적합되어 새로운 데이터에 대한 일반화 성능이 떨어질 수 있음
- 불안정성: 결정 트리는 데이터의 작은 변화에 민감할 수 있습니다. 작은 데이터 변경이 트리 구조를 크게 변경할 수 있습니다. 이를 개선하기 위해 앙상블 기법(예: 랜덤 포레스트, 부스팅 등)을 사용할 수 있습니다.
- 비선형 데이터의 복잡한 분할: 데이터가 매우 복잡하거나 특성이 많을 경우, 너무 많은 분할을 생성하여 모델 해석이 어려워질 수 있음
- 매끄럽지 않은 결정 경계: 결정 트리는 직선적인 경계로 데이터를 분할하므로, 결정 경계가 부드럽지 않을 수 있음. 이로 인해 경계가 데이터의 실제 분포를 잘 반영하지 못할 가능성 존재.

### 사용 방법

> ```python
> from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
> 
> decision_tree = DecisionTreeRegressor(
>   n_estimators=100,
> )
> decision_tree = DecisionTreeClassifier(
>   n_estimators=100,
> )
> 
> decision_tree.fit(X, y)    # train
> decision_tree.predict(X)   # predict
> 
> ```

주요 파라미터
- max_depth: 각 tree의 최대 깊이
- min_sample_split: 분기에 필요한 최소 데이터 수
- max_features: 분기 시 고려할 최대 feature 수
- random_state: 난수 고정을 위한 seed

In [2]:
penguins = sns.load_dataset('penguins').drop(columns=['island'])
penguins = penguins.dropna().drop_duplicates().reset_index(drop=True)
penguins


Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,39.1,18.7,181.0,3750.0,Male
1,Adelie,39.5,17.4,186.0,3800.0,Female
2,Adelie,40.3,18.0,195.0,3250.0,Female
3,Adelie,36.7,19.3,193.0,3450.0,Female
4,Adelie,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...
328,Gentoo,47.2,13.7,214.0,4925.0,Female
329,Gentoo,46.8,14.3,215.0,4850.0,Female
330,Gentoo,50.4,15.7,222.0,5750.0,Male
331,Gentoo,45.2,14.8,212.0,5200.0,Female


In [3]:
species_to_index = dict(zip(penguins.species.unique(), range(penguins.species.nunique())))
sex_to_index = dict(zip(penguins.sex.unique(), range(penguins.sex.nunique())))

In [4]:
penguins.species = penguins.species.apply(lambda x : species_to_index.get(x))
penguins.sex = penguins.sex.apply(lambda x : sex_to_index.get(x))

In [5]:
penguins

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,39.1,18.7,181.0,3750.0,0
1,0,39.5,17.4,186.0,3800.0,1
2,0,40.3,18.0,195.0,3250.0,1
3,0,36.7,19.3,193.0,3450.0,1
4,0,39.3,20.6,190.0,3650.0,0
...,...,...,...,...,...,...
328,2,47.2,13.7,214.0,4925.0,1
329,2,46.8,14.3,215.0,4850.0,1
330,2,50.4,15.7,222.0,5750.0,0
331,2,45.2,14.8,212.0,5200.0,1


In [6]:
train, test = train_test_split(penguins, test_size= 0.3, random_state=0)

In [7]:
tree = DecisionTreeClassifier()
tree.fit(train.drop(columns=['species']), train.species)

In [8]:
print(f'train_acc: {(tree.predict(train.drop(columns=['species'])) == train.species).mean()}')
print(f'test_acc: {(tree.predict(test.drop(columns=['species'])) == test.species).mean()}')

train_acc: 1.0
test_acc: 0.98


In [9]:
idx_to_species = {value: key for key, value in species_to_index.items()}
idx_to_species

{0: 'Adelie', 1: 'Chinstrap', 2: 'Gentoo'}

In [10]:
tree = DecisionTreeClassifier(min_samples_split=10)
tree.fit(train.drop(columns=['species']), train.species)

print(f'train_acc: {(tree.predict(train.drop(columns=['species'])) == train.species).mean()}')
print(f'test_acc: {(tree.predict(test.drop(columns=['species'])) == test.species).mean()}')

train_acc: 0.9742489270386266
test_acc: 0.95


In [11]:
tree_graphviz = export_graphviz(
    tree,
    
    feature_names= train.drop(columns=['species']).columns,
    class_names=[idx_to_species.get(species) for species in train.species]
)

In [12]:
graph = graphviz.Source(tree_graphviz)
graph.render('decision_tree')
graph.view()

'decision_tree.pdf'

In [14]:
diamonds = sns.load_dataset('diamonds')
diamonds = diamonds.query('`carat` < 0.3')
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
...,...,...,...,...,...,...,...,...,...,...
52966,0.24,Ideal,F,VVS2,61.1,57.0,552,4.03,4.06,2.47
52967,0.24,Ideal,F,VVS2,62.6,57.0,552,3.93,3.96,2.47
52968,0.24,Ideal,F,VVS2,61.8,56.0,552,3.97,4.02,2.47
52969,0.24,Ideal,F,VVS2,61.6,54.0,552,4.00,4.09,2.49


In [15]:
cut_to_idx = dict(zip(diamonds.cut.unique(), range(diamonds.cut.nunique())))
color_to_idx = dict(zip(diamonds.color.unique(), range(diamonds.color.nunique())))
clarity_to_idx = dict(zip(diamonds.clarity.unique(), range(diamonds.clarity.nunique())))

In [16]:
diamonds.cut = diamonds.cut.apply(lambda x : cut_to_idx.get(x))
diamonds.color = diamonds.color.apply(lambda x : color_to_idx.get(x))
diamonds.clarity = diamonds.clarity.apply(lambda x : clarity_to_idx.get(x))

In [17]:
diamonds = diamonds.dropna().drop_duplicates().reset_index(drop=True)
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,0,0,0.0,61.5,55.0,326,3.95,3.98,2.43
1,0.21,1,0,1.0,59.8,61.0,326,3.89,3.84,2.31
2,0.23,2,0,2.0,56.9,65.0,327,4.05,4.07,2.31
3,0.29,1,1,3.0,62.4,58.0,334,4.20,4.23,2.63
4,0.24,3,2,4.0,62.8,57.0,336,3.94,3.96,2.48
...,...,...,...,...,...,...,...,...,...,...
1594,0.24,0,4,4.0,61.1,57.0,552,4.03,4.06,2.47
1595,0.24,0,4,4.0,62.6,57.0,552,3.93,3.96,2.47
1596,0.24,0,4,4.0,61.8,56.0,552,3.97,4.02,2.47
1597,0.24,0,4,4.0,61.6,54.0,552,4.00,4.09,2.49


In [18]:
train, test = train_test_split(diamonds, test_size= 0.3, random_state=0)

tree = DecisionTreeClassifier(random_state=0)
tree.fit(train.drop(columns=['cut']), train.cut)

In [19]:
print(f'train_acc: {(tree.predict(train.drop(columns=['cut'])) == train.cut).mean()}')
print(f'test_acc: {(tree.predict(test.drop(columns=['cut'])) == test.cut).mean()}')

train_acc: 1.0
test_acc: 0.7666666666666667


In [20]:
idx_to_cut = {value: key for key, value in cut_to_idx.items()}
idx_to_cut

tree_graphviz = export_graphviz(
    tree,
   
    feature_names= train.drop(columns=['cut']).columns,
    class_names=[idx_to_cut.get(cut) for cut in train.cut]
)

graph = graphviz.Source(tree_graphviz)
graph.render('decision_tree')
graph.view()

'decision_tree.pdf'

In [None]:
DecisionTreeClassifier(max_depth=, max_features=, max_leaf_nodes=)

In [30]:
from sklearn.model_selection import RandomizedSearchCV
tree = DecisionTreeClassifier(random_state=0)
# from scipy.stats import uniform
parameters = {'max_depth': range(5, 51), 'max_leaf_nodes': range(1, 30)}
grid_search = RandomizedSearchCV(tree, parameters, random_state=0)
grid_search.fit(diamonds.drop(columns=['cut']), diamonds.cut)

In [31]:
grid_search.best_params_

{'max_leaf_nodes': 9, 'max_depth': 24}

In [36]:
index_for_best_score = grid_search.cv_results_.get('rank_test_score').argmin()
grid_search.cv_results_.get('params')[index_for_best_score]

{'max_leaf_nodes': 9, 'max_depth': 24}

In [37]:
train, test = train_test_split(diamonds, test_size= 0.3, random_state=0)

tree = DecisionTreeClassifier(random_state=0, max_leaf_nodes = 28, max_depth = 46)
tree.fit(train.drop(columns=['cut']), train.cut)

print(f'train_acc: {(tree.predict(train.drop(columns=['cut'])) == train.cut).mean()}')
print(f'test_acc: {(tree.predict(test.drop(columns=['cut'])) == test.cut).mean()}')

train_acc: 0.7899910634495085
test_acc: 0.7166666666666667


In [29]:
idx_to_cut = {value: key for key, value in cut_to_idx.items()}
idx_to_cut

tree_graphviz = export_graphviz(
    tree,
   
    feature_names= train.drop(columns=['cut']).columns,
    class_names=[idx_to_cut.get(cut) for cut in train.cut]
)

graph = graphviz.Source(tree_graphviz)
graph.render('decision_tree')
graph.view()

'decision_tree.pdf'

In [47]:
bagging_classifier = BaggingClassifier(
    estimator= DecisionTreeClassifier(random_state= 0),
)

bagging_classifier.fit(train.drop(columns=['cut']), train.cut)


In [48]:
print(f'train_acc: {(bagging_classifier.predict(train.drop(columns=['cut'])) == train.cut).mean()}')
print(f'test_acc: {(bagging_classifier.predict(test.drop(columns=['cut'])) == test.cut).mean()}')

train_acc: 0.9839142091152815
test_acc: 0.7520833333333333


## Ensemble

<img src="https://cdn.prod.website-files.com/5d7b77b063a9066d83e1209c/61f7bbd4e90cce440b88ea32_ensemble-learning.png" width="600" height="300"/>

<br>

두 개 이상의 모델을 결합하여 task를 수행하는 기법 <br> 
오차의 분산을 줄이기에 개별 모델보다 강건함(robust) <br>
    -> 한 모델에서 포착하지 못했던 정보를 다른 모델에서 포착할 수 있음 <br>
따라서 모델 간 다양성이 클수록 ensemble 모델의 효과가 커짐 <br>

### Bagging

<img src="https://cdn.prod.website-files.com/5d7b77b063a9066d83e1209c/61a4414d28946a3ac3e69ed9_q-FrlRMLk-5nSxZ_3ONlFpu5hQ61PsuAxkusTD1vEX5NqkdH2Ie0u_75rIySTZKXVI4VBxM-AIw3APQvRboG3kv-3l3cA5c5qyMwwTMe2OLXzoAgA051Dqbx7XVfdJaDyNwrSLUf.png" width="600" height="300"/>

<br>

Boostrap AGGregatING의 약자로 부트스트랩을 통한 방법 <br>
전체 집합에서 일부 데이터를 선택하여 subset을 만든 후(복원추출) 개별 모델을 학습 <br>
-> subset을 여러 개 만들어 동시에 학습할 수 있기에 병렬화 가능
최종 예측은 평균 등의 방법으로 산출 <br>
이 때 사용되는 모델은 **모두 동일한 모델**이어야 함

<br>

<font style="font-size:20px"> 대표 알고리즘 </font> <p>
- Random Forest

<br>

<font style="font-size:20px"> 사용 방법 </font> <p>

> ```python
> from sklearn.ensemble import BaggingRegressor
> 
> bagging_regressor = BaggingRegressor(
>   estimator=model,
> )
> 
> bagging_regressor.fit(X, y)    # train
> bagging_regressor.predict(X)   # predict
> ```

<br>

주요 파라미터
- estimator: base model
- n_estimators: base learner의 수
- random_state: 난수 고정을 위한 seed

#### Random Forest

<img src="https://blog.kakaocdn.net/dn/s8SNC/btrn82AaWBS/jSX1Yf8l9kANu6gu4HXv5K/img.gif" width="600" height="300"/>

<br>

Decision Tree를 기본 모델로 사용하는 Bagging 방법. <br>
각 트리는 비교적 예측을 잘 할 수 있지만 데이터의 일부에 과대적합하는 경향을 가지는데, 이러한 트리를 많이 만들면 그 결과를 평균냄으로써 과대적합된 양을 줄일 수 있다는 아이디어에 기반. <br>
데이터 포인트를 무작위로 선택하는 방법과 분할 테스트에서 특성을 무작위로 선택하는 방법 등으로 개별 트리 구성.

##### 장단점

<font style="font-size:20px"> 장점 </font>
- 과적합 방지: 여러 개의 결정 트리를 앙상블하여 예측을 수행하므로, 개별 트리의 과적합 문제를 줄이고 더 일반화된 모델을 생성
- 정확도: 트리의 서로 다른 부분집합을 학습하여 더 정확한 예측
- 노이즈와 이상치 처리: 개별 트리가 이상치에 민감하더라도 앙상블된 모델은 이러한 영향을 줄일 수 있음
- 특성 중요도 평가: 각 특성이 모델의 예측에 기여하는 정도를 평가 가능
- 비선형 데이터 처리: 랜덤 포레스트는 데이터 간의 비선형 관계를 잘 모델링
- 부트스트랩 샘플링: 각 트리는 훈련 데이터의 무작위 샘플을 사용하여 학습하므로, 데이터의 전체를 효과적으로 활용

<font style="font-size:20px"> 단점 </font>
- 해석의 어려움: 개별 트리의 예측을 해석하기 어려울 수 있으며, 이로 전체 모델의 예측을 설명하는 것도 복잡할 수 있음
- 메모리 및 계산 비용: 많은 트리를 생성하고 앙상블을 유지하기 때문에 메모리와 계산 자원이 많이 소모될 수 있음
- 예측 시간: 각 트리를 모두 통과해야 하므로 실시간 예측에는 다소 비효율적일 수 있음
- 과적합: 매우 큰 데이터셋이나 트리가 매우 깊은 경우, 과적합의 위험이 여전히 존재
- 하이퍼파라미터에 의한 영향: 기본적으로 잘 동작하나, 최적의 성능을 위해 하이퍼파라미터 조정이 필요할 수 있음


##### 사용 방법

> ```python
> from sklearn.ensemble import RandomForestRegressor
> 
> random_forest = RandomForestRegressor(
>   n_estimators=100,
> )
> random_forest = RandomForestClassifier(
>   n_estimators=100,
> )
> 
> random_forest.fit(X, y)    # train
> random_forest.predict(X)   # predict
> 
> importances = random_forest.feature_importances   # feature importance
> indices = np.argsort(importances)[::-1]
> importance_df = pd.DataFrame({
>    'feature': np.array(*feature_names*)[indices],
>    'importance': importances[indices]
> })
> ```

<br>

주요 파라미터
- n_estimators: tree의 수
- max_depth: 각 tree의 최대 깊이
- min_sample_split: 분기에 필요한 최소 데이터 수
- max_features: 분기 시 고려할 최대 feature 수
- random_state: 난수 고정을 위한 seed

In [68]:
penguins = sns.load_dataset('penguins')
penguins


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [69]:
lable_encoders = {}

for column in ('species', 'sex'):
    lable_encoder = LabelEncoder()
    penguins.loc[:, column] = lable_encoder.fit_transform(penguins[column])

    lable_encoders.update({column : lable_encoder})

In [70]:
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,Torgersen,39.1,18.7,181.0,3750.0,1
1,0,Torgersen,39.5,17.4,186.0,3800.0,0
2,0,Torgersen,40.3,18.0,195.0,3250.0,0
3,0,Torgersen,,,,,2
4,0,Torgersen,36.7,19.3,193.0,3450.0,0
...,...,...,...,...,...,...,...
339,2,Biscoe,,,,,2
340,2,Biscoe,46.8,14.3,215.0,4850.0,0
341,2,Biscoe,50.4,15.7,222.0,5750.0,1
342,2,Biscoe,45.2,14.8,212.0,5200.0,0


In [71]:
penguins.dropna().drop_duplicates().reset_index(drop= True)
penguins = penguins.drop(columns=['island'])

In [85]:
train1, test1 = train_test_split(penguins, test_size= 0.3, random_state=0)

In [86]:
random_forest = RandomForestClassifier(random_state= 0)
random_forest.fit(train1.drop(columns=['species']), train1.species.astype(str))

In [87]:
print(f'train_acc: {(random_forest.predict(train1.drop(columns=['species'])) == train1.species.astype(str)).mean()}')
print(f'test_acc: {(random_forest.predict(test1.drop(columns=['species'])) == test1.species.astype(str)).mean()}')

train_acc: 0.9958333333333333
test_acc: 0.9903846153846154


In [75]:
diamonds = sns.load_dataset('diamonds')
diamonds = diamonds.query('`carat` < 0.3')
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
...,...,...,...,...,...,...,...,...,...,...
52966,0.24,Ideal,F,VVS2,61.1,57.0,552,4.03,4.06,2.47
52967,0.24,Ideal,F,VVS2,62.6,57.0,552,3.93,3.96,2.47
52968,0.24,Ideal,F,VVS2,61.8,56.0,552,3.97,4.02,2.47
52969,0.24,Ideal,F,VVS2,61.6,54.0,552,4.00,4.09,2.49


In [76]:
lable_encoders = {}

for column in ('cut', 'color', 'clarity'):
    lable_encoder = LabelEncoder()
    diamonds.loc[:, column] = lable_encoder.fit_transform(diamonds[column])

    lable_encoders.update({column : lable_encoder})

  diamonds.loc[:, column] = lable_encoder.fit_transform(diamonds[column])
  diamonds.loc[:, column] = lable_encoder.fit_transform(diamonds[column])
  diamonds.loc[:, column] = lable_encoder.fit_transform(diamonds[column])


In [78]:
diamonds = diamonds.dropna().drop_duplicates().reset_index(drop=True)
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,3,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,4,62.4,58.0,334,4.20,4.23,2.63
4,0.24,4,6,6,62.8,57.0,336,3.94,3.96,2.48
...,...,...,...,...,...,...,...,...,...,...
1594,0.24,2,2,6,61.1,57.0,552,4.03,4.06,2.47
1595,0.24,2,2,6,62.6,57.0,552,3.93,3.96,2.47
1596,0.24,2,2,6,61.8,56.0,552,3.97,4.02,2.47
1597,0.24,2,2,6,61.6,54.0,552,4.00,4.09,2.49


In [79]:
train, test = train_test_split(diamonds, test_size= 0.3, random_state=0)

random_forest = RandomForestClassifier(random_state= 0)
random_forest.fit(train.drop(columns=['cut']), train.cut.astype(str))

In [82]:
train.cut


92      2
1017    4
1447    1
838     4
40      4
       ..
763     4
835     4
1216    1
559     1
684     1
Name: cut, Length: 1119, dtype: int32

In [83]:
print(f'train_acc: {(random_forest.predict(train.drop(columns=['cut'])) == train.cut.astype(str)).mean()}')
print(f'test_acc: {(random_forest.predict(test.drop(columns=['cut'])) == test.cut.astype(str)).mean()}')

train_acc: 1.0
test_acc: 0.7875


In [None]:
RandomForestClassifier(min_samples_split=)

In [123]:
from sklearn.model_selection import RandomizedSearchCV

random_forest = RandomForestClassifier(random_state= 0)
# from scipy.stats import uniform

parameters = {'max_depth': range(1, 201), 'max_leaf_nodes': range(1, 201), 'n_estimators': range(1, 201), 'min_samples_split' : range(1, 201)}
random_search = RandomizedSearchCV(random_forest, parameters, random_state=0, scoring='accuracy')
random_search.fit(diamonds.drop(columns=['cut']), diamonds.cut)

In [124]:
random_search.best_params_

{'n_estimators': 192,
 'min_samples_split': 23,
 'max_leaf_nodes': 170,
 'max_depth': 50}

In [125]:
random_forest = RandomForestClassifier()

In [126]:
train, test = train_test_split(diamonds, test_size= 0.3, random_state=0)

random_forest = RandomForestClassifier(random_state= 0 ,n_estimators = 192, max_leaf_nodes = 170, max_depth = 50, min_samples_split= 23)
random_forest.fit(train.drop(columns=['cut']), train.cut.astype(str))

In [127]:
print(f'train_acc: {(random_forest.predict(train.drop(columns=['cut'])) == train.cut.astype(str)).mean()}')
print(f'test_acc: {(random_forest.predict(test.drop(columns=['cut'])) == test.cut.astype(str)).mean()}')

train_acc: 0.872207327971403
test_acc: 0.725


In [128]:
index_best_param = random_search.cv_results_.get('rank_test_score').argmin()

In [129]:
best_params = random_search.cv_results_.get('params')[index_best_param]
best_params

{'n_estimators': 192,
 'min_samples_split': 23,
 'max_leaf_nodes': 170,
 'max_depth': 50}

In [130]:
random_forest =RandomForestClassifier(
    n_estimators = best_params.get('n_estimators'),
    min_samples_split = best_params.get('min_samples_split'),
    max_leaf_nodes = best_params.get('max_leaf_nodes'),
    max_depth = best_params.get('max_depth'),
    random_state=0
)

random_forest.fit(train.drop(columns=['cut']), train.cut.astype(str))

print(f'train_acc: {(random_forest.predict(train.drop(columns=['cut'])) == train.cut.astype(str)).mean()}')
print(f'test_acc: {(random_forest.predict(test.drop(columns=['cut'])) == test.cut.astype(str)).mean()}')

train_acc: 0.872207327971403
test_acc: 0.725


In [131]:
# titanic

titanic = sns.load_dataset('titanic')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [132]:
label_encoders = {}

for column in ('sex', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alone'):
    lable_encoder = LabelEncoder()
    titanic.loc[:, column] = lable_encoder.fit_transform(titanic[column])

    lable_encoders.update({column : lable_encoder})

 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 0 1 0 0 1 2 1 2 2 0 0 2 0 2 1 2 2 2 1 2 1 2
 2 2 2 2 1 2 2 2 2 0 1 2 2 2 0 2 2 2 0 2 2 2 0 0 1 1 2 2 0 2 2 2 2 2 2 2 0
 2 2 2 2 2 2 1 0 2 1 2 1 1 0 2 2 2 2 2 2 2 2 1 1 1 0 0 2 0 2 2 2 2 1 1 2 2
 1 1 1 0 2 2 2 0 2 2 2 2 2 1 2 2 2 2 0 2 0 2 0 2 2 2 0 2 2 0 1 2 2 1 2 1 2
 0 2 0 2 2 1 1 2 1 0 0 2 2 2 1 2 2 2 2 2 2 2 2 2 0 2 1 2 1 2 0 2 1 0 1 2 1
 2 2 0 2 1 2 1 2 0 2 1 2 1 2 1 1 1 1 2 2 1 2 2 0 2 1 0 1 2 2 0 2 2 2 0 0 0
 1 2 2 0 0 2 1 2 2 0 0 0 2 1 0 2 0 2 1 2 2 2 2 2 2 0 2 2 2 1 2 0 0 1 2 2 0
 2 0 0 0 2 2 2 1 2 0 0 0 1 0 0 0 1 2 1 2 1 1 0 0 2 2 1 1 2 0 2 1 2 0 2 0 0
 2 0 2 0 0 2 0 1 0 1 1 1 1 1 2 2 2 2 0 2 2 2 2 0 1 2 2 2 1 2 2 2 2 0 2 2 0
 0 2 2 0 2 0 2 0 2 2 0 2 2 0 2 1 2 1 2 1 0 2 2 0 2 2 2 1 1 1 2 2 2 2 2 1 2
 1 2 2 2 2 0 1 2 2 1 1 1 2 2 2 2 2 2 2 1 1 2 2 0 2 1 2 0 0 2 1 0 1 1 2 2 1
 2 0 1 0 2 0 1 2 0 0 2 2 0 0 1 2 0 2 0 1 2 2 1 0 2 2 2 2 1 1 2 0 1 2 2 2 2
 1 2 2 0 2 0 0 2 2 2 2 0 0 2 2 0 2 0 2 2 2 2 2 0 0 1 0 2 2 2 2 0 0 2 0 1 2
 1 2 0 2 2 0 2 2 1 0 2 1 

In [135]:
titanic = titanic.dropna().drop_duplicates().drop(columns='alive').reset_index(drop=True)
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
0,0,3,1,22.0,1,0,7.2500,2,2,1,1,7,2,0
1,1,1,0,38.0,1,0,71.2833,0,0,2,0,2,0,0
2,1,3,0,26.0,0,0,7.9250,2,2,2,0,7,2,1
3,1,1,0,35.0,1,0,53.1000,2,0,2,0,2,2,0
4,0,3,1,35.0,0,0,8.0500,2,2,1,1,7,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673,0,2,1,28.0,0,0,10.5000,2,1,1,1,7,2,1
674,0,3,0,39.0,0,5,29.1250,1,2,2,0,7,1,0
675,1,1,0,19.0,0,0,30.0000,2,0,2,0,1,2,1
676,1,1,1,26.0,0,0,30.0000,0,0,1,1,2,0,1


In [141]:
from sklearn.model_selection import RandomizedSearchCV

random_forest = RandomForestClassifier(random_state= 0)
# from scipy.stats import uniform

parameters = {'max_depth': range(1, 201), 'max_leaf_nodes': range(1, 201), 'n_estimators': range(1, 201), 'min_samples_split' : range(1, 201)}
random_search = RandomizedSearchCV(random_forest, parameters, random_state=0, scoring='accuracy')
random_search.fit(titanic.drop(columns=['survived']), titanic.survived)

In [142]:
train, test = train_test_split(titanic, test_size= 0.3, random_state=0)

In [143]:
index_best_param = random_search.cv_results_.get('rank_test_score').argmin()
best_params = random_search.cv_results_.get('params')[index_best_param]
best_params

{'n_estimators': 54,
 'min_samples_split': 53,
 'max_leaf_nodes': 66,
 'max_depth': 79}

In [144]:
random_forest =RandomForestClassifier(
    n_estimators = best_params.get('n_estimators'),
    min_samples_split = best_params.get('min_samples_split'),
    max_leaf_nodes = best_params.get('max_leaf_nodes'),
    max_depth = best_params.get('max_depth'),
    random_state=0
)

random_forest.fit(train.drop(columns=['survived']), train.survived.astype(str))

print(f'train_acc: {(random_forest.predict(train.drop(columns=['survived'])) == train.survived.astype(str)).mean()}')
print(f'test_acc: {(random_forest.predict(test.drop(columns=['survived'])) == test.survived.astype(str)).mean()}')

train_acc: 0.8185654008438819
test_acc: 0.8382352941176471


In [140]:
best_params

{'n_estimators': 192,
 'min_samples_split': 23,
 'max_leaf_nodes': 170,
 'max_depth': 50}

In [149]:
titanic = sns.load_dataset('titanic')

titanic.deck = np.where(titanic.deck.astype(str) == 'nan', 'Z', titanic.deck)
titanic = titanic.drop(columns=['alive']).dropna().drop_duplicates().reset_index(drop= True)

titanic.info()

label_encoders = {}

for column in ('sex', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alone'):
    lable_encoder = LabelEncoder()
    titanic.loc[:, column] = lable_encoder.fit_transform(titanic[column])

    lable_encoders.update({column : lable_encoder})

train, test = train_test_split(titanic, test_size=0.3, random_state= 0)

random_forest = RandomForestClassifier(random_state=0)
random_forest.fit(titanic.drop(columns=['survived']), titanic.survived)

print(f'train_acc: {(random_forest.predict(train.drop(columns=['survived'])) == train.survived.astype(str)).mean()}')
print(f'test_acc: {(random_forest.predict(test.drop(columns=['survived'])) == test.survived.astype(str)).mean()}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 676 entries, 0 to 675
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     676 non-null    int64   
 1   pclass       676 non-null    int64   
 2   sex          676 non-null    object  
 3   age          676 non-null    float64 
 4   sibsp        676 non-null    int64   
 5   parch        676 non-null    int64   
 6   fare         676 non-null    float64 
 7   embarked     676 non-null    object  
 8   class        676 non-null    category
 9   who          676 non-null    object  
 10  adult_male   676 non-null    bool    
 11  deck         676 non-null    object  
 12  embark_town  676 non-null    object  
 13  alone        676 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 60.3+ KB
train_acc: 0.0
test_acc: 0.0


 2 0 1 0 1 2 1 2 2 0 2 1 2 2 2 1 2 1 2 2 2 1 2 2 2 0 1 2 2 0 2 2 2 0 2 2 0
 0 1 1 2 0 2 2 2 2 2 0 2 2 2 2 2 2 1 0 2 1 1 1 0 2 2 2 2 2 2 1 1 0 0 2 0 2
 2 2 1 1 2 2 1 1 1 0 2 2 0 2 2 2 1 2 2 2 2 2 2 0 2 2 0 2 0 1 2 2 1 2 0 2 2
 1 1 2 1 0 0 2 1 2 2 2 2 2 2 2 2 0 2 1 2 0 2 1 0 1 2 1 2 0 2 1 2 1 0 2 1 2
 1 1 1 1 1 1 2 2 0 2 1 0 1 2 0 2 2 2 0 0 1 2 0 0 1 2 2 0 0 2 1 0 0 2 2 2 2
 2 2 2 2 2 2 1 2 0 0 1 2 2 2 0 0 2 0 0 1 0 0 0 1 1 2 1 1 0 0 2 1 1 0 2 1 2
 0 0 0 2 0 0 2 0 1 0 1 1 1 1 2 2 2 2 2 0 1 2 1 2 2 2 0 0 0 2 2 0 2 2 0 2 2
 0 2 2 0 1 2 1 1 0 2 2 0 2 2 2 1 1 1 2 2 2 2 2 1 2 1 2 0 2 1 1 2 2 2 2 2 1
 1 2 0 1 2 0 0 2 1 0 1 1 2 2 1 0 1 0 2 0 1 0 0 2 0 1 0 2 0 1 2 0 2 2 1 1 2
 2 2 2 2 2 2 0 0 0 2 2 0 0 0 0 2 2 2 0 0 1 2 2 2 0 0 2 0 1 1 2 0 0 2 2 1 1
 2 2 1 0 0 0 0 2 2 1 0 0 1 2 1 0 1 2 2 0 0 0 2 2 1 2 2 2 2 1 0 0 2 2 1 0 2
 1 0 1 0 0 1 0 2 0 2 1 2 2 0 1 2 0 2 2 0 1 0 2 1 2 2 1 1 2 0 2 2 2 0 1 0 2
 0 2 0 2 1 2 2 0 0 2 1 2 2 1 2 1 0 0 2 0 2 2 1 2 1 0 1 1 2 2 2 2 0 0 2 2 1
 1 2 2 2 0 0 2 2 0 1 0 2 

In [147]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
0,0,3,1,22.0,1,0,7.2500,2,2,1,1,7,2,0
1,1,1,0,38.0,1,0,71.2833,0,0,2,0,2,0,0
2,1,3,0,26.0,0,0,7.9250,2,2,2,0,7,2,1
3,1,1,0,35.0,1,0,53.1000,2,0,2,0,2,2,0
4,0,3,1,35.0,0,0,8.0500,2,2,1,1,7,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671,0,2,1,28.0,0,0,10.5000,2,1,1,1,7,2,1
672,0,3,0,39.0,0,5,29.1250,1,2,2,0,7,1,0
673,1,1,0,19.0,0,0,30.0000,2,0,2,0,1,2,1
674,1,1,1,26.0,0,0,30.0000,0,0,1,1,2,0,1


In [164]:
from sklearn.model_selection import RandomizedSearchCV

random_forest = RandomForestClassifier(random_state= 0)
# from scipy.stats import uniform

parameters = {
    'max_depth': np.random.randint(1, 10, 5),
    #'max_leaf_nodes': np.random.randint(5, 20, 5),
    'n_estimators': np.random.randint(50, 500, 20),
    'min_samples_split' : np.random.randint(50, 500, 20),
    }
random_search = RandomizedSearchCV(random_forest, parameters, random_state=0)
random_search.fit(titanic.drop(columns=['survived']), titanic.survived)

index_best_param = random_search.cv_results_.get('rank_test_score').argmax()
best_params = random_search.cv_results_.get('params')[index_best_param]
best_params

random_forest =RandomForestClassifier(
    n_estimators = best_params.get('n_estimators'),
    min_samples_split = best_params.get('min_samples_split'),
    #max_leaf_nodes = best_params.get('max_leaf_nodes'),
    max_depth = best_params.get('max_depth'),
    random_state=0
).fit(train.drop(columns=['survived']), train.survived.astype(str))


print(f'train_acc: {(random_forest.predict(train.drop(columns=['survived'])) == train.survived.astype(str)).mean()}')
print(f'test_acc: {(random_forest.predict(test.drop(columns=['survived'])) == test.survived.astype(str)).mean()}')

train_acc: 0.5898520084566596
test_acc: 0.5665024630541872


In [172]:
index_best_param = random_search.cv_results_.get('rank_test_score').argmax()
best_params = random_search.cv_results_.get('params')[index_best_param]

In [206]:
random_search.best_params_

{'n_estimators': 113, 'min_samples_split': 167, 'max_depth': 5}

In [173]:
best_params

{'n_estimators': 266, 'min_samples_split': 452, 'max_depth': 3}

In [151]:
best_params

{'n_estimators': 180,
 'min_samples_split': 59,
 'max_leaf_nodes': 20,
 'max_depth': 25}

In [176]:
data = pd.read_csv('../stock_data/삼성2023.csv', encoding='cp949', usecols=['일자', '종가'])
data

Unnamed: 0,일자,종가
0,2023/12/28,78500
1,2023/12/27,78000
2,2023/12/26,76600
3,2023/12/22,75900
4,2023/12/21,75000
...,...,...
240,2023/01/06,59000
241,2023/01/05,58200
242,2023/01/04,57800
243,2023/01/03,55400


In [184]:
data.일자 = pd.to_datetime(data.일자)
data = data.sort_values(by=['일자']).set_index('일자')
data

Unnamed: 0_level_0,종가
일자,Unnamed: 1_level_1
2023-01-02,55500
2023-01-03,55400
2023-01-04,57800
2023-01-05,58200
2023-01-06,59000
...,...
2023-12-21,75000
2023-12-22,75900
2023-12-26,76600
2023-12-27,78000


In [205]:
slided_price = pd.DataFrame(
    np.lib.stride_tricks.sliding_window_view(data.종가.values, 6),
    columns= ['t_4','t_3','t_2','t_1','t','t+1',],
    index = data.index[5:]
    )

slided_price

Unnamed: 0_level_0,t_4,t_3,t_2,t_1,t,t+1
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-09,55500,55400,57800,58200,59000,60700
2023-01-10,55400,57800,58200,59000,60700,60400
2023-01-11,57800,58200,59000,60700,60400,60500
2023-01-12,58200,59000,60700,60400,60500,60500
2023-01-13,59000,60700,60400,60500,60500,60800
...,...,...,...,...,...,...
2023-12-21,73100,73300,72900,73400,74800,75000
2023-12-22,73300,72900,73400,74800,75000,75900
2023-12-26,72900,73400,74800,75000,75900,76600
2023-12-27,73400,74800,75000,75900,76600,78000


In [207]:
train, test = train_test_split(slided_price, test_size=0.3, random_state=0, shuffle=False)

random_forest = RandomForestRegressor(random_state=0)
random_forest.fit(train.iloc[:, :-1], train.iloc[:, -1])

In [211]:
results = pd.DataFrame(
    random_forest.predict(test.drop(columns=['t+1'])),
    columns=['predicted'],
    index= test.index
    )

results['ground_truth'] = test['t+1']
results

Unnamed: 0_level_0,predicted,ground_truth
일자,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-09-12,70712.0,70500
2023-09-13,70512.0,70900
2023-09-14,71234.0,71700
2023-09-15,71057.0,72000
2023-09-18,71881.0,70200
...,...,...
2023-12-21,72284.0,75000
2023-12-22,72275.0,75900
2023-12-26,72275.0,76600
2023-12-27,72275.0,78000


In [212]:
px.line(
    results,
    x = results.index,
    y = ['predicted', 'ground_truth']
)

In [241]:

diamonds = sns.load_dataset('diamonds')
diamonds

# 1. 중복, 결측 확인
diamonds = diamonds.drop_duplicates().reset_index(drop=True)
diamonds
# 2. label encoder를 통한 범주형 인코딩
lable_encoders = {}

for column in ('cut', 'color', 'clarity'):
    lable_encoder = LabelEncoder()
    diamonds.loc[:, column] = lable_encoder.fit_transform(diamonds[column])

    lable_encoders.update({column : lable_encoder})

#diamonds.info()

# 3. train, test split (random_state=0)
train, test = train_test_split(diamonds, test_size=0.3, random_state=0)
# 4. price 예측

random_forest = RandomForestRegressor(random_state=0)
random_forest.fit(train.drop(columns=['price']), train.price)

# 5. 예측 값과 실제 값을 시각화하여 예측 성능 비교

results = pd.DataFrame(
    random_forest.predict(test.drop(columns=['price'])),
    columns=['predicted'],
    index= test.index
    )

results['ground_truth'] = test['price']
results




Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[2 3 1 ... 4 3 2]' has dtype incompatible with category, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[1 1 1 ... 0 4 0]' has dtype incompatible with category, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[3 2 4 ... 2 3 3]' has dtype incompatible with category, please explicitly cast to a compatible dtype first.



Unnamed: 0,predicted,ground_truth
28227,468.74,432
17317,6296.92,6964
48740,2067.48,2037
19859,7612.28,8451
44240,1700.38,1581
...,...,...
1711,577.04,561
2879,3217.93,3281
5108,3629.97,3762
51239,2297.54,2368


In [242]:
px.scatter(
    results,
    x = results.index,
    y = ['predicted', 'ground_truth']
)

In [236]:
results

Unnamed: 0,predicted,ground_truth
28227,468.74,432
17317,6296.92,6964
48740,2067.48,2037
19859,7612.28,8451
44240,1700.38,1581
...,...,...
1711,577.04,561
2879,3217.93,3281
5108,3629.97,3762
51239,2297.54,2368
