### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
# 必要なライブラリのインポート
from tpot import TPOTClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd

In [2]:
# データのロード、説明変数、目的変数の定義
data_cancer = load_breast_cancer(as_frame=True)
X = data_cancer.data
y = data_cancer.target

In [4]:
# 各説明変数の確認
X

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [5]:
# 目的変数の確認
y

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: target, Length: 569, dtype: int64

In [6]:
# 目的変数のデータの偏りを確認
print(np.count_nonzero(y == 1))
print(np.count_nonzero(y == 0))

357
212


In [7]:
# ロードしたデータを学習用と評価用に分割
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)

In [8]:
# 分割されたデータのレコード数を確認
print(len(X_train))
print(len(X_test))

455
114


In [9]:
# TPOTの分類器と、パラメータを定義
# scorering : Cross Validationに使う指標
# generation : 世代数
# population_size : 各世代で生成するパイプライン数
# verbosity : 学習中のステイタスの表示(0 - 3までの4段階で数値が大きいほど詳細)
# n_jobs : 学習処理の並列度(-1の指定により、全てのコアを利用する)

model_tpot = TPOTClassifier(scoring='f1',
                      generations=3,
                      population_size=50,
                      verbosity=2,
                      n_jobs=-1)

In [10]:
# 学習処理の実行
model_tpot.fit(X_train, y_train)

HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=200.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.9827578518433091

Generation 2 - Current best internal CV score: 0.9827578518433091

Generation 3 - Current best internal CV score: 0.9827578518433091

Best pipeline: LinearSVC(MaxAbsScaler(ZeroCount(input_matrix)), C=20.0, dual=False, loss=squared_hinge, penalty=l1, tol=0.01)


TPOTClassifier(generations=3, n_jobs=-1, population_size=50, scoring='f1',
               verbosity=2)

In [11]:
# 最終的に採用されたベストなパイプラインの確認
model_tpot.fitted_pipeline_

Pipeline(steps=[('zerocount', ZeroCount()), ('maxabsscaler', MaxAbsScaler()),
                ('linearsvc',
                 LinearSVC(C=20.0, dual=False, penalty='l1', tol=0.01))])

In [12]:
# 学習途中のパイプラインの確認
model_tpot.evaluated_individuals_

{'GaussianNB(input_matrix)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 1,
  'internal_cv_score': 0.9484936196287965},
 'XGBClassifier(LinearSVC(input_matrix, LinearSVC__C=10.0, LinearSVC__dual=False, LinearSVC__loss=squared_hinge, LinearSVC__penalty=l1, LinearSVC__tol=0.1), XGBClassifier__learning_rate=0.01, XGBClassifier__max_depth=4, XGBClassifier__min_child_weight=10, XGBClassifier__n_estimators=100, XGBClassifier__n_jobs=1, XGBClassifier__subsample=0.05, XGBClassifier__verbosity=0)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 2,
  'internal_cv_score': 0.7719209142027934},
 'BernoulliNB(PCA(input_matrix, PCA__iterated_power=8, PCA__svd_solver=randomized), BernoulliNB__alpha=0.01, BernoulliNB__fit_prior=False)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 2,
  'internal_cv_scor

In [13]:
# 構築したモデルを評価用データで評価(メトリックはf1スコア)
y_pred = model_tpot.predict(X_test)
f1_score(y_true=y_test, y_pred=y_pred)

0.971830985915493

In [14]:
# 評価データの予測結果と正解ラベルを確認
pd.DataFrame(data={'y_pred':y_pred,'y_test':y_test})

Unnamed: 0,y_pred,y_test
204,1,1
70,0,0
131,0,0
431,1,1
540,1,1
...,...,...
486,1,1
75,0,0
249,1,1
238,0,1


In [15]:
# 混同マトリクス
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[41,  2],
       [ 2, 69]])