Estudo do algoritmo de aprendizagem de máquina: **CatBoostClassifier**

Desenvolvido pela empresa Yandex, apontado como o melhor algoritmo quando comparado ao XGBoost e ao LightGBM
*   Baseado em árvore de decisão
*   Predição rápida e eficiente
*   



In [None]:
pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/1e/21/d1718eb4c93d6bacdd540b3792187f32ccb1ad9c51b9c4f10875d63ec176/catboost-0.25-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 63kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25


In [None]:
import numpy as np
from catboost import CatBoostClassifier, Pool
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

#### Utilizando [exemplo](https://boosting-doc.readthedocs.io/zh_CN/latest/catboost/)

In [None]:
# Synthetic feature matrix: 100 rows by 10 columns of random integers in [0, 100).
train_data = np.random.randint(low=0, high=100, size=(100, 10))

In [None]:
# Dataset - 100 rows and 10 columns of integers in the range [0, 100)
train_data

In [None]:
# Synthetic binary target: one random label (0 or 1) for each of the 100 rows.
train_labels = np.random.randint(low=0, high=2, size=100)


In [None]:
# Labels - classification column holding the values 0 and 1
train_labels

In [None]:
# Wrap the training matrix and its labels in a CatBoost Pool. Both names are
# bound to the same Pool object.
catboost_pool = Pool(data=train_data, label=train_labels)
test_data = catboost_pool

In [None]:
# Build the binary classifier (two shallow trees, log-loss objective)
model = CatBoostClassifier(
    iterations=2,
    depth=2,
    learning_rate=1,
    loss_function="Logloss",
    verbose=True,
)

In [None]:
# Train the model on the synthetic features and labels
model.fit(X=train_data, y=train_labels)

0:	learn: 0.6408134	total: 48.5ms	remaining: 48.5ms
1:	learn: 0.6280534	total: 48.9ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f998ba06690>

In [None]:
# Predict with the fitted model: class labels first, then class probabilities
y_pred = model.predict(test_data)
y_pred_proba = model.predict_proba(test_data)

print("class = ", y_pred)
print("proba = ", y_pred_proba)

#### Iris

In [None]:
url = 'https://raw.githubusercontent.com/profmoisesomena/escience_and_tools/master/data/iris.csv'
# NOTE(review): `names` is never passed to read_csv below; header=0 makes the
# first row of the CSV supply the column names instead.
names = ['sepal-length','sepal-width', 'petal-length', 'petal-width', 'class']

# Data
dataset = pd.read_csv(url,header=0)

In [None]:
dataset.values

In [None]:
# Labels - the class of each sample, taken from the `species` column
labels = dataset['species'].values

In [None]:
# Feature matrix: every column except the `species` label column
data = dataset.drop(columns=['species']).values

In [None]:
# Build the multi-class classifier
model_iris = CatBoostClassifier(
    iterations=5,
    depth=2,
    learning_rate=1,
    loss_function="MultiClass",
    verbose=True,
)

In [None]:
# Train the model on the iris features and labels
model_iris.fit(X=data, y=labels)

0:	learn: 0.2581534	total: 46.5ms	remaining: 186ms
1:	learn: 0.1639388	total: 47.5ms	remaining: 71.2ms
2:	learn: 0.1372688	total: 48.1ms	remaining: 32.1ms
3:	learn: 0.1224746	total: 48.6ms	remaining: 12.2ms
4:	learn: 0.1007498	total: 49.2ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f53569bcd10>

In [None]:
# Wrap the iris features and labels in a CatBoost Pool. Both names are bound
# to the same Pool object.
catboost_pool = Pool(data=data, label=labels)
test_data = catboost_pool

In [None]:
# Predict with the fitted model: class labels first, then class probabilities
y_pred = model_iris.predict(test_data)
y_prob = model_iris.predict_proba(test_data)
print("class = ", y_pred)
print("proba = ", y_prob)

In [None]:
model_iris.predict([4.9,3.0,1.4,0.2])

array(['Iris-setosa'], dtype=object)

In [None]:
dataset['predito'] = y_pred

In [None]:
dataset.head()

Trabalhando o modelo

In [None]:
# Cross-validated accuracy of the iris model. No `cv` is passed, so
# scikit-learn's default splitting strategy is used.
n_scores = cross_val_score(model_iris, data, labels, scoring='accuracy', n_jobs=-1, error_score='raise')
# The metric is accuracy, so label it "Acurácia" ("Precisão" means precision,
# which is a different metric). Printed as mean (standard deviation).
print(f'Acurácia: {np.mean(n_scores):.3f} ({np.std(n_scores):.3f})')

Precisão: 0.960 (0.039)


In [None]:
print(model_iris.feature_importances_)

[ 3.67131966  2.83790384 10.40336266 83.08741384]


In [None]:
model_iris.predict([5.3, 3.3, 1.0, 0.5])

array(['Iris-setosa'], dtype=object)

#### Titanic

In [None]:
url = 'https://raw.githubusercontent.com/helenfranca/lap1/helen/data_titanic.csv'

# Data
data_titanic = pd.read_csv(url)

In [None]:
data_titanic.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,F,S,mr
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C,mrs
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,F,S,miss
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S,mrs
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,F,S,mr


In [None]:
# Remove, in place, the columns that are not used as features
data_titanic.drop(columns=['Unnamed: 0', 'Name', 'PassengerId'], inplace=True)


In [None]:
# Remove, in place, every row whose `Embarked` value is missing
data_titanic.dropna(subset=['Embarked'], inplace=True)

In [None]:
data_titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,3,male,22.0,1,0,A/5 21171,7.25,F,S,mr
1,1,1,female,38.0,1,0,PC 17599,71.2833,C,C,mrs
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,F,S,miss
3,1,1,female,35.0,1,0,113803,53.1,C,S,mrs
4,0,3,male,35.0,0,0,373450,8.05,F,S,mr


Separando os dados do label


In [None]:
# Split the frame into the target vector and the feature matrix
labels_titanic = data_titanic['Survived'].values
titanic = data_titanic.drop(columns=['Survived']).values

In [None]:
titanic[0]

array([3, 'male', 22.0, 1, 0, 'A/5 21171', 7.25, 'F', 'S', 'mr'],
      dtype=object)

Criando modelo


In [None]:
# Hand-tuned model: ten shallow trees. `cat_features` lists the positions of
# the columns CatBoost must treat as categorical.
model_titanic = CatBoostClassifier(
    iterations=10,
    depth=2,
    learning_rate=1,
    loss_function="Logloss",
    verbose=True,
    cat_features=[1, 5, 7, 8, 9],
)

In [None]:
# Second model: library defaults, only the categorical column positions are set
categorical_positions = [1, 5, 7, 8, 9]
model_titanic2 = CatBoostClassifier(cat_features=categorical_positions)


Treinando o modelo

In [None]:
model_titanic.fit(titanic, labels_titanic)

0:	learn: 0.4864554	total: 1.18ms	remaining: 10.6ms
1:	learn: 0.4412554	total: 2.55ms	remaining: 10.2ms
2:	learn: 0.4257501	total: 3.4ms	remaining: 7.93ms
3:	learn: 0.4245040	total: 4.28ms	remaining: 6.41ms
4:	learn: 0.4146109	total: 5.17ms	remaining: 5.17ms
5:	learn: 0.4084099	total: 5.97ms	remaining: 3.98ms
6:	learn: 0.4002254	total: 6.72ms	remaining: 2.88ms
7:	learn: 0.3948135	total: 7.54ms	remaining: 1.89ms
8:	learn: 0.3895441	total: 8.33ms	remaining: 925us
9:	learn: 0.3873966	total: 9.1ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f534f779490>

In [None]:
model_titanic2.fit(titanic,labels_titanic)

In [None]:
# Wrap the Titanic features and labels in a CatBoost Pool, declaring the
# categorical column positions. Both names are bound to the same Pool object.
catboost_pool = Pool(data=titanic, label=labels_titanic, cat_features=[1, 5, 7, 8, 9])
test_titanic = catboost_pool

In [None]:
# Predict with the hand-tuned model (the predictions are stored, not printed)
y_pred = model_titanic.predict(test_titanic)


In [None]:
y_pred2 = model_titanic2.predict(test_titanic)

In [None]:
data_titanic["sobreviveu"] = y_pred

In [None]:
data_titanic["predito2"] = y_pred2

In [None]:
data_titanic

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,0,3,male,22.000000,1,0,A/5 21171,7.2500,F,S,mr
1,1,1,female,38.000000,1,0,PC 17599,71.2833,C,C,mrs
2,1,3,female,26.000000,0,0,STON/O2. 3101282,7.9250,F,S,miss
3,1,1,female,35.000000,1,0,113803,53.1000,C,S,mrs
4,0,3,male,35.000000,0,0,373450,8.0500,F,S,mr
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,211536,13.0000,F,S,rev
887,1,1,female,19.000000,0,0,112053,30.0000,B,S,miss
888,0,3,female,21.773973,1,2,W./C. 6607,23.4500,F,S,miss
889,1,1,male,26.000000,0,0,111369,30.0000,C,C,mr


In [None]:

# Row labels where each model's prediction disagrees with the recorded outcome
wrong_tuned = data_titanic['Survived'].ne(data_titanic['sobreviveu'])
wrong_default = data_titanic['Survived'].ne(data_titanic['predito2'])
index2 = data_titanic.index[wrong_tuned]
index3 = data_titanic.index[wrong_default]

# Report the number of misclassified rows for each model
print("Comparando diferença de acertos entre as predições")
print("Sobreviveu x Survived: ", index2.size)
print("Survived x Predito2: ", index3.size)


Comparando diferença de acertos entre as predições
Sobreviveu x Survived:  89
Survived x Predito2:  74


Ao que parece, a configuração padrão do modelo deu um resultado mais próximo do esperado, com uma quantidade menor de erros. Vale notar que essa comparação foi feita sobre os mesmos dados usados no treinamento.

In [None]:
# Cross-validated accuracy of the hand-tuned Titanic model. No `cv` is passed,
# so scikit-learn's default splitting strategy is used.
n_scores = cross_val_score(model_titanic, titanic, labels_titanic, scoring='accuracy', n_jobs=-1, error_score='raise')
# The metric is accuracy, so label it "Acurácia" ("Precisão" means precision,
# which is a different metric). Printed as mean (standard deviation).
print(f'Acurácia: {np.mean(n_scores):.3f} ({np.std(n_scores):.3f})')

Precisão: 0.826 (0.010)


In [None]:
# Cross-validated accuracy of the default-parameter Titanic model. No `cv` is
# passed, so scikit-learn's default splitting strategy is used.
n_scores = cross_val_score(model_titanic2, titanic, labels_titanic, scoring='accuracy', n_jobs=-1, error_score='raise')
# The metric is accuracy, so label it "Acurácia" ("Precisão" means precision,
# which is a different metric). Printed as mean (standard deviation).
print(f'Acurácia: {np.mean(n_scores):.3f} ({np.std(n_scores):.3f})')

Precisão: 0.826 (0.017)


In [None]:
model_titanic.predict([2, 'female', 30.0, 1, 0, 'A/5 21171', 7.25, 'F', 'S', 'mrs'])

1

In [None]:
model_titanic2.predict([2, 'female', 30.0, 1, 0, 'A/5 21171', 7.25, 'F', 'S', 'mrs'])

1

Alguns Testes Extras e Aleatórios

In [None]:

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [None]:
# Hold-out evaluation of a default CatBoostClassifier on the Titanic data.
target_column = 'Survived'
# 'sobreviveu' and 'predito2' hold predictions produced by models fitted on
# these same rows; using them as features would leak the label into X.
# errors='ignore' keeps this cell usable when those columns were never added.
prediction_columns = ['sobreviveu', 'predito2']

X = data_titanic.drop(columns=[target_column] + prediction_columns, errors='ignore').values
# Take the target straight from its column so it is a 1-D array with the
# column's own dtype, not a 2-D object-dtype slice of the mixed-type matrix
# (scikit-learn's metrics do not treat such a slice as a binary target).
y = data_titanic[target_column].values
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.25, random_state=0, shuffle=True)

# The categorical positions refer to the columns of X, i.e. the frame without
# the target and prediction columns.
model = CatBoostClassifier(cat_features=[1,5,7,8,9])
model.fit(X_train, Y_train)

predictions_train = model.predict(X_train)
predictions_validation = model.predict(X_validation)

# Accuracy on the training split first, then on the held-out validation split
print(accuracy_score(Y_train, predictions_train))
print(accuracy_score(Y_validation, predictions_validation))