In [69]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
dados = pd.read_csv('dados_tratados.csv')

### Seleção de atributos

Análise de correlação

In [70]:
df = dados.copy()
corr_matrix = df.corr()
corr_matrix

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,evalutation
buying,1.0,-1.356939e-15,-3.4334680000000003e-17,-1.530089e-16,-1.045866e-16,-4.082114e-16,-0.28275
maint,-1.356939e-15,1.0,-2.547345e-16,-4.265105e-17,8.544286e-17,-7.826398000000001e-17,-0.232422
doors,-3.4334680000000003e-17,-2.547345e-16,1.0,1.956599e-17,-2.674488e-18,3.406454e-17,0.066057
persons,-1.530089e-16,-4.265105e-17,1.956599e-17,1.0,-9.251859000000001e-18,1.137208e-17,0.341707
lug_boot,-1.045866e-16,8.544286e-17,-2.674488e-18,-9.251859000000001e-18,1.0,1.927471e-18,0.157932
safety,-4.082114e-16,-7.826398000000001e-17,3.406454e-17,1.137208e-17,1.927471e-18,1.0,0.439337
evalutation,-0.2827504,-0.2324215,0.06605665,0.3417068,0.1579317,0.4393373,1.0


Principal Component Analysis (PCA)

In [71]:
df = dados.copy()
from sklearn.decomposition import PCA
X = df.drop('evalutation',axis=1)
y = df['evalutation']
pca = PCA(n_components=4)
X_pca = pca.fit_transform(X)
X_pca

array([[ 1.5       ,  1.5       ,  1.5       , -0.68122185],
       [ 1.5       ,  1.5       ,  1.5       ,  0.05610062],
       [ 1.5       ,  1.5       ,  1.5       ,  0.79342309],
       ...,
       [-1.5       , -1.5       , -1.5       , -0.79342309],
       [-1.5       , -1.5       , -1.5       , -0.05610062],
       [-1.5       , -1.5       , -1.5       ,  0.68122185]])

### Detecção de outliers

In [72]:
from sklearn.ensemble import IsolationForest
df = dados.copy()
x = df.iloc[:,:].values
clf = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.1,
                      max_features=1.0, bootstrap=False, random_state=0)
clf.fit(x)

IsolationForest(contamination=0.1, random_state=0)

In [73]:
y_pred = clf.predict(x)
df['outlier'] = y_pred

In [74]:
df.outlier.value_counts()

 1    1555
-1     173
Name: outlier, dtype: int64

Interpretação: -1 : Outliers (fora do padrão), 1: Inliers, dentro do padrão

Outro algoritmo

In [75]:
from sklearn.covariance import EllipticEnvelope
df = dados.copy()
robust_cov = EllipticEnvelope(random_state=0, contamination=0.1)
x = df.iloc[:,:].values
robust_cov.fit(x)

EllipticEnvelope(random_state=0)

In [76]:
outliers = robust_cov.predict(x) == -1
df['outlier'] = outliers


In [77]:
df.outlier.value_counts()

False    1555
True      173
Name: outlier, dtype: int64

### Separação em dados de treino e teste

In [78]:
df = dados.copy()
from sklearn.model_selection import train_test_split
x = df.drop('evalutation', axis=1)
y = df['evalutation']

In [79]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [80]:
x_train.shape,x_test.shape

((1382, 6), (346, 6))

In [81]:
y_train.shape,y_test.shape

((1382,), (346,))

### Balanceamento de classes
Somente para problemas de classificação e somente com dados de treino

In [82]:
df = dados.copy()
df.evalutation.value_counts()

0    1210
1     384
2      69
3      65
Name: evalutation, dtype: int64

In [83]:
x = df.drop('evalutation', axis=1)
y = df['evalutation']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [84]:
y_train.value_counts()


0    975
1    301
2     58
3     48
Name: evalutation, dtype: int64

#### Oversampling simples

In [85]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
x_train_balanceado,y_train_balanceado = ros.fit_resample(x_train,y_train)

In [86]:
y_train_balanceado.value_counts()

0    975
1    975
3    975
2    975
Name: evalutation, dtype: int64

#### Undersampling simples

In [87]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
x_train_balanceado,y_train_balanceado = rus.fit_resample(x_train,y_train)

In [89]:
y_train_balanceado.value_counts()

0    48
1    48
2    48
3    48
Name: evalutation, dtype: int64

#### Métodos mais avançados

In [98]:
from imblearn.combine import SMOTEENN
smen = SMOTEENN(random_state=42)
x_train_balanceado,y_train_balanceado = smen.fit_resample(x_train,y_train)

In [100]:
y_train_balanceado.value_counts()

3    971
2    935
1    822
0    683
Name: evalutation, dtype: int64

In [101]:
from imblearn.combine import SMOTETomek
smto = SMOTETomek(random_state=42)
x_train_balanceado,y_train_balanceado = smto.fit_resample(x_train,y_train)

In [102]:
y_train_balanceado.value_counts()

0    975
1    975
3    975
2    975
Name: evalutation, dtype: int64

### Exemplo de treino de algoritmo de Machine Learning de Classificação com os dados balanceados pelo SMEN

In [103]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier()
random_forest.fit(x_train_balanceado,y_train_balanceado)


RandomForestClassifier()

In [104]:
y_pred = random_forest.predict(x_test)

In [107]:
y_test.values

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 1, 1,
       1, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 3,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 3, 0, 0, 1, 3, 1, 0, 1,
       3, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 3, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 3, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       3, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 3, 0, 0, 0, 1, 0, 0, 3, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 2, 3, 1, 1, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0,

In [106]:
y_pred

array([0, 2, 0, 1, 0, 1, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 1, 1,
       2, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 3,
       0, 1, 1, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 3, 0, 0, 1, 3, 1, 0, 1,
       3, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 3, 0, 1, 1, 0, 2, 0, 0, 0, 0, 0,
       2, 0, 0, 1, 0, 3, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 3, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 3,
       3, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 3, 0, 0, 0, 1, 0, 0, 3, 0, 0, 2, 2, 0, 1, 0, 1, 1, 0, 0,
       1, 2, 3, 1, 1, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0,

In [109]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       235
           1       0.99      0.88      0.93        83
           2       0.53      0.82      0.64        11
           3       0.85      1.00      0.92        17

    accuracy                           0.97       346
   macro avg       0.84      0.92      0.87       346
weighted avg       0.97      0.97      0.97       346



Fazendo o mesmo processo com os dados sem balanceamento

In [110]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier()
random_forest.fit(x_train,y_train)


RandomForestClassifier()

In [111]:
y_pred = random_forest.predict(x_test)

In [112]:
y_test.values

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 1, 1,
       1, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 3,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 3, 0, 0, 1, 3, 1, 0, 1,
       3, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 3, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 3, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       3, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 3, 0, 0, 0, 1, 0, 0, 3, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 2, 3, 1, 1, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0,

In [113]:
y_pred

array([0, 2, 0, 1, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 1, 1,
       2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 3,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 3, 0, 0, 1, 3, 1, 0, 1,
       2, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 3, 0, 1, 1, 0, 2, 0, 0, 0, 0, 0,
       2, 0, 0, 1, 0, 3, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 3, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 2, 0, 0, 0, 3,
       3, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 3, 0, 0, 0, 1, 0, 0, 3, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 2, 3, 1, 1, 0, 0, 0, 0, 3, 1, 1, 0, 0, 0,

In [114]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       235
           1       0.97      0.89      0.93        83
           2       0.53      0.91      0.67        11
           3       0.87      0.76      0.81        17

    accuracy                           0.96       346
   macro avg       0.84      0.89      0.85       346
weighted avg       0.97      0.96      0.96       346

