# Classificação - detecção de falhas: anomalias na água

## 0. Processamento dos Dados

### Funções

In [None]:
### FUNCTIONS ####

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Plot the confusion matrix of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    classes : sequence of str
        Tick labels for both axes. They are applied positionally, so they
        must follow the row/column order returned by
        ``sklearn.metrics.confusion_matrix`` (sorted label order).
    normalize : bool, default False
        If True, each row is divided by its sum so cells show the fraction
        of each true class. Rows with no samples are shown as zeros.
    title : str, optional
        Plot title; a default depending on ``normalize`` is used when empty.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap used for the heat map.

    Returns
    -------
    matplotlib.axes.Axes
        The axes holding the rendered matrix.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Guarded division: a true class without samples has a zero row sum,
        # which would otherwise produce NaN cells and a NaN colour threshold.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; text switches to white on dark (high-value) cells.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

### Carregando dados de treinamento

In [2]:
### LOADING TRAIN DATA ###

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

path_train = '../Data/Fault Detection/training.csv'
path_test = '../Data/Fault Detection/testing.csv'


# Load the training data
train = pd.read_csv(path_train)

# Adjust data
# Drop the unnamed index column; copy so later assignments hit a real frame.
train = train.loc[:, ~train.columns.str.contains('^Unnamed')].copy()
train['Time'] = pd.to_datetime(train['Time'])  # Convert to timestamp
# Convert columns 1..9 to numeric; unparseable entries become NaN.
# Assigning by label (not iloc) makes sure the column dtype really changes.
train_numeric_cols = train.columns[1:10]
train[train_numeric_cols] = train[train_numeric_cols].apply(
    pd.to_numeric, errors='coerce')

###PLOT TRAIN DATA####
#plt.scatter(train.loc[~train['EVENT'],'Time'].values, train.loc[~train['EVENT'],'Tp'], s=10, c='blue')
#plt.scatter(train.loc[train['EVENT'],'Time'].values, train.loc[train['EVENT'],'Tp'], s=10, c='red')
#plt.ylabel('Tp'), plt.xlabel('Time'), plt.xticks(rotation=45);

# Simple imputation: carry the last valid observation forward.
# bfill() covers gaps at the very start, which have no previous value
# (indexing row -1 there would wrap around to the end of the frame).
train = train.ffill().bfill()

# Visualize the first rows of the training data
train.head(3)

Unnamed: 0,Time,Tp,Cl,pH,Redox,Leit,Trueb,Cl_2,Fm,Fm_2,EVENT
0,2016-08-03 06:49:00,6.5,0.17,8.36,749.0,211.0,0.011,0.118,1677.0,695.0,False
1,2016-08-03 06:50:00,6.5,0.17,8.36,749.0,211.0,0.011,0.118,1561.0,696.0,False
2,2016-08-03 06:51:00,6.5,0.17,8.35,749.0,211.0,0.011,0.117,1581.0,696.0,False


### Carregando os dados de teste

In [None]:
### LOADING TEST DATA ###

test = pd.read_csv(path_test)

# Adjust data
# Drop the unnamed index column; copy so later assignments hit a real frame.
test = test.loc[:, ~test.columns.str.contains('^Unnamed')].copy()
test['Time'] = pd.to_datetime(test['Time'])  # Convert to timestamp
# Convert columns 1..9 to numeric; unparseable entries become NaN.
# Assigning by label (not iloc) makes sure the column dtype really changes.
test_numeric_cols = test.columns[1:10]
test[test_numeric_cols] = test[test_numeric_cols].apply(
    pd.to_numeric, errors='coerce')

# Simple imputation: carry the last valid observation forward.
# bfill() covers gaps at the very start, which have no previous value
# (indexing row -1 there would wrap around to the end of the frame).
test = test.ffill().bfill()

test.head(3)

## 1. Treinamento do Modelo - Árvore de Decisão

In [None]:
### TRAIN MODEL ####
# Decision tree classifier and the evaluation metrics used in this notebook
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

# Input columns fed to the classifier; 'EVENT' is the target
feature_columns = ['Tp', 'Cl', 'pH', 'Redox', 'Leit', 'Trueb', 'Cl_2', 'Fm', 'Fm_2']

# Fit a tree with default hyper-parameters on the training set
model = DecisionTreeClassifier()
model.fit(train[feature_columns], train['EVENT'])

# Classify the test set
y_pred = model.predict(test[feature_columns])

# Report accuracy with two decimals
accuracy = accuracy_score(test['EVENT'], y_pred)
print('Accuracy: ' + format(accuracy, '.2f'))

### Analyze Results

It is important to analyze the results to evaluate if the metric is telling us the truth.

***We are dealing with an imbalanced problem***. Thus, accuracy may not be a desirable metric.

We need to ***analyze the results with other metrics*** or visualization methods.

To this end, we will use ***Confusion Matrix and F1 score***.

Let's also check the imbalance ratio.

### Signal Preprocessing
To improve the performance of the models, let's try to adjust the data.

One of the problems with this data set is concept drift.

To reduce this problem, let's **detrend** the signals by subtracting a rolling mean from each one.

In [None]:
# Detrend signals: subtract a trailing rolling mean from every column
# except the first and the last one.
ncol = train.shape[1]
window = 1440  # presumably one day of one-minute samples -- TODO confirm
for frame in (train, test):
    for col in range(1, ncol - 1):
        signal = frame.iloc[:, col]
        # min_periods=1 lets the first rows use whatever history exists
        trend = signal.rolling(window=window, min_periods=1).mean()
        frame.iloc[:, col] = signal - trend

### 2. Retreinamento do Modelo - Árvore de Decisão com dados ajustados

In [None]:
# Input columns fed to the classifier; 'EVENT' is the target
feature_columns = ['Tp', 'Cl', 'pH', 'Redox', 'Leit', 'Trueb', 'Cl_2', 'Fm', 'Fm_2']

# Refit a default decision tree on the training data
model = DecisionTreeClassifier()
model.fit(train[feature_columns], train['EVENT'])

# Classify the test set
y_pred = model.predict(test[feature_columns])
y_test = test['EVENT']

# Report accuracy and F1 score with two decimals
print('Accuracy: ' + format(accuracy_score(y_test, y_pred), '.2f'))
print('F1 score: ' + format(f1_score(y_test, y_pred), '.2f'))

# Show the confusion matrix (trailing ';' hides the returned Axes repr)
plot_confusion_matrix(test['EVENT'], y_pred, ['Normal', 'Event']);