In [146]:
from matplotlib import pyplot as plt
from utils_Function import *
from utils_Optimizer import *
import seaborn as sn
import numpy as np

In [147]:
import pandas as pd

data = pd.read_csv('../water_potability.csv')
data.head()

# delete nan values
# data.fillna(0)
data = data.dropna()
data.head()
y = data['Potability'].values
X = data.drop(['Potability'], axis=1).values

print(X.shape, y.shape)

(2011, 9) (2011,)


In [148]:
# X_train X_test _ X_val
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, X_val.shape)
train_y = y_train.reshape(-1, 1).T
test_y = y_test.reshape(-1, 1).T
valid_y = y_val.reshape(-1, 1).T
print(train_y.shape, test_y.shape, valid_y.shape)

(1286, 9) (403, 9) (322, 9)
(1, 1286) (1, 403) (1, 322)


In [149]:
X_min = X_train.min(axis=0, keepdims=True)
X_max = X_train.max(axis=0, keepdims=True)

In [150]:
X_train = (X_train - X_min)/(X_max - X_min)
X_val = (X_val - X_min)/(X_max - X_min)
X_test= (X_test - X_min)/(X_max - X_min)

In [151]:
def min_max_normalization(X):
    return (X - np.min(X)) / (np.max(X) - np.min(X))

In [152]:
train_x_flatten = X_train.T   
valid_x_flatten = X_val.T   
test_x_flatten = X_test.T

train_x = min_max_normalization(train_x_flatten)
valid_x = min_max_normalization(valid_x_flatten)
test_x = min_max_normalization(test_x_flatten)

print ("train_x's shape: " + str(train_x.shape))
print ("valid_x's shape: " + str(valid_x.shape))
print ("test_x's shape: " + str(test_x.shape))

train_x's shape: (9, 1286)
valid_x's shape: (9, 322)
test_x's shape: (9, 403)


In [153]:
X_train.shape

(1286, 9)

In [154]:
y_train

array([0, 1, 0, ..., 1, 1, 0])

In [155]:
print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape, test_x.shape, test_y.shape)

(9, 1286) (1, 1286) (9, 322) (1, 322) (9, 403) (1, 403)


In [156]:
layers_dims = [train_x.shape[0], 5, 1] 
keep_probs = [0, 0.2, 0.2]

In [157]:
def L_layer_model(X_train, Y_train, X_valid, Y_valid, 
                  layers_dims, 
                  lambd=0.5, 
                  drop_out=True, 
                  keep_probs=keep_probs, 
                  learning_rate = 0.0055, 
                  mini_batch_size=64, 
                  beta=0.9, beta1=0.9, beta2=0.999, epsilon=1e-8, 
                  num_iterations = 100, 
                  print_cost=False):

    costs_train = []       
    costs_valid = []                 
    t = 0
    parameters = initialize_parameters_deep(layers_dims)
    
    v, s = initialize_adam(parameters)
 
    for i in range(0, num_iterations):
        
        minibatches_train = random_mini_batches(X_train, Y_train, mini_batch_size)
        for minibatch in minibatches_train:
            
            (minibatch_X, minibatch_Y) = minibatch

            AL_train, caches = L_model_forward(minibatch_X, parameters, keep_probs=keep_probs, drop_out=drop_out)
            cost_train = compute_cost(AL_train, minibatch_Y, parameters, lambd)
            
            AL_valid, _ = L_model_forward(X_valid, parameters, keep_probs=keep_probs, drop_out=False)
            cost_valid = compute_cost(AL_valid, Y_valid, parameters, lambd)

            grads = L_model_backward(AL_train, minibatch_Y, caches, lambd, drop_out=drop_out)

            t = t + 1 
            parameters, v, s = update_parameters_with_adam(parameters, 
                                                           grads, 
                                                           v, s,
                                                           t, 
                                                           learning_rate, 
                                                           beta1, beta2, epsilon)

        if print_cost and i % 10 == 0:
            print("Train_cost after iteration %i: %f" %(i, cost_train))
            print("Valid_cost after iteration %i: %f" %(i, cost_valid))
            print("---------------------------------------------")
        if print_cost and i % 10 == 0:
            costs_train.append(cost_train)
            costs_valid.append(cost_valid)
            
    plt.plot(np.squeeze(costs_train), color='blue', label='Train_cost')
    plt.plot(np.squeeze(costs_valid), color='orange', label='Val_cost')
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.legend(loc='lower left', shadow=True)
    plt.show()
    
    return parameters, cost_train, cost_valid

In [None]:
parameters, costs_train, costs_valid = L_layer_model(train_x, train_y, 
                                                     valid_x, valid_y, 
                                                     layers_dims, 
                                                     lambd=0.1, 
                                                     num_iterations = 1000, 
                                                     print_cost = True)

In [None]:
def predict(X, y, parameters):
    m = X.shape[1]
    n = len(parameters) // 2 
    
    probas, caches = L_model_forward(X, parameters, keep_probs= keep_probs, drop_out= False)

    for i in range(0, probas.shape[1]):
        if probas[0,i] > 0.6:
            probas[0,i] = 1
        else:
            probas[0,i] = 0
    
    print("Accuracy: "  + str(np.sum((probas == y)/m)))
        
    return probas
y_pred = predict(test_x, test_y, parameters)

In [160]:
class evaluate():
    def __init__(self, predict, real):
        self.predict = predict
        self.real = real

        # Positive = 1, Negative = 0
        # True Positive: predict = real = Possitive(1)
        self.TP = ((self.predict*self.real) == 1).sum()
        # False Negative: predict = Negative(0) but real = Positive(1)
        self.FN = ((self.real - self.predict) == 1).sum()
        # False Positive: predict = Positive(1) but real = Negative(0)
        self.FP = ((self.predict - self.real) == 1).sum()
        # True Negative: predict = real = Negative(0) 
        self.TN = ((self.predict + self.real) == 0).sum()
    
    def plot_confusion_matrix(self):
        confusion_matrix = np.array([[self.TP, self.FN], [self.FP, self.TN]])
        sn.heatmap(confusion_matrix, annot =True)
        plt.title("Positive = 0, Negative = 1")
        plt.xlabel("Predict")
        plt.ylabel("Actual")
        

    def Accuracy(self):
        # accuracy = True / Total 
        return (self.TP + self.TN)/(self.TP + self.FN + self.FP + self.TN)

    def Precision(self):
        # trong số những dữ liệu dự đoán là positive có bao nhiêu cái là đúng
        return self.TP / (self.TP + self.FP)

    def Recall(self):
        # trong số những dữ liệu thực sự là positive dự đoán đúng bao nhiêu
        return self.TP / (self.TP + self.FN)

    def F1_Score(self):
        return (2*self.TP) / (2*self.TP + self.FP + self.FN)

In [None]:
check = evaluate(y_pred, test_y)
check.plot_confusion_matrix()

In [162]:
print("Accuracy",check.Accuracy())
print("Precision",check.Precision())
print("Recall",check.Recall())
print("F1_Score",check.F1_Score())

Accuracy 0.5732009925558312
Precision nan
Recall 0.0
F1_Score 0.0


  return self.TP / (self.TP + self.FP)


In [163]:


import joblib

# Lưu trạng thái của mô hình
joblib.dump(parameters, '../nuoc.h5')




['../nuoc.h5']