In [1]:
import time
import numpy as np
import h5py
import matplotlib.pyplot as plt
import scipy
import pandas as pd
from sklearn import linear_model, svm, model_selection, metrics
from sklearn.utils import shuffle

seed = 42

np.random.seed(seed)


df = pd.read_csv('C:/Users/171475/train.csv')
OutOfSample = pd.read_csv('C:/Users/171475/test.csv')

df['var3'] = df['var3'].replace(-999999, 2)


train_df, test_df = model_selection.train_test_split(df, test_size=0.2, random_state=seed)

train_1 = train_df[train_df["TARGET"] == 1]
train_0 = train_df[train_df["TARGET"] == 0]
number_of_0_samples = int(np.floor(train_1.shape[0]*6/4))
train_0_balanced = train_0.sample(n=number_of_0_samples, random_state = seed)

train_balanced = shuffle(pd.concat([train_1, train_0_balanced]))

number_of_ones = sum(train_balanced['TARGET'])
number_of_zeros = train_balanced.shape[0] - sum(train_balanced['TARGET'])

train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

OutOfSample.fillna(0, inplace=True)

train_y = train_balanced[["TARGET"]].T
test_y = test_df[["TARGET"]].T

m_train = train_balanced.shape[0]
m_test = test_df.shape[0]

some_columns = (train_balanced.max() != 0) & (train_balanced.min() >= 0)
train_x_orig = train_balanced.loc[:, some_columns]
test_x_orig = test_df.loc[:, some_columns]


print ("Number of training examples: " + str(m_train))
print ("Number of testing examples: " + str(m_test))
print ("train_x_orig shape: " + str(train_x_orig.shape))
print ("in training set number of zeros is: {} and number of ones: {} ".format(number_of_zeros, number_of_ones))
print ("train_y shape: " + str(train_y.shape))
print ("test_x_orig shape: " + str(test_x_orig.shape))
print ("test_y shape: " + str(test_y.shape))

max_of_train = train_x_orig.max()
train_x = train_x_orig / max_of_train
train_x = train_x.T

test_x = test_x_orig / max_of_train
test_x = test_x.T

print ("train_x's shape: " + str(train_x.shape))
print ("test_x's shape: " + str(test_x.shape))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Number of training examples: 6002
Number of testing examples: 15204
train_x_orig shape: (6002, 244)
in training set number of zeros is: 3601 and number of ones: 2401 
train_y shape: (1, 6002)
test_x_orig shape: (15204, 244)
test_y shape: (1, 15204)
train_x's shape: (244, 6002)
test_x's shape: (244, 15204)


In [2]:
def sigmoid(Z):
    """
    Implements the sigmoid activation in numpy

    Arguments:
    Z -- numpy array of any shape

    Returns:
    A -- output of sigmoid(z), same shape as Z
    cache -- returns Z as well, useful during backpropagation
    """

    A = 1 / (1 + np.exp(-Z))
    cache = Z

    return A, cache


In [3]:
def sigmoid_backward(dA, cache):
    """
    Implement the backward propagation for a single SIGMOID unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z' where we store for computing backward propagation efficiently

    Returns:
    dZ -- Gradient of the cost with respect to Z
    """

    Z = cache

    s = 1 / (1 + np.exp(-Z))
    dZ = dA * s * (1 - s)

    assert (dZ.shape == Z.shape)

    return dZ


In [4]:
def relu(Z):
    """
    Implement the RELU function.

    Arguments:
    Z -- Output of the linear layer, of any shape

    Returns:
    A -- Post-activation parameter, of the same shape as Z
    cache -- a python dictionary containing "A" ; stored for computing the backward pass efficiently
    """

    A = np.maximum(0, Z)

    assert (A.shape == Z.shape)

    cache = Z
    return A, cache

In [5]:
def relu_backward(dA, cache):
    """
    Implement the backward propagation for a single RELU unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z' where we store for computing backward propagation efficiently

    Returns:
    dZ -- Gradient of the cost with respect to Z
    """

    Z = cache
    dZ = np.array(dA, copy=True)  # just converting dz to a correct object.

    # When z <= 0, you should set dz to 0 as well.
    dZ[Z <= 0] = 0

    assert (dZ.shape == Z.shape)

    return dZ

In [6]:
def initialize_parameters(n_x, n_h, n_y):
    """
    Argument:
    n_x -- size of the input layer
    n_h -- size of the hidden layer
    n_y -- size of the output layer

    Returns:
    parameters -- python dictionary containing your parameters:
                    W1 -- weight matrix of shape (n_h, n_x)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (n_y, n_h)
                    b2 -- bias vector of shape (n_y, 1)
    """

    np.random.seed(seed)

    W1 = np.random.randn(n_h, n_x) * 0.1
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.1
    b2 = np.zeros((n_y, 1))

    assert (W1.shape == (n_h, n_x))
    assert (b1.shape == (n_h, 1))
    assert (W2.shape == (n_y, n_h))
    assert (b2.shape == (n_y, 1))

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}

    return parameters

In [7]:
parameters = initialize_parameters(3, 2, 1)
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))

W1 = [[ 0.04967142 -0.01382643  0.06476885]
 [ 0.15230299 -0.02341534 -0.0234137 ]]
b1 = [[ 0.]
 [ 0.]]
W2 = [[ 0.15792128  0.07674347]]
b2 = [[ 0.]]


In [8]:
def linear_forward(A, W, b):
    """
    Implement the linear part of a layer's forward propagation.

    Arguments:
    A -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)

    Returns:
    Z -- the input of the activation function, also called pre-activation parameter
    cache -- a python dictionary containing "A", "W" and "b" ; stored for computing the backward pass efficiently
    """

    Z = np.dot(W, A) + b


    assert (Z.shape == (W.shape[0], A.shape[1]))
    cache = (A, W, b)

    return Z, cache


In [9]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- a python dictionary containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
    """

    if activation == "sigmoid":

        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)


    elif activation == "relu":

        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)


    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)

    return A, cache

In [10]:
def compute_cost(AL, Y):
    """
    Implement the cost function defined by equation (7).

    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost
    """

    m = Y.shape[1]

    # Compute loss from aL and y.

    cost = -1 / m * (np.dot(np.log(AL), Y.T) + np.dot(np.log(1 - AL), 1 - Y.T))

    cost = np.squeeze(cost)  # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).
    assert (cost.shape == ())

    return cost



In [11]:
def linear_backward(dZ, cache):
    """
    Implement the linear portion of backward propagation for a single layer (layer l)

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]


    dW = (1 / m) * np.dot(dZ, A_prev.T)
    db = 1./m * np.sum(dZ, axis = 1, keepdims = True)
    dA_prev = np.dot(W.T, dZ)


    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)

    return dA_prev, dW, db


In [12]:
def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for current layer l
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    linear_cache, activation_cache = cache

    if activation == "relu":

        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)


    elif activation == "sigmoid":

        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)


    return dA_prev, dW, db


In [13]:
def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent

    Arguments:
    parameters -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients, output of L_model_backward

    Returns:
    parameters -- python dictionary containing your updated parameters
                  parameters["W" + str(l)] = ...
                  parameters["b" + str(l)] = ...
    """

    parameters['W1'] = parameters["W1"] - learning_rate * grads["dW1"]
    parameters['b1'] = parameters["b1"] - learning_rate * grads["db1"]
    parameters['W2'] = parameters["W2"] - learning_rate * grads["dW2"]
    parameters['b2'] = parameters["b2"] - learning_rate * grads["db2"]

    return parameters

In [14]:
def predict(X, y, parameters):
    """
    This function is used to predict the results of a  L-layer neural network.

    Arguments:
    X -- data set of examples you would like to label
    parameters -- parameters of the trained model

    Returns:
    p -- predictions for the given dataset X
    """

    m = X.shape[1]
    
    p = np.zeros((1, m))

    # Forward propagation
    W1 = parameters['W1']
    b1 = parameters['b2']
    W2 = parameters['W2']
    b2 = parameters['b2']
    A1, cache1 = linear_activation_forward(X, W1, b1, 'relu')
    probas, cache2 = linear_activation_forward(A1, W2, b2, 'sigmoid')
    # probas, caches = L_model_forward(X, parameters)

    # convert probas to 0/1 predictions
    for i in range(0, probas.shape[1]):
        if probas[0, i] > 0.5:
            p[0, i] = 1
        else:
            p[0, i] = 0

    # print results
    # print ("predictions: " + str(p))
    # print ("true labels: " + str(y))
    print("Accuracy: " + str(np.sum((p == y) / m)))

    return p, probas

In [None]:
def two_layer_model(X, Y, layers_dims, learning_rate = 0.001, num_iterations = 4500, print_cost=False):
    """
    Implements a two-layer neural network: LINEAR->RELU->LINEAR->SIGMOID.
    
    Arguments:
    X -- input data, of shape (n_x, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat), of shape (1, number of examples)
    layers_dims -- dimensions of the layers (n_x, n_h, n_y)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- If set to True, this will print the cost every 100 iterations 
    
    Returns:
    parameters -- a dictionary containing W1, W2, b1, and b2
    """
    
    np.random.seed(seed)
    grads = {}
    costs = []                              # to keep track of the cost
    m = X.shape[1]                           # number of examples
    (n_x, n_h, n_y) = layers_dims

    parameters = initialize_parameters(n_x, n_h, n_y)

    # Get W1, b1, W2 and b2 from the dictionary parameters.
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    # Loop (gradient descent)

    for i in range(0, num_iterations):

        A1, cache1 = linear_activation_forward(X, W1, b1, 'relu')
        A2, cache2 = linear_activation_forward(A1, W2, b2, 'sigmoid')
        
        # Compute cost

        cost = compute_cost(A2, Y)
        
        # Initializing backward propagation
        dA2 = - (np.divide(Y, A2) - np.divide(1 - Y, 1 - A2))
        
        # Backward propagation. Inputs: "dA2, cache2, cache1". Outputs: "dA1, dW2, db2; also dA0 (not used), dW1, db1".

        dA1, dW2, db2 = linear_activation_backward(dA2, cache2, 'relu')
        dA0, dW1, db1 = linear_activation_backward(dA1, cache1, 'sigmoid')
        
        # Set grads['dWl'] to dW1, grads['db1'] to db1, grads['dW2'] to dW2, grads['db2'] to db2
        grads['dW1'] = dW1
        grads['db1'] = db1
        grads['dW2'] = dW2
        grads['db2'] = db2
        
        # Update parameters.
        parameters = update_parameters(parameters, grads, learning_rate)

        # Retrieve W1, b1, W2, b2 from parameters
        W1 = parameters["W1"]
        b1 = parameters["b1"]
        W2 = parameters["W2"]
        b2 = parameters["b2"]
        
        # Print the cost every 100 training example
        if print_cost and i % 100 == 0:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
            print("Gini after iteration {}: {}".format(i, (2*metrics.roc_auc_score(Y.T, A2.T))-1))
            predictions_test, probability_test = predict(np.array(test_x), np.array(test_y), parameters)
            print("Gini after iteration in test set {}: {}".format(i, (2*metrics.roc_auc_score(test_y.T, probability_test.T))-1))
        if print_cost and i % 100 == 0:
            costs.append(cost)
       
    # plot the cost

    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per tens)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()
    
    return parameters


In [None]:
n_x = train_x.shape[0]
n_h = 50
n_y = 1
layers_dims = (n_x, n_h, n_y)

parameters = two_layer_model(np.array(train_x), np.array(train_y), layers_dims=(n_x, n_h, n_y),num_iterations = 6000, print_cost=True)

predictions_train, probability_train = predict(np.array(train_x), np.array(train_y), parameters)

Cost after iteration 0: 0.6957581420684308
Gini after iteration 0: 0.3288103945396261
Accuracy: 0.161141804788
Gini after iteration in test set 0: 0.2985042739142423
Cost after iteration 100: 0.6838865707251125
Gini after iteration 100: 0.5365818255167909
Accuracy: 0.422717705867
Gini after iteration in test set 100: 0.5197083555906581
Cost after iteration 200: 0.6782064457953941
Gini after iteration 200: 0.6694379285868692
Accuracy: 0.471323335964
Gini after iteration in test set 200: 0.6442790991220579
Cost after iteration 300: 0.6735370380284267
Gini after iteration 300: 0.7638163585685449
Accuracy: 0.476913970008
Gini after iteration in test set 300: 0.7347129281941553
Cost after iteration 400: 0.6690643459361667
Gini after iteration 400: 0.8292230130438341
Accuracy: 0.477045514338
Gini after iteration in test set 400: 0.8008426050398072
Cost after iteration 500: 0.6648809656484448
Gini after iteration 500: 0.8759553694245468
Accuracy: 0.475401210208
Gini after iteration in test se

In [None]:
predictions_train, probability_train = predict(np.array(train_x), np.array(train_y), parameters)