## Importing Libraries

In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split

## Preprocessing

In [2]:
df_original = pd.read_csv('/content/mushrooms.csv')
df = df_original.copy()
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [4]:
# No missing values
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [9]:
# divide dataset into X & Y to use different encoding techniques
Y = df['class']
X = df.drop(columns=['class'])
print('X.shape =',X.shape)
print('X.shape =',Y.shape)

X.shape = (8124, 22)
X.shape = (8124,)


In [10]:
#selecting the categ columns
categ_columns = X.select_dtypes(include=['object']).columns
categ_var = X[categ_columns]

#ont hot encoding
enc = OneHotEncoder(handle_unknown='ignore').fit(categ_var)
transformed = enc.fit_transform(X[categ_columns]).toarray()
onehot_df = pd.DataFrame(transformed, columns=enc.get_feature_names())
X = pd.concat([X, onehot_df], axis=1)
X = X.drop(columns=categ_columns)
print(X)

      x0_b  x0_c  x0_f  x0_k  x0_s  x0_x  x1_f  x1_g  x1_s  x1_y  ...  x20_s  \
0      0.0   0.0   0.0   0.0   0.0   1.0   0.0   0.0   1.0   0.0  ...    1.0   
1      0.0   0.0   0.0   0.0   0.0   1.0   0.0   0.0   1.0   0.0  ...    0.0   
2      1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   1.0   0.0  ...    0.0   
3      0.0   0.0   0.0   0.0   0.0   1.0   0.0   0.0   0.0   1.0  ...    1.0   
4      0.0   0.0   0.0   0.0   0.0   1.0   0.0   0.0   1.0   0.0  ...    0.0   
...    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...    ...   
8119   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   1.0   0.0  ...    0.0   
8120   0.0   0.0   0.0   0.0   0.0   1.0   0.0   0.0   1.0   0.0  ...    0.0   
8121   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   1.0   0.0  ...    0.0   
8122   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   1.0  ...    0.0   
8123   0.0   0.0   0.0   0.0   0.0   1.0   0.0   0.0   1.0   0.0  ...    0.0   

      x20_v  x20_y  x21_d  x21_g  x21_l



In [12]:
le = LabelEncoder()
Y = le.fit_transform(Y)
print(Y)
print('Y.shape =', Y.shape)

[1 0 0 ... 0 1 0]
Y.shape = (8124,)


In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 100, stratify = Y)

In [16]:
X_dev, X_test, Y_dev, Y_test = train_test_split(X_test, Y_test, test_size = 0.5, random_state = 100, stratify = Y_test)

In [18]:
data = [X_train, Y_train, X_test, Y_test, X_dev, Y_dev]
for i in range(len(data)):
  if i %2 != 0: 
    data[i] = np.reshape(data[i],(1,data[i].shape[0]))
  else:
    data[i] = data[i].T
  print(data[i].shape)

(117, 6499)
(1, 6499)
(117, 407)
(1, 407)
(117, 406)
(1, 406)


## Build Network

## Initialization

In [19]:
def initialization(dim_array):
    """
    Arguments:
    dim_array -- python array (list) containing the dimensions of each layer in our network and dimension of input layer 1st
    
    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)
    """

    L = len(dim_array)            # number of layers in Network
    np.random.seed(3)
    parameters = {}               # Ws and bs 

    for i in range(1,L):
      parameters["W" + str(i)] = np.random.randn(dim_array[i], dim_array[i-1])*np.sqrt(2/dim_array[i-1])  # used Xavier Initialization to prevent exploding and vanishing vectors
      parameters["b" + str(i)] = np.zeros((dim_array[i],1))                                               # used normal distribution for random selection, reduce chance of extreme values to be selected, better for G.D

    return parameters

In [20]:
parameters = initialization([117,8,4,3,1])

## Forward Propagation

In [21]:
## Helper Functions

def sigmoid(Z):

  """
    Arguments:
    Z -- numpy array
    
    Returns:
    A -- output of sigmoid function
    cache -- contains argument Z
  """

  A = 1 / (1 + np.exp(-Z))
  cache = Z
  return A, cache

def relu(Z):

  """
    Arguments:
    Z -- numpy array
    
    Returns:
    A -- output of RELU function
    cache -- contains argument Z
  """
  A = np.maximum(0,Z)
  cache = Z
  return A, cache

In [145]:
def linear_forward(W,A,b):
  """
    Arguments:
    W -- numpy array representing weight
    A -- input of RELU function
    
    Returns:
    A -- output of sigmoid function
    cache -- contains argument Z
  """
  Z = np.dot(W,A) + b
  cache = (A, W, b)
  return Z, cache

In [146]:
def choose_activation(A_prev,W,b,activation):
  
  if activation == 'sigmoid':
    Z, linear_cache = linear_forward(W,A_prev,b)
    A, activation_cache = sigmoid(Z)

  elif activation == 'relu':
    Z, linear_cache = linear_forward(W,A_prev,b)
    A, activation_cache = relu(Z)
  cache = (linear_cache, activation_cache)
  return A,cache

In [135]:
def forward_propagation(X,parameters):
  L = len(parameters.keys())//2  
  A = X
  caches = []
  # LINEAR -- RELU 3 TIMES
  for i in range(1,L):
    A_prev = A
    A, relu_cache = choose_activation(                      A_prev,parameters["W" + str(i)],
                      parameters["b" + str(i)],
                      activation='relu')
    caches.append(relu_cache)

  # LINEAR -- SIGMOID 1 TIME
  #print(pd.Series(A[0]).describe())
  AL, sigmoid_cache = choose_activation(                        A_prev,parameters["W" + str(i)],
                        parameters["b" + str(i)],
                        activation='sigmoid')
  caches.append(sigmoid_cache)
  return AL, caches

In [148]:
AL, caches = forward_propagation(X_train.T,parameters)

In [138]:
pd.Series(AL[0]).describe()

count    6499.000000
mean        0.607726
std         0.065018
min         0.397974
25%         0.560229
50%         0.612193
75%         0.656759
max         0.750516
dtype: float64

## Cost

In [158]:
def cost_function(AL,y):

  m = y.shape[1]
  cost = -1/m*(np.sum(y*np.log(AL)+(1-y)*np.log(1-AL)))
  cost = np.squeeze(cost) 
  return cost

In [160]:
Y_train_reshaped = np.reshape(Y_train,(1,6499))
cost_function(AL,Y_train_reshaped)

0.6743943096030101

## BACKPRPO


In [162]:
dAL = - (np.divide(Y_train_reshaped, AL) - np.divide(1 - Y_train_reshaped, 1 - AL))


In [164]:
def relu_backward(dA, cache):
    """
    Implement the backward propagation for a single RELU unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z' where we store for computing backward propagation efficiently

    Returns:
    dZ -- Gradient of the cost with respect to Z
    """
    
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0
    
    
    return dZ

def sigmoid_backward(dA, cache):
    """
    Implement the backward propagation for a single SIGMOID unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z' where we store for computing backward propagation efficiently

    Returns:
    dZ -- Gradient of the cost with respect to Z
    """
    
    Z = cache
    
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)
    
    assert (dZ.shape == Z.shape)
    
    return dZ




In [165]:
def linear_backward(dZ, cache):
    """
    Implement the linear portion of backward propagation for a single layer (layer l)

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = np.dot(dZ,A_prev.T)/m  
    db = np.sum(dZ,axis=1,keepdims=True)/m
    dA_prev = np.dot(W.T,dZ)

    return dA_prev, dW, db

In [166]:
def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.
    
    Arguments:
    dA -- post-activation gradient for current layer l 
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"
    
    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    linear_cache, activation_cache = cache
    
    if activation == "relu":
        #(≈ 2 lines of code)
        dZ =relu_backward(dA, activation_cache) 
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        # YOUR CODE STARTS HERE
        
        
        # YOUR CODE ENDS HERE
        
    elif activation == "sigmoid":
        #(≈ 2 lines of code)
        dZ =sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        # YOUR CODE STARTS HERE
 
        # YOUR CODE ENDS HERE
    
    return dA_prev, dW, db

In [167]:
def L_model_backward(AL, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group
    
    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
                the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])
    
    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ... 
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ... 
    """
    grads = {}
    L = len(caches) # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL
    
    # Initializing the backpropagation
    #(1 line of code)
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    # YOUR CODE STARTS HERE
    
    
    # YOUR CODE ENDS HERE
    
    # Lth layer (SIGMOID -> LINEAR) gradients. Inputs: "dAL, current_cache". Outputs: "grads["dAL-1"], grads["dWL"], grads["dbL"]
    #(approx. 5 lines)
    current_cache = caches[L-1]
    dA_prev_temp, dW_temp, db_temp = linear_activation_backward(dAL, current_cache, activation='sigmoid')
    grads["dA" + str(L-1)] = dA_prev_temp
    grads["dW" + str(L)] = dW_temp
    grads["db" + str(L)] = db_temp
    # YOUR CODE STARTS HERE
    
    
    # YOUR CODE ENDS HERE
    
    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)):
        # lth layer: (RELU -> LINEAR) gradients.
        # Inputs: "grads["dA" + str(l + 1)], current_cache". Outputs: "grads["dA" + str(l)] , grads["dW" + str(l + 1)] , grads["db" + str(l + 1)] 
        #(approx. 5 lines)
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 1)], current_cache, activation='relu')
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
        # YOUR CODE STARTS HERE
        
        
        # YOUR CODE ENDS HERE

    return grads

In [169]:
L_model_backward(AL, Y_train_reshaped, caches)

{'dA0': array([[ 0.00029721,  0.23629374,  0.00926245, ...,  0.02566916,
          0.04186596, -0.00328531],
        [-0.01487763,  0.10760315,  0.04185676, ..., -0.00515917,
          0.02473801, -0.0050098 ],
        [-0.01184904, -0.01157508,  0.07654934, ...,  0.00308024,
          0.00348551, -0.003378  ],
        ...,
        [ 0.00025949,  0.0262632 ,  0.01271389, ..., -0.01032451,
          0.04053509, -0.00769537],
        [ 0.02903733, -0.154051  , -0.12261473, ...,  0.03085059,
         -0.08940307,  0.02499793],
        [-0.01001253, -0.07163556,  0.04959827, ..., -0.00592737,
         -0.023725  ,  0.00697942]]),
 'dA1': array([[ 0.09926384,  0.19612545, -0.39896974, ...,  0.0685301 ,
         -0.19279561,  0.04243101],
        [-0.04092394, -0.26831227,  0.16448501, ...,  0.01170603,
         -0.03293255, -0.00555087],
        [-0.05577438,  0.90358089,  0.22417317, ..., -0.11030084,
          0.31030916, -0.05703423],
        ...,
        [-0.08533081,  0.43118195,  0.34

## UPDATE

In [170]:
# GRADED FUNCTION: update_parameters

def update_parameters(params, grads, learning_rate):
    """
    Update parameters using gradient descent
    
    Arguments:
    params -- python dictionary containing your parameters 
    grads -- python dictionary containing your gradients, output of L_model_backward
    
    Returns:
    parameters -- python dictionary containing your updated parameters 
                  parameters["W" + str(l)] = ... 
                  parameters["b" + str(l)] = ...
    """
    parameters = params.copy()
    L = len(parameters) // 2 # number of layers in the neural network

    # Update rule for each parameter. Use a for loop.
    #(≈ 2 lines of code)
    for l in range(L):
        parameters["W" + str(l+1)] =parameters["W" + str(l+1)]- (learning_rate * grads["dW" + str(l+1)])
        parameters["b" + str(l+1)] =parameters["b" + str(l+1)]-(learning_rate * grads["db" + str(l+1)])
        # YOUR CODE STARTS HERE
        
        
        # YOUR CODE ENDS HERE
    return parameters

## Model

In [175]:
# GRADED FUNCTION: L_layer_model

def L_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False):
    """
    Implements a L-layer neural network: [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID.
    
    Arguments:
    X -- data, numpy array of shape (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 0 if cat, 1 if non-cat), of shape (1, number of examples)
    layers_dims -- list containing the input size and each layer size, of length (number of layers + 1).
    learning_rate -- learning rate of the gradient descent update rule
    num_iterations -- number of iterations of the optimization loop
    print_cost -- if True, it prints the cost every 100 steps
    
    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    """

    np.random.seed(1)
    costs = []                         # keep track of cost
    
    # Parameters initialization.
    #(≈ 1 line of code)
    parameters = initialize_parameters_deep(layers_dims)
    # YOUR CODE STARTS HERE
    
    
    # YOUR CODE ENDS HERE
    
    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation: [LINEAR -> RELU]*(L-1) -> LINEAR -> SIGMOID.
        #(≈ 1 line of code)
        AL, caches = L_model_forward(X, parameters)
        # YOUR CODE STARTS HERE
        
        
        # YOUR CODE ENDS HERE
        
        # Compute cost.
        #(≈ 1 line of code)
        cost = cost_function(AL, Y)
        # YOUR CODE STARTS HERE
        
        
        # YOUR CODE ENDS HERE
    
        # Backward propagation.
        #(≈ 1 line of code)
        grads = L_model_backward(AL, Y, caches)    
        # YOUR CODE STARTS HERE
        
        
        # YOUR CODE ENDS HERE
 
        # Update parameters.
        #(≈ 1 line of code)
        parameters = update_parameters(parameters, grads, learning_rate)
        # YOUR CODE STARTS HERE
        
        
        # YOUR CODE ENDS HERE
                
        # Print the cost every 100 iterations
        if print_cost and i % 100 == 0 or i == num_iterations - 1:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
        if i % 100 == 0 or i == num_iterations:
            costs.append(cost)
    
    return parameters, costs

In [180]:
parameters, costs = L_layer_model(X_train.T, Y_train_reshaped, [117,8,4,3,1], learning_rate = 0.0075, num_iterations = 6000, print_cost=True)

Cost after iteration 0: 0.6743943096030101
Cost after iteration 100: 0.5207215753833273
Cost after iteration 200: 0.45484010707503647
Cost after iteration 300: 0.40129964077413605
Cost after iteration 400: 0.3541069763427029
Cost after iteration 500: 0.3139106008880702
Cost after iteration 600: 0.2804891622201992
Cost after iteration 700: 0.25274389507005685
Cost after iteration 800: 0.22969505849954955
Cost after iteration 900: 0.2101619946001716
Cost after iteration 1000: 0.1933192201478848
Cost after iteration 1100: 0.17869786416700825
Cost after iteration 1200: 0.16590588495662079
Cost after iteration 1300: 0.15468011322124148
Cost after iteration 1400: 0.14476703377394126
Cost after iteration 1500: 0.13595283891604906
Cost after iteration 1600: 0.1280541671427569
Cost after iteration 1700: 0.12093462685232938
Cost after iteration 1800: 0.11447778885587374
Cost after iteration 1900: 0.10864165301984119
Cost after iteration 2000: 0.10333977520592091
Cost after iteration 2100: 0.0984

In [182]:
Y_dev.shape

(812,)

## Predict

In [None]:

def predict(parameters, X):
    """
    Using the learned parameters, predicts a class for each example in X
    
    Arguments:
    parameters -- python dictionary containing your parameters 
    X -- input data of size (n_x, m)
    
    Returns
    predictions -- vector of predictions of our model (red: 0 / blue: 1)
    """
    
    # Computes probabilities using forward propagation, and classifies to 0/1 using 0.5 as the threshold.
    #(≈ 2 lines of code)
    A2, cache = forward_propagation(X, parameters)
    predictions = np.round(A2)
    # YOUR CODE STARTS HERE
    
    
    # YOUR CODE ENDS HERE
    
    return predictions

In [188]:
AL_hat, cache = L_model_forward(X_dev.T, parameters)
Y_dev_reshaped = np.reshape(Y_dev,(1,812))
cost_function(AL_hat,Y_dev_reshaped)

0.02855582980978054

In [190]:
accuracy = float((np.dot(Y_dev_reshaped,AL_hat.T) + np.dot(1 - Y_dev_reshaped, 1 - AL_hat.T)) / float(Y_dev_reshaped.size)*100)
accuracy

97.23353663085206

In [None]:
pred_train = predict(X_train, Y_train, parameters)