## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras import regularizers
import pandas as pd
import tensorflow as tf

# Location of the raw charity dataset (CSV served over HTTPS)
CHARITY_DATA_URL = "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"

# Load the CSV into a DataFrame and preview the first rows
df = pd.read_csv(CHARITY_DATA_URL)
df.head()




Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Start by creating a working copy so the raw frame `df` stays untouched,
# then drop the EIN id column from that copy
pre = df.copy().drop(columns=["EIN"])

# Define a function to bin categorical variables
# First, we want to take in the Series for binning and the cutoff value
def cat_binner(Series, cutoff_val):
    """Collapse infrequent values of a Series into a single "Other" label.

    Parameters
    ----------
    Series : pandas.Series
        The categorical data to bin. It is not modified.
    cutoff_val : int
        Values that occur strictly fewer than ``cutoff_val`` times are
        replaced; values occurring exactly ``cutoff_val`` times are kept.

    Returns
    -------
    pandas.Series
        A new Series with the same index and name, where every rare value
        has been replaced by the string "Other".
    """
    # Frequency of each distinct value (missing values are not counted)
    counts = Series.value_counts()
    # Distinct values whose frequency falls below the cutoff
    rare_values = counts[counts < cutoff_val].index
    # Relabel all rare values in one vectorized pass instead of calling
    # Series.replace once per rare value, which rescans the data every time
    return Series.mask(Series.isin(rare_values), "Other")

# Bin the rare categories of each categorical column with cat_binner().
# The cutoffs aim to keep as much of the signal as possible without
# overfitting the model, so as to maximize testing accuracy.
bin_cutoffs = {
    "NAME": 2,
    "APPLICATION_TYPE": 1000,
    "CLASSIFICATION": 700,
    "AFFILIATION": 700,
    "USE_CASE": 700,
}
for column_name, cutoff in bin_cutoffs.items():
    pre[column_name] = cat_binner(pre[column_name], cutoff)

# Reduce variation in INCOME_AMT by grouping its range labels into coarser bins.
# The "0" label is not listed in any group below.
# Low incomes -> 1
loBin = [
    "1-9999",
    "10000-24999",
    "25000-99999",
]
# Mid incomes -> 2
miBin = [
    "100000-499999",
    "1M-5M",
]
# High incomes -> 3
hiBin = [
    "5M-10M",
    "10M-50M",
    "50M+",
]

# Define a function to bin the INCOME_AMT variance
# The function requires the list of values in each bin, the data, and the number being assigned
def quant_binner(bin, df, assign, column="INCOME_AMT"):
    """Relabel every value of ``df[column]`` that appears in ``bin``.

    Parameters
    ----------
    bin : iterable
        The values that belong to this bin.
    df : pandas.DataFrame
        The data to update. It is modified in place.
    assign : object
        The label written wherever ``df[column]`` matches a value in ``bin``.
    column : str, default "INCOME_AMT"
        Name of the column to relabel.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after the update.

    Notes
    -----
    Matching uses ``Series.isin``, so a NaN listed in ``bin`` matches
    missing entries, which a plain ``==`` comparison would not.
    """
    # One membership mask covers the whole bin, so the column is scanned once
    # rather than once per value in the bin
    in_bin = df[column].isin(list(bin))
    # Write the assigned label into the matching rows of the target column
    df.loc[in_bin, column] = assign
    return df
# Run every income group through quant_binner, pairing each list of
# range labels with the number it is collapsed to
for income_ranges, bin_label in ((loBin, 1), (miBin, 2), (hiBin, 3)):
    pre = quant_binner(income_ranges, pre, bin_label)

# Column labels of the preprocessed frame, as plain strings
pre_cols = list(map(str, pre.columns))

# Partition the columns by dtype: object columns get one-hot encoded,
# int64 columns are carried over unchanged
obj_cols = []
int_cols = []
for col_name in pre_cols:
    col_dtype = pre[col_name].dtype
    if col_dtype == 'object':
        obj_cols.append(col_name)
    if col_dtype == 'int64':
        int_cols.append(col_name)

# One-hot encode the object columns as integer 0/1 indicators
obj_dummies = pd.get_dummies(pre[obj_cols], dtype=int)
# Keep the integer columns as they are
int_dummies = pre[int_cols]
# Put the encoded and integer columns side by side
pro = pd.concat([obj_dummies, int_dummies], axis=1)
# Preview the processed frame
pro.head()

Unnamed: 0,NAME_4-H CLUBS & AFFILIATED 4-H ORGANIZATIONS,NAME_AACE INTERNATIONAL,NAME_ACADEMY OF GENERAL DENTISTRY,NAME_ACADEMY OF MANAGED CARE PHARMACY,NAME_ACE MENTOR PROGRAM OF AMERICA INC,NAME_ACTIVE 20-30 UNITED STATES AND CANADA,NAME_ACTS MINISTRY,NAME_ACTS MISSIONS,NAME_AFRICAN-AMERICAN POSTAL LEAGUE UNITED FOR SUCCESS A-PLUS,NAME_AGENTS ASSOCIATION,...,ORGANIZATION_Trust,INCOME_AMT_1,INCOME_AMT_2,INCOME_AMT_3,INCOME_AMT_0,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y,STATUS,ASK_AMT,IS_SUCCESSFUL
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,5000,1
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,108590,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,5000,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,1,0,1,6692,1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,1,142590,1


## Compile, Train, Evaluate and Optimize the Model

In [3]:
# Separate the target from the features
# y holds the value being predicted
y = pro["IS_SUCCESSFUL"]
# X holds every other column
X = pro.drop(columns=["IS_SUCCESSFUL"])

# Split into training and testing sets with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Standardize the features
scaler = StandardScaler()
# Fit on the training features only
X_scaler = scaler.fit(X_train)
# Apply the fitted scaler to the training and testing features alike
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [24]:
# Number of input features = number of columns in the scaled training set
n_features = X_train_scaled.shape[1]

# Build the feed-forward network one layer at a time
hard_nn = tf.keras.models.Sequential()
# Input layer + 1st hidden layer: 80 neurons, ReLU activation
hard_nn.add(
    tf.keras.layers.Dense(units=80, activation='relu', input_dim=n_features)
)
# 2nd hidden layer: 20 neurons, sigmoid activation
hard_nn.add(tf.keras.layers.Dense(units=20, activation='sigmoid'))
# Output layer: a single neuron with sigmoid activation, giving a value between 0 and 1
hard_nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Compile the model:
#  - binary cross-entropy loss, the natural choice for a binary classifier
#  - RMSprop optimizer, which the author found to perform better than Adam here
#  - accuracy reported as the metric
hard_nn.compile(
    loss="binary_crossentropy",
    optimizer="RMSprop",
    metrics=["accuracy"],
)

# Train on the scaled training data; the author observed minimal loss
# change after 60 epochs
fit = hard_nn.fit(X_train_scaled, y_train, epochs=60)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [25]:
# Score the trained model on the scaled test set; evaluate() returns the
# loss followed by the compiled metric (accuracy)
eval_results = hard_nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = eval_results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 1s - loss: 0.4365 - accuracy: 0.7993 - 570ms/epoch - 2ms/step
Loss: 0.4365272521972656, Accuracy: 0.7993003129959106


In [28]:
# Persist the trained model to disk in HDF5 (.h5) format
model_path = 'models/nn_AlphabetSoup.h5'
hard_nn.save(model_path)