In [65]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder, LabelEncoder
import pandas as pd
import tensorflow as tf
import os
from tensorflow.keras.callbacks import ModelCheckpoint
import keras_tuner as kt
from keras.layers import LeakyReLU


#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("Resources/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


## Overall approach:

    In the first part of the challenge binning had been done for columns with greater than 10 unique values by selecting low frequency unique values and reducing them to one value named "other".  In "Deliverabl 3" the binning approach will be to collapse values in columns into bins.  Module 19 writes for the second binning method: "Create generalized categorical values and reassign all data points to the new corresponding values."  Here, for example, I will bin INCOME_AMT and ASK_AMT into categories.  Also, instead of using OneHotEncoder on all categorical colums, I will apply the pandas get_dummies method to categorical columns, dropping the unncessary column.  After preparing the database I will run the neural network.
    Next, I will create a keras function optimzer and choose neural network parameters.  I will run the neural network for an optimized set of parameters.  This optimized neural network will be first applied to the new dataset created above and then to the previous dataset created in "Deliverable 1" and "Deliverable 2."

Create a new dataset

In [66]:
# Drop columns that have no relevance
app_df = application_df.copy()
app_df = app_df.drop(columns = ["EIN", "NAME"])
app_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [67]:
# Use pandas get_dummies()
app_en_df = pd.get_dummies(app_df, columns = ["APPLICATION_TYPE", "AFFILIATION", "CLASSIFICATION",
                                             "USE_CASE","ORGANIZATION", "STATUS","SPECIAL_CONSIDERATIONS"],
                                              drop_first = True)
app_en_df

Unnamed: 0,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T12,APPLICATION_TYPE_T13,APPLICATION_TYPE_T14,APPLICATION_TYPE_T15,APPLICATION_TYPE_T17,APPLICATION_TYPE_T19,APPLICATION_TYPE_T2,...,CLASSIFICATION_C8210,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,STATUS_1,SPECIAL_CONSIDERATIONS_Y
0,0,5000,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,1-9999,108590,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
2,0,5000,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,10000-24999,6692,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
4,100000-499999,142590,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34294,0,5000,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
34295,0,5000,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
34296,0,5000,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
34297,0,5000,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [68]:
# Identify the ranges in "INCOME_AMT"
income_list = app_en_df["INCOME_AMT"].to_list()

income_range = [x for x in income_list if x != '0' ]

app_en_df["INCOME_AMT"].unique()

array(['0', '1-9999', '10000-24999', '100000-499999', '10M-50M',
       '25000-99999', '50M+', '1M-5M', '5M-10M'], dtype=object)

Let the categories be zero='0', sm_one='1-9999', sm_two='10000-24999', med_one='100000-499999',
med_two = '25000-99999', large='1M-5M', corp_one='5M-10M', corp_two='10M-50M', corp_three='50M+'.

In [69]:
# replace ranges with categories
income_cat ={'0':"zero", '1-9999':"sm_one", '10000-24999':" sm_two", '100000-499999':"med_one",
             '25000-99999':"med_two", '1M-5M':"large", '5M-10M': " corp_one",'10M-50M' : " corp_two",
             '50M+':"corp_three"}
app_en_df["INCOME_AMT"] = app_en_df["INCOME_AMT"].apply(lambda x: income_cat[x])
app_en_df.sample(20)

Unnamed: 0,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T12,APPLICATION_TYPE_T13,APPLICATION_TYPE_T14,APPLICATION_TYPE_T15,APPLICATION_TYPE_T17,APPLICATION_TYPE_T19,APPLICATION_TYPE_T2,...,CLASSIFICATION_C8210,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,STATUS_1,SPECIAL_CONSIDERATIONS_Y
30123,med_one,226456,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
14668,zero,5000,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
25741,zero,5000,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
27780,zero,5000,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
17789,zero,5000,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
7524,zero,5000,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
8683,sm_one,5000,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
6918,zero,5000,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
31747,zero,5000,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
13871,zero,5000,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0


Looking at data, if the perctage of income_amt that are sucessful is small, then delete the rows where this is the case otherwise keep all income_amt data as-is.

In [70]:
# find percentage
zero_0 = app_en_df.loc[(app_en_df['INCOME_AMT']=='zero') & (app_en_df['IS_SUCCESSFUL']==0) ].count()
zero_1 = app_en_df.loc[(app_en_df['INCOME_AMT']=='zero') & (app_en_df['IS_SUCCESSFUL']==1) ].count()
percent_not_successful = (zero_0/(zero_0+zero_1))*100
percent_not_successful

INCOME_AMT                   48.429556
ASK_AMT                      48.429556
IS_SUCCESSFUL                48.429556
APPLICATION_TYPE_T12         48.429556
APPLICATION_TYPE_T13         48.429556
                               ...    
ORGANIZATION_Co-operative    48.429556
ORGANIZATION_Corporation     48.429556
ORGANIZATION_Trust           48.429556
STATUS_1                     48.429556
SPECIAL_CONSIDERATIONS_Y     48.429556
Length: 103, dtype: float64

Conclusion:  Keep all the zero income amounts since about one-half of them are successful.

Next change the above categories into integers from 0 to 8 in increasing income order.

In [71]:
# replace categories with integers
income_int ={"zero":0 ,"sm_one":1," sm_two":2, "med_one":3,
             "med_two":4,"large":5, " corp_one":6," corp_two":7,
             "corp_three":8}
app_en_df["INCOME_AMT"] = app_en_df["INCOME_AMT"].apply(lambda x: income_int[x])
app_en_df.sample(20)

Unnamed: 0,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T12,APPLICATION_TYPE_T13,APPLICATION_TYPE_T14,APPLICATION_TYPE_T15,APPLICATION_TYPE_T17,APPLICATION_TYPE_T19,APPLICATION_TYPE_T2,...,CLASSIFICATION_C8210,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,STATUS_1,SPECIAL_CONSIDERATIONS_Y
1125,3,8182,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
8954,0,5000,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
26289,4,315204,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
34168,0,5000,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1091,4,250525,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
25027,5,4031291,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
11943,7,114255640,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
17494,3,25725,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
31042,0,5000,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
8717,0,5000,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0


Next do a statistical analysis of "ASK_AMT" and create bins

In [72]:
app_en_df["ASK_AMT"].describe()

count    3.429900e+04
mean     2.769199e+06
std      8.713045e+07
min      5.000000e+03
25%      5.000000e+03
50%      5.000000e+03
75%      7.742000e+03
max      8.597806e+09
Name: ASK_AMT, dtype: float64

Possible bins are (make 8 to be consistent with the income amount column):

5e3 - 7.7e3  (1)
7.7e3 - 7.7e4 (2)
7.7e4 - 7.7e5 (3)
7.7e5 - 2.7e6 (4)
2.7e6 - 2.7e7 (5)
2.7e7 - 2.7e8 (6)
2.7e8 - 2.7e9 (7)
2.7e9 - 8.7e9 (8)

In [73]:
ask_bins = [0,5.0e+03, 7.7e+03, 7.7e+04, 7.7e+05, 2.7e+06, 2.7e+07, 2.7e+08, 2.7e+09, 8.7e+09 ]
ask_labels = [0, 1, 2, 3, 4, 5 ,6 ,7 ,8]

app_en_df["ASK_AMT"] = pd.cut(app_en_df["ASK_AMT"], ask_bins, labels = ask_labels )
app_en_df.sample(20)

Unnamed: 0,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T12,APPLICATION_TYPE_T13,APPLICATION_TYPE_T14,APPLICATION_TYPE_T15,APPLICATION_TYPE_T17,APPLICATION_TYPE_T19,APPLICATION_TYPE_T2,...,CLASSIFICATION_C8210,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,STATUS_1,SPECIAL_CONSIDERATIONS_Y
10265,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
33512,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
9425,3,3,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
21091,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
4793,4,2,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
9048,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
1453,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
8207,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
22161,4,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
20930,4,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0


Check that all data is numeric

In [74]:
app_en_df.dtypes

INCOME_AMT                      int64
ASK_AMT                      category
IS_SUCCESSFUL                   int64
APPLICATION_TYPE_T12            uint8
APPLICATION_TYPE_T13            uint8
                               ...   
ORGANIZATION_Co-operative       uint8
ORGANIZATION_Corporation        uint8
ORGANIZATION_Trust              uint8
STATUS_1                        uint8
SPECIAL_CONSIDERATIONS_Y        uint8
Length: 103, dtype: object

change ASK_AMT to integer

In [75]:
app_en_df['ASK_AMT'] = app_en_df["ASK_AMT"].astype("int64")
app_en_df.dtypes

INCOME_AMT                   int64
ASK_AMT                      int64
IS_SUCCESSFUL                int64
APPLICATION_TYPE_T12         uint8
APPLICATION_TYPE_T13         uint8
                             ...  
ORGANIZATION_Co-operative    uint8
ORGANIZATION_Corporation     uint8
ORGANIZATION_Trust           uint8
STATUS_1                     uint8
SPECIAL_CONSIDERATIONS_Y     uint8
Length: 103, dtype: object

change all columns to int64

In [76]:
col_list = list(app_en_df.columns)
for col in col_list:
    app_en_df[col] = app_en_df[col].astype("int64")

In [77]:
 app_en_df.dtypes

INCOME_AMT                   int64
ASK_AMT                      int64
IS_SUCCESSFUL                int64
APPLICATION_TYPE_T12         int64
APPLICATION_TYPE_T13         int64
                             ...  
ORGANIZATION_Co-operative    int64
ORGANIZATION_Corporation     int64
ORGANIZATION_Trust           int64
STATUS_1                     int64
SPECIAL_CONSIDERATIONS_Y     int64
Length: 103, dtype: object

Use this database to make the feature and label sets

In [78]:
app_en_df.head()

Unnamed: 0,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_T12,APPLICATION_TYPE_T13,APPLICATION_TYPE_T14,APPLICATION_TYPE_T15,APPLICATION_TYPE_T17,APPLICATION_TYPE_T19,APPLICATION_TYPE_T2,...,CLASSIFICATION_C8210,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,STATUS_1,SPECIAL_CONSIDERATIONS_Y
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,1,3,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,2,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
4,3,3,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0


In [79]:
y = app_en_df["IS_SUCCESSFUL"].values
X= app_en_df.drop(["IS_SUCCESSFUL"], axis = 1).values

Now split into training and testing sets

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [81]:
X_train.shape

(25724, 102)

Now apply StandardScaler.  
The datasets X_train_scaled, X_test_scaled, Y_train, y_test are used for the following neural networks below:  
nn and nn_new.

In [82]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

Now apply the same neural network as had been used dor deliverable 1 and 2.  
The name of this neural network is nn.

In [83]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_feat = len(X[0])
hidden_nodes_layer1=80
hidden_nodes_layer2=30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim = number_input_feat, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 80)                8240      
_________________________________________________________________
dense_8 (Dense)              (None, 30)                2430      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 31        
Total params: 10,701
Trainable params: 10,701
Non-trainable params: 0
_________________________________________________________________


In [84]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints2/",exist_ok=True)
checkpoint_path = "checkpoints2/weights2.{epoch:02d}.hdf5"

In [85]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [86]:
# Create a callback that saves the model's weights every 5 epochs
cp_callback2 = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq= 4000)

In [87]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs = 25, callbacks=[cp_callback2])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 00005: saving model to checkpoints2/weights2.05.hdf5
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 00010: saving model to checkpoints2/weights2.10.hdf5
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 00015: saving model to checkpoints2/weights2.15.hdf5
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 00020: saving model to checkpoints2/weights2.20.hdf5
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 00025: saving model to checkpoints2/weights2.25.hdf5


In [88]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5490 - accuracy: 0.7349
Loss: 0.5489561557769775, Accuracy: 0.7349271178245544


Next set up the keras optimizer to explore the hyperparameters.  
This is where the nueral network nn_new is created.

In [38]:
def keras_optimizer(hp):
    nn_new = tf.keras.models.Sequential()
    
    #choose activation function
    activation = hp.Choice('activation', ['LeakyReLU', 'relu', 'elu'] )
    
    #choose the number of neurons in the first layer
    nn_new.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=88,
        max_value =102 ,
        step = 2),activation = activation, input_dim = 102))
    
    # choose number of hidden layers and neurons
    for i in range(hp.Int("num_layers", 1, 3)):
        nn_new.add(tf.keras.layers.Dense(units=hp.Int("units_" + str(i),
                min_value = 10,
                max_value = 210,
                step=100),
                activation=activation))
    
    #set the ouput neuron
    nn_new.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
    
    #compile the model
    nn_new.compile(optimizer="adam", 
        loss="binary_crossentropy", 
        metrics=["accuracy"]
        )
    
    return nn_new
    

In [41]:
# create the keras tuner object
tuner = kt.Hyperband(
    keras_optimizer,
    objective="val_accuracy",
    max_epochs = 2,
    hyperband_iterations=2,
    directory = "my_dir"
)


In [42]:
# run keras_optimizer
tuner.search(X_train_scaled, y_train, epochs=15, validation_data=(X_test_scaled, y_test))

Trial 4 Complete [00h 00m 03s]
val_accuracy: 0.7274635434150696

Best val_accuracy So Far: 0.727580189704895
Total elapsed time: 00h 00m 17s
INFO:tensorflow:Oracle triggered exit


In [43]:
# get the best hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'elu',
 'first_units': 90,
 'num_layers': 2,
 'units_0': 210,
 'tuner/epochs': 2,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0,
 'units_1': 10}

In [44]:
# evaluate the best model against the test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5636 - accuracy: 0.7276
Loss: 0.5635687708854675, Accuracy: 0.727580189704895


The keras tuner sugests using "elu" and two hidden layers with 210 and 10 nodes.  This accuracy was not improved, though.  The reason may be the large number of input nodes.  To test this a neural network will be formed as:

    input will be the smaller dataset worked with in deliverable 1 and 2 (ie smaller input dimension)
    activation will be set to "elu"
    first layer will have 210 nodes
    second layer will have 10 nodes
    other parameters will not be changed
    
    


The previous dataset is reproduced below

In [45]:
previous_df = application_df.copy()

In [47]:
previous_df.drop(columns = ["EIN","NAME"], inplace=True)

In [48]:
vc_application_type = previous_df["APPLICATION_TYPE"].value_counts()
vc_application_type

T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T14        3
T25        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64

In [49]:
# Determine which values to replace if counts are less than ...?
replace_application = vc_application_type[vc_application_type < 10000].index.to_list()

# Replace in dataframe
for app in replace_application:
    previous_df.APPLICATION_TYPE = previous_df.APPLICATION_TYPE.replace(app,"Other")
    
# Check to make sure binning was successful
previous_df.APPLICATION_TYPE.value_counts()

T3       27037
Other     7262
Name: APPLICATION_TYPE, dtype: int64

In [50]:
# Look at CLASSIFICATION value counts for binning
vc_classification = previous_df["CLASSIFICATION"].value_counts()
vc_classification

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C1732        1
C2170        1
C2561        1
C4200        1
C1248        1
Name: CLASSIFICATION, Length: 71, dtype: int64

In [51]:
# Determine which values to replace if counts are less than ..?
replace_class = vc_classification[vc_classification < 1500].index.to_list()

# Replace in dataframe
for cls in replace_class:
    previous_df.CLASSIFICATION = previous_df.CLASSIFICATION.replace(cls,"Other")
    
# Check to make sure binning was successful
previous_df.CLASSIFICATION.value_counts()

C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [52]:
# Generate our categorical variable lists
previous_cat = previous_df.dtypes[previous_df.dtypes=='object'].index.to_list()
previous_cat

['APPLICATION_TYPE',
 'AFFILIATION',
 'CLASSIFICATION',
 'USE_CASE',
 'ORGANIZATION',
 'INCOME_AMT',
 'SPECIAL_CONSIDERATIONS']

In [53]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(previous_df[previous_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(previous_cat)
encode_df.head()

Unnamed: 0,APPLICATION_TYPE_Other,APPLICATION_TYPE_T3,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,AFFILIATION_Regional,CLASSIFICATION_C1000,CLASSIFICATION_C1200,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [54]:
# Merge one-hot encoded features and drop the originals
previous_df = previous_df.merge(encode_df, left_index=True, right_index=True)
previous_df = previous_df.drop(previous_cat, 1)
previous_df.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T3,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,108590,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,5000,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,6692,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,142590,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


The above is the dataset to which the optimized keras model will be applied.

In [55]:
# Split our preprocessed data into our features and target arrays
y_new = previous_df['IS_SUCCESSFUL'].values
X_new = previous_df.drop(columns = ['IS_SUCCESSFUL']).values

# Split the preprocessed data into a training and testing dataset
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, random_state = 42)

In [56]:
# Create a StandardScaler instances
scaler_new = StandardScaler()

# Fit the StandardScaler
X_scaler_new = scaler.fit(X_train_new)

# Scale the data
X_train_scaled_new = X_scaler_new.transform(X_train_new)
X_test_scaled_new = X_scaler_new.transform(X_test_new)

In [57]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_feat = len(X_new[0])
hidden_nodes_layer1_new=210
hidden_nodes_layer2_new=10

nn_optimized = tf.keras.models.Sequential()

# First hidden layer
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer1_new,
                input_dim = number_input_feat, activation="elu"))

# Second hidden layer
nn_optimized.add(tf.keras.layers.Dense(units=hidden_nodes_layer2_new, activation="elu"))

# Output layer
nn_optimized.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_optimized.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 210)               7770      
_________________________________________________________________
dense_5 (Dense)              (None, 10)                2110      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
Total params: 9,891
Trainable params: 9,891
Non-trainable params: 0
_________________________________________________________________


In [58]:

# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights3.{epoch:02d}.hdf5"

In [61]:
nn_optimized.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [62]:
# Create a callback that saves the model's weights every epoch
cp_callback3 = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq= 4660)

In [63]:
# Train the model
fit_model = nn_optimized.fit(X_train_scaled_new, y_train_new, epochs = 25, callbacks=[cp_callback3])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 00006: saving model to checkpoints/weights3.06.hdf5
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 00012: saving model to checkpoints/weights3.12.hdf5
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 00018: saving model to checkpoints/weights3.18.hdf5
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
138/804 [====>.........................] - ETA: 1s - loss: 0.5635 - accuracy: 0.7203
Epoch 00024: saving model to checkpoints/weights3.24.hdf5
Epoch 25/25


In [64]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_optimized.evaluate(X_test_scaled_new,y_test_new,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5765 - accuracy: 0.7102
Loss: 0.5764606595039368, Accuracy: 0.7102040648460388


In [89]:
nn_optimized.save("AlphabetSoupCharity_Optimized.h5")

Conclusion:  a dataset with about two-thirds reduction in the input dimensions did not improve neural network learning.

## Recommendation:  

## The results were not sensitive to the input dimension.  Simplifying the data by grouping the income amount and the ask amount into categorical data did increase the accuracy a little.  
## A smaller input dataset could be beneficial.  My next try would be to further reduce the input dataset size.  One way could be to implement unsupervised machine learning.  Using the principal component analysis, find the two largest clusters and match the cluster category with the columns of the dataset features.  Then delete all rows except for those which correspond to the two largest clusters.  In this way unsupervised machine learning could be used to reduce the dataset for neural network analysis.  An easier way to reduce the dataset size, though, is to discard all of the data that corresponds to a zero income amount.  If this is done, then the analysis will only be looking at the small, medium, large, and corporate accounts, but it will miss out on the data which is labeled zero income amount.  In other words, a higher accuracy may be achievable by looking at a subset of the original data.