# Model 2 : Decrease Number of Bins for APPLICATION_TYPE

## Preprocessing

In [67]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Read the charity_data.csv into a dataframe and preview the first rows.
# (pandas is already imported as `pd` in the dependency block at the top of this cell, so it is not imported a second time here.)
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [68]:
# 'EIN' and 'NAME' are non-beneficial ID columns, so remove them before modelling.
application_df = application_df.drop(columns=['EIN', 'NAME'])

In [69]:
# Look at the data type of each remaining column again; the `object` columns are
# the categorical ones that get one-hot encoded further down.
application_df.dtypes

Unnamed: 0,0
APPLICATION_TYPE,object
AFFILIATION,object
CLASSIFICATION,object
USE_CASE,object
ORGANIZATION,object
STATUS,int64
INCOME_AMT,object
SPECIAL_CONSIDERATIONS,object
ASK_AMT,int64
IS_SUCCESSFUL,int64


In [70]:
# Count the distinct values of every categorical column to judge whether any of
# them needs more or fewer bins.
application_df[
    [
        "APPLICATION_TYPE",
        "AFFILIATION",
        "CLASSIFICATION",
        "USE_CASE",
        "ORGANIZATION",
        "INCOME_AMT",
        "SPECIAL_CONSIDERATIONS",
    ]
].nunique()
# APPLICATION_TYPE and CLASSIFICATION have the most unique values; those are the
# two categorical variables we have already split into bins.

Unnamed: 0,0
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2


From eyeballing the buckets in the first notebook, CLASSIFICATION does not appear to have an excessive number of buckets, so I will keep its bins as they are. Instead, I will decrease the number of buckets for APPLICATION_TYPE to 5.


In [71]:
# Look at frequency counts for APPLICATION_TYPE again.
# value_counts() tallies the rows per application type; sort_values(ascending=False)
# then orders those tallies from the highest count to the lowest.
application_df["APPLICATION_TYPE"].value_counts().sort_values(ascending=False)

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
T13,66


In [73]:
# Bin the rare application types: every APPLICATION_TYPE that occurs fewer than
# 1173 times is relabelled "Other".
# The frequency table is computed once and reused for both the mask and the lookup.
application_type_counts = application_df["APPLICATION_TYPE"].value_counts()
# use the variable name `application_types_to_replace`
application_types_to_replace = list(application_type_counts[application_type_counts < 1173].index)

# Replace all rare types in a single pass over the column instead of one pass per label
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(application_types_to_replace, "Other")

# Check to make sure replacement was successful
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
Other,3331
T4,1542
T6,1216
T5,1173


In [74]:
# Frequency table of CLASSIFICATION, restricted to labels that occur more than once.
# value_counts() builds the table; the callable handed to .loc receives that same
# table and returns the True/False mask (count > 1) used to filter it.
greater_1_fc = application_df["CLASSIFICATION"].value_counts().loc[lambda counts: counts > 1]
greater_1_fc

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
C7000,777
C1700,287
C4000,194
C5000,116
C1270,114


In [75]:
# Collect every CLASSIFICATION label whose frequency is below 1918 (the cutoff
# picked from the frequency counts above); these are the labels to be binned.
classifications_to_replace = (
    application_df["CLASSIFICATION"]
    .value_counts()
    .loc[lambda counts: counts < 1918]
    .index
    .tolist()
)

In [76]:
# Same bins for CLASSIFICATION as first notebook:
# swap every label listed in `classifications_to_replace` for "Other" in one call.
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(classifications_to_replace, "Other")

# Re-count the labels to confirm the replacement worked
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,4144
C3000,1918


In [77]:
# Re-check the column dtypes after binning, before the categorical (object) columns are encoded.
application_df.dtypes

Unnamed: 0,0
APPLICATION_TYPE,object
AFFILIATION,object
CLASSIFICATION,object
USE_CASE,object
ORGANIZATION,object
STATUS,int64
INCOME_AMT,object
SPECIAL_CONSIDERATIONS,object
ASK_AMT,int64
IS_SUCCESSFUL,int64


In [78]:
# One-hot encode the categorical variables with `pd.get_dummies`.
# Only the seven categorical columns are selected from the original dataframe;
# dtype=int makes every dummy column hold 0/1 integers.
categorical_df = pd.get_dummies(
    application_df[
        [
            'APPLICATION_TYPE',
            'AFFILIATION',
            'CLASSIFICATION',
            'USE_CASE',
            'ORGANIZATION',
            'INCOME_AMT',
            'SPECIAL_CONSIDERATIONS',
        ]
    ],
    dtype=int,
)
categorical_df.head()

Unnamed: 0,APPLICATION_TYPE_Other,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0


In [79]:
# Number of one-hot encoded columns
categorical_df.shape[1]

36

In [80]:
# Assemble the model-ready dataframe: the dummy columns plus the three
# non-categorical columns (STATUS, ASK_AMT, IS_SUCCESSFUL) from the original dataframe.
# `assign` returns a new dataframe, so `categorical_df` itself is not modified.
application_new_df = categorical_df.assign(
    STATUS=application_df['STATUS'],
    ASK_AMT=application_df['ASK_AMT'],
    IS_SUCCESSFUL=application_df['IS_SUCCESSFUL'],
)

In [81]:
# Preview the first rows of the combined dataframe
application_new_df.head()

Unnamed: 0,APPLICATION_TYPE_Other,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,...,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y,STATUS,ASK_AMT,IS_SUCCESSFUL
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,5000,1
1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,108590,1
2,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,5000,0
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,6692,1
4,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,142590,1


In [82]:
# Separate the preprocessed data into the feature matrix X (every column except
# the target) and the target vector y (IS_SUCCESSFUL)
X = application_new_df.drop(columns=["IS_SUCCESSFUL"])
y = application_new_df["IS_SUCCESSFUL"]

# Hold out a testing dataset; random_state=1 fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [83]:
# Number of input features the network will receive
X_train.shape[1]

38

In [84]:
# Build a StandardScaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split and to the testing split
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

## Compile, Train and Evaluate the Model:

In [105]:
# NOTE: I'm keeping the model hyperparameters the same (same activation functions and same number of nodes in each layer as number of inputs), so I change one thing at a time
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
#  Input features: read from the scaled training data (38 for this preprocessing) rather than hard-coded
#  Nodes for 1st hidden layer: 38 (same as number of inputs)
#  Nodes for 2nd hidden layer: 38
#  Nodes for output layer: 1
#  Activations: tanh in both hidden layers, sigmoid on the output for the two-class prediction

number_input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# Explicit input layer. This replaces the `input_dim` argument on the first Dense
# layer, which newer Keras versions warn against, and keeps the input width in
# sync with the data actually being fed to the model.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=38, activation="tanh"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=38, activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


# Check the structure of the model
nn.summary()

In [86]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
nn.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [87]:
# Fit the network on the scaled training data for 100 epochs; the value returned by fit() is kept in `fit_model`
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.6889 - loss: 0.6060
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7194 - loss: 0.5733
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7259 - loss: 0.5650
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7194 - loss: 0.5671
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7252 - loss: 0.5631
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7267 - loss: 0.5594
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7241 - loss: 0.5633
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7221 - loss: 0.5628
Epoch 9/100
[1m804/804[0m [32

In [88]:
# Score the trained model on the scaled test data and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 1s - 3ms/step - accuracy: 0.7219 - loss: 0.5677
Loss: 0.5677312016487122, Accuracy: 0.7218658924102783


# Model 3 : Increase Number of Neurons in Hidden Layer

## Preprocessing

In [89]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Read the charity_data.csv into a dataframe and preview the first rows.
# (pandas is already imported as `pd` in the dependency block at the top of this cell, so it is not imported a second time here.)
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [90]:
# 'EIN' and 'NAME' are non-beneficial ID columns, so remove them before modelling.
application_df = application_df.drop(columns=['EIN', 'NAME'])

In [91]:
# Use the same number of bins for APPLICATION_TYPE as in the 1st notebook:
# the application types listed below are merged into a single "Other" bucket.
# use the variable name `application_types_to_replace`
application_types_to_replace = [
    'T9', 'T13', 'T12', 'T2', 'T25', 'T14', 'T29', 'T15', 'T17',
]

# One replace call covers the whole list
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(application_types_to_replace, "Other")

# Re-count the labels to confirm the replacement worked
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
Other,276


In [129]:
# Frequency table of CLASSIFICATION, restricted to labels that occur more than once.
# value_counts() builds the table; the callable handed to .loc receives that same
# table and returns the True/False mask (count > 1) used to filter it.
# NOTE(review): the saved output of this cell already lists an "Other" bucket and its
# execution count (129) is later than the binning cell's (94), so it appears to have
# been captured after binning - re-run the notebook top to bottom to refresh it.
greater_1_fc = application_df["CLASSIFICATION"].value_counts().loc[lambda counts: counts > 1]
greater_1_fc.head()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,4144
C3000,1918


In [94]:
# Use the same bins for CLASSIFICATION as in the 1st notebook:
# every label that occurs fewer than 1918 times is relabelled "Other".
# The frequency table is computed once and reused for both the mask and the lookup.
classification_counts = application_df["CLASSIFICATION"].value_counts()
# use the variable name `classifications_to_replace`
classifications_to_replace = list(classification_counts[classification_counts < 1918].index)

# Replace all rare labels in a single pass over the column instead of one pass per label
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(classifications_to_replace, "Other")

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,4144
C3000,1918


In [95]:
# One-hot encode the categorical variables with `pd.get_dummies`.
# Only the seven categorical columns are selected from the original dataframe;
# dtype=int makes every dummy column hold 0/1 integers.
categorical_df = pd.get_dummies(
    application_df[
        [
            'APPLICATION_TYPE',
            'AFFILIATION',
            'CLASSIFICATION',
            'USE_CASE',
            'ORGANIZATION',
            'INCOME_AMT',
            'SPECIAL_CONSIDERATIONS',
        ]
    ],
    dtype=int,
)
categorical_df.head()

Unnamed: 0,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8,AFFILIATION_CompanySponsored,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [96]:
# Assemble the model-ready dataframe: the dummy columns plus the three
# non-categorical columns (STATUS, ASK_AMT, IS_SUCCESSFUL) from the original dataframe.
# `assign` returns a new dataframe, so `categorical_df` itself is not modified.
application_new_df = categorical_df.assign(
    STATUS=application_df['STATUS'],
    ASK_AMT=application_df['ASK_AMT'],
    IS_SUCCESSFUL=application_df['IS_SUCCESSFUL'],
)

In [97]:
# Preview the first rows of the combined dataframe
application_new_df.head()

Unnamed: 0,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8,AFFILIATION_CompanySponsored,...,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y,STATUS,ASK_AMT,IS_SUCCESSFUL
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,5000,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,108590,1
2,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,1,0,1,5000,0
3,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,6692,1
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,142590,1


In [98]:
# Separate the preprocessed data into the feature matrix X (every column except
# the target) and the target vector y (IS_SUCCESSFUL)
X = application_new_df.drop(columns=["IS_SUCCESSFUL"])
y = application_new_df["IS_SUCCESSFUL"]

# Hold out a testing dataset; random_state=1 fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [99]:
# Number of input features the network will receive
X_train.shape[1]

42

In [100]:
# Build a StandardScaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split and to the testing split
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

## Compile, Train and Evaluate the Model

In [101]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
#  Input features: read from the scaled training data (42 for this preprocessing) rather than hard-coded
#  Nodes for 1st hidden layer: 42 * 3 = 126
#  Nodes for 2nd hidden layer: 126 (same as first hidden layer, chosen arbitrarily)
#  Nodes for output layer: 1
#  Activations: tanh in both hidden layers, sigmoid on the output for the two-class prediction

number_input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# Explicit input layer. This replaces the `input_dim` argument on the first Dense
# layer, which newer Keras versions warn against, and keeps the input width in
# sync with the data actually being fed to the model.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=126, activation="tanh"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=126, activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [102]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
nn.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [103]:
# Fit the network on the scaled training data for 100 epochs; the value returned by fit() is kept in `fit_model`
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7127 - loss: 0.5914
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7227 - loss: 0.5663
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7297 - loss: 0.5549
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7286 - loss: 0.5569
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7297 - loss: 0.5547
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7291 - loss: 0.5545
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7266 - loss: 0.5535
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7317 - loss: 0.5492
Epoch 9/100
[1m804/804[0m [32

In [104]:
# Score the trained model on the scaled test data and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 1s - 2ms/step - accuracy: 0.7266 - loss: 0.5615
Loss: 0.5614814162254333, Accuracy: 0.7266472578048706


# Model 4 : Change the Activation Function of One or All Hidden Layers

## Preprocessing
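### Not needed because I can use the same preprocessed data as Model 3 (the Model 3 preprocessing cells must be run first so that `X_train_scaled`, `X_test_scaled`, `y_train` and `y_test` are in memory)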



## Compile, Train and Evaluate the Model:

In [109]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
#  Input features: read from the scaled training data (42 for the Model 3 preprocessing reused here) rather than hard-coded
#  Nodes for 1st hidden layer: 42 (same as number of inputs)
#  Nodes for 2nd hidden layer: 42
#  Nodes for output layer: 1
#  Activations: leaky_relu in the first hidden layer, tanh in the second, sigmoid on the output

number_input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# Explicit input layer. This replaces the `input_dim` argument on the first Dense
# layer, which newer Keras versions warn against, and keeps the input width in
# sync with the data actually being fed to the model.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=42, activation="leaky_relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=42, activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


# Check the structure of the model
nn.summary()

In [110]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
nn.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [111]:
# Fit the network on the scaled training data for 100 epochs; the value returned by fit() is kept in `fit_model`
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.6958 - loss: 0.5991
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7265 - loss: 0.5624
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7319 - loss: 0.5513
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7282 - loss: 0.5537
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7297 - loss: 0.5537
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7270 - loss: 0.5557
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7283 - loss: 0.5533
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7295 - loss: 0.5516
Epoch 9/100
[1m804/804[0m [32

In [112]:
# Score the trained model on the scaled test data and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 1s - 3ms/step - accuracy: 0.7248 - loss: 0.5588
Loss: 0.5588364601135254, Accuracy: 0.724781334400177


# Model 5 : Combine all changes above (number of bins for APPLICATION_TYPE, number of nodes in each hidden layer, activation function)

## Preprocessing

In [113]:
# Read the charity_data.csv again so this section's preprocessing starts from the raw file
# (the earlier sections reassigned `application_df` while binning), then preview the first rows.
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [114]:
# 'EIN' and 'NAME' are non-beneficial ID columns, so remove them before modelling.
application_df = application_df.drop(columns=['EIN', 'NAME'])

In [115]:
# Bin the rare application types: every APPLICATION_TYPE that occurs fewer than
# 1173 times is relabelled "Other".
# The frequency table is computed once and reused for both the mask and the lookup.
application_type_counts = application_df["APPLICATION_TYPE"].value_counts()
# use the variable name `application_types_to_replace`
application_types_to_replace = list(application_type_counts[application_type_counts < 1173].index)

# Replace all rare types in a single pass over the column instead of one pass per label
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(application_types_to_replace, "Other")

# Check to make sure replacement was successful
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
Other,3331
T4,1542
T6,1216
T5,1173


In [128]:
# Frequency table of CLASSIFICATION, restricted to labels that occur more than once.
# value_counts() builds the table; the callable handed to .loc receives that same
# table and returns the True/False mask (count > 1) used to filter it.
# NOTE(review): the saved output of this cell already lists an "Other" bucket and its
# execution count (128) is later than the binning cell's (118), so it appears to have
# been captured after binning - re-run the notebook top to bottom to refresh it.
greater_1_fc = application_df["CLASSIFICATION"].value_counts().loc[lambda counts: counts > 1]
greater_1_fc.head()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,4144
C3000,1918


In [117]:
# Collect every CLASSIFICATION label whose frequency is below 1918 (the cutoff
# picked from the frequency counts above); these are the labels to be binned.
classifications_to_replace = (
    application_df["CLASSIFICATION"]
    .value_counts()
    .loc[lambda counts: counts < 1918]
    .index
    .tolist()
)

In [118]:
# Same bins for CLASSIFICATION as first notebook:
# swap every label listed in `classifications_to_replace` for "Other" in one call.
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(classifications_to_replace, "Other")

# Re-count the labels to confirm the replacement worked
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,4144
C3000,1918


In [119]:
# One-hot encode the categorical variables with `pd.get_dummies`.
# Only the seven categorical columns are selected from the original dataframe;
# dtype=int makes every dummy column hold 0/1 integers.
categorical_df = pd.get_dummies(
    application_df[
        [
            'APPLICATION_TYPE',
            'AFFILIATION',
            'CLASSIFICATION',
            'USE_CASE',
            'ORGANIZATION',
            'INCOME_AMT',
            'SPECIAL_CONSIDERATIONS',
        ]
    ],
    dtype=int,
)
categorical_df.head()

Unnamed: 0,APPLICATION_TYPE_Other,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0


In [120]:
# Assemble the model-ready dataframe: the dummy columns plus the three
# non-categorical columns (STATUS, ASK_AMT, IS_SUCCESSFUL) from the original dataframe.
# `assign` returns a new dataframe, so `categorical_df` itself is not modified.
application_new_df = categorical_df.assign(
    STATUS=application_df['STATUS'],
    ASK_AMT=application_df['ASK_AMT'],
    IS_SUCCESSFUL=application_df['IS_SUCCESSFUL'],
)

In [121]:
# Separate the preprocessed data into the feature matrix X (every column except
# the target) and the target vector y (IS_SUCCESSFUL)
X = application_new_df.drop(columns=["IS_SUCCESSFUL"])
y = application_new_df["IS_SUCCESSFUL"]

# Hold out a testing dataset; random_state=1 fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [122]:
# Build a StandardScaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split and to the testing split
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

In [123]:
# Number of input features the network will receive
X_train.shape[1]

38

## Compile, Train and Evaluate the Model

In [124]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
#  Input features: read from the scaled training data (38 for this preprocessing) rather than hard-coded
#  Nodes for 1st hidden layer: 38 * 3 = 114
#  Nodes for 2nd hidden layer: 114 (same as first hidden layer, chosen arbitrarily)
#  Nodes for output layer: 1
#  Activations: leaky_relu in the first hidden layer, tanh in the second, sigmoid on the output

number_input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# Explicit input layer. This replaces the `input_dim` argument on the first Dense
# layer, which newer Keras versions warn against, and keeps the input width in
# sync with the data actually being fed to the model.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=114, activation="leaky_relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=114, activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [125]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
nn.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [126]:
# Fit the network on the scaled training data for 100 epochs; the value returned by fit() is kept in `fit_model`
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.6963 - loss: 0.6030
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7149 - loss: 0.5747
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7177 - loss: 0.5756
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7201 - loss: 0.5687
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7203 - loss: 0.5660
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7255 - loss: 0.5639
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7225 - loss: 0.5627
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7242 - loss: 0.5645
Epoch 9/100
[1m804/804[0m [32

In [127]:
# Score the trained model on the scaled test data and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - 2ms/step - accuracy: 0.7197 - loss: 0.5672
Loss: 0.5672426223754883, Accuracy: 0.719650149345398
