In [14]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Import and read the charity_data.csv.
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")


In [15]:
application_df

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...,...,...
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [16]:
# Drop the 'EIN' and 'NAME' columns
application_df = application_df.drop(columns=['EIN', 'NAME'])

# Get the value counts of CLASSIFICATION
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Display the value counts greater than 1
classification_counts_gt1 = classification_counts[classification_counts > 1]
print(classification_counts_gt1)

# Choose a cutoff value (e.g., any value count less than 1000)
cutoff_value = 1000

# Create a list of classifications to be replaced
classifications_to_replace = classification_counts_gt1[classification_counts_gt1 < cutoff_value].index.tolist()

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls, "Other")

# Check to make sure replacement was successful
print(application_df['CLASSIFICATION'].value_counts())

# Convert categorical data to numeric with pd.get_dummies
application_df = pd.get_dummies(application_df)

# Define the target (y) and features (X)
y = application_df['IS_SUCCESSFUL'].values
X = application_df.drop(columns=['IS_SUCCESSFUL']).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
C1235        9
C1237        9
C7210        7
C2400        6
C1720        6
C4100        6
C1257        5
C1600        5
C1260        3
C2710        3
C0           3
C3200        2
C1234        2
C1246        2
C1267        2
C1256        2
Name: count, dtype: int64
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
Other     2235
C3000     1918
C2100     1883
C1370        1
C1236        1
C2190        1
C4200        1
C5200        1
C1248        1
C2600        1
C1580        1
C6100        1
C1820        1
C1900        1
C2570        1

## Optimization 1
- Increasing the number of neurons in each layer
- Adding an additional layer using Relu

In [17]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=150, activation='relu', input_dim=input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=150, activation='relu'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=125, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath='weights.{epoch:02d}.weights.h5', save_weights_only=True, save_freq=5*len(X_train_scaled))
history = nn.fit(X_train_scaled, y_train, epochs=100, callbacks=[checkpoint_callback])


Epoch 1/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 939us/step - accuracy: 0.7158 - loss: 0.5760
Epoch 2/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 907us/step - accuracy: 0.7280 - loss: 0.5567
Epoch 3/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 934us/step - accuracy: 0.7328 - loss: 0.5495
Epoch 4/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 889us/step - accuracy: 0.7338 - loss: 0.5445
Epoch 5/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 921us/step - accuracy: 0.7313 - loss: 0.5480
Epoch 6/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 894us/step - accuracy: 0.7360 - loss: 0.5462
Epoch 7/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 877us/step - accuracy: 0.7364 - loss: 0.5457
Epoch 8/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 851us/step - accuracy: 0.7380 - loss: 0.5406
Epoch 9/100
[1m

In [19]:
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

nn.save("AlphabetSoupCharity_Optimization_1.h5")

if model_accuracy < 0.75:
    print("FAIL")
else:
    print("PASS")

215/215 - 0s - 764us/step - accuracy: 0.7270 - loss: 0.6074




Loss: 0.6074185371398926, Accuracy: 0.7269679307937622
FAIL


## Optimization 2
- Dropping more columns from dataset + Optimization 1

In [20]:
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")

# Drop the 'EIN' and 'NAME' columns
application_df = application_df.drop(columns=['EIN', 'NAME', 'AFFILIATION','USE_CASE','ORGANIZATION'])

# Get the value counts of CLASSIFICATION
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Display the value counts greater than 1
classification_counts_gt1 = classification_counts[classification_counts > 1]
print(classification_counts_gt1)

# Choose a cutoff value (e.g., any value count less than 1000)
cutoff_value = 1000

# Create a list of classifications to be replaced
classifications_to_replace = classification_counts_gt1[classification_counts_gt1 < cutoff_value].index.tolist()

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls, "Other")

# Check to make sure replacement was successful
print(application_df['CLASSIFICATION'].value_counts())

# Convert categorical data to numeric with pd.get_dummies
application_df = pd.get_dummies(application_df)

# Define the target (y) and features (X)
y = application_df['IS_SUCCESSFUL'].values
X = application_df.drop(columns=['IS_SUCCESSFUL']).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
C1235        9
C1237        9
C7210        7
C2400        6
C1720        6
C4100        6
C1257        5
C1600        5
C1260        3
C2710        3
C0           3
C3200        2
C1234        2
C1246        2
C1267        2
C1256        2
Name: count, dtype: int64
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
Other     2235
C3000     1918
C2100     1883
C1370        1
C1236        1
C2190        1
C4200        1
C5200        1
C1248        1
C2600        1
C1580        1
C6100        1
C1820        1
C1900        1
C2570        1

In [21]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=150, activation='relu', input_dim=input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=150, activation='relu'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=125, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath='weights.{epoch:02d}.weights.h5', save_weights_only=True, save_freq=5*len(X_train_scaled))
history = nn.fit(X_train_scaled, y_train, epochs=100, callbacks=[checkpoint_callback])

Epoch 1/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 914us/step - accuracy: 0.6165 - loss: 0.6513
Epoch 2/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 959us/step - accuracy: 0.6203 - loss: 0.6391
Epoch 3/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 888us/step - accuracy: 0.6241 - loss: 0.6377
Epoch 4/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 961us/step - accuracy: 0.6266 - loss: 0.6351
Epoch 5/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 893us/step - accuracy: 0.6275 - loss: 0.6350
Epoch 6/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 873us/step - accuracy: 0.6257 - loss: 0.6353
Epoch 7/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 860us/step - accuracy: 0.6249 - loss: 0.6342
Epoch 8/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 866us/step - accuracy: 0.6255 - loss: 0.6330
Epoch 9/100
[1m

In [23]:
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

nn.save("AlphabetSoupCharity_Optimization_2.h5")

if model_accuracy < 0.75:
    print("FAIL")
else:
    print("PASS")

215/215 - 0s - 1ms/step - accuracy: 0.6201 - loss: 0.6472




Loss: 0.6472188830375671, Accuracy: 0.6201165914535522
FAIL


## Optimization 3 
- Optimization 1 + adjusting testing split ratio & decreasing neurons in each layer.

In [24]:
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")

# Drop the 'EIN' and 'NAME' columns
application_df = application_df.drop(columns=['EIN', 'NAME'])

# Get the value counts of CLASSIFICATION
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Display the value counts greater than 1
classification_counts_gt1 = classification_counts[classification_counts > 1]
print(classification_counts_gt1)

# Choose a cutoff value (e.g., any value count less than 1000)
cutoff_value = 1000

# Create a list of classifications to be replaced
classifications_to_replace = classification_counts_gt1[classification_counts_gt1 < cutoff_value].index.tolist()

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls, "Other")

# Check to make sure replacement was successful
print(application_df['CLASSIFICATION'].value_counts())

# Convert categorical data to numeric with pd.get_dummies
application_df = pd.get_dummies(application_df)

# Define the target (y) and features (X)
y = application_df['IS_SUCCESSFUL'].values
X = application_df.drop(columns=['IS_SUCCESSFUL']).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
C1235        9
C1237        9
C7210        7
C2400        6
C1720        6
C4100        6
C1257        5
C1600        5
C1260        3
C2710        3
C0           3
C3200        2
C1234        2
C1246        2
C1267        2
C1256        2
Name: count, dtype: int64
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
Other     2235
C3000     1918
C2100     1883
C1370        1
C1236        1
C2190        1
C4200        1
C5200        1
C1248        1
C2600        1
C1580        1
C6100        1
C1820        1
C1900        1
C2570        1

In [25]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=8, activation='relu', input_dim=input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=8, activation='relu'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=8, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [26]:
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath='weights.{epoch:02d}.weights.h5', save_weights_only=True, save_freq=5*len(X_train_scaled))
history = nn.fit(X_train_scaled, y_train, epochs=100, callbacks=[checkpoint_callback])

Epoch 1/100
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 634us/step - accuracy: 0.6395 - loss: 0.6603
Epoch 2/100
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 487us/step - accuracy: 0.7257 - loss: 0.5790
Epoch 3/100
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 467us/step - accuracy: 0.7306 - loss: 0.5623
Epoch 4/100
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 490us/step - accuracy: 0.7335 - loss: 0.5544
Epoch 5/100
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 569us/step - accuracy: 0.7330 - loss: 0.5525
Epoch 6/100
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 504us/step - accuracy: 0.7304 - loss: 0.5538
Epoch 7/100
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 561us/step - accuracy: 0.7348 - loss: 0.5491
Epoch 8/100
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 581us/step - accuracy: 0.7328 - loss: 0.5540
Epoch 9/100
[1m

In [27]:
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

nn.save("AlphabetSoupCharity_Optimization_3.h5")

if model_accuracy < 0.75:
    print("FAIL")
else:
    print("PASS")

108/108 - 0s - 1ms/step - accuracy: 0.7204 - loss: 0.5664




Loss: 0.5663861632347107, Accuracy: 0.7204081416130066
FAIL
