## Preprocessing

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import sklearn as skl
import pandas as pd
import tensorflow as tf


In [None]:

# Load charity_data.csv from its hosted location and preview the first rows.
CHARITY_DATA_URL = "https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(CHARITY_DATA_URL)
application_df.head()

In [None]:
# Remove the identifier columns 'EIN' and 'NAME' (treated as non-beneficial here).
# Note: this mutates application_df in place, so re-running the cell raises a KeyError.
application_df.drop(columns=['EIN', 'NAME'], inplace=True)
application_df.head()

In [None]:
# Determine the number of unique values in each column.
# High counts flag the categorical columns that the later cells bin.
unique_val = application_df.nunique()
unique_val

In [None]:
def apply_cutoff_to_column(df: pd.DataFrame, column: str, cutoff_value: int, new_val_name: str="Other") -> pd.DataFrame:
    """Bin rare values of `column` into a single replacement label.

    Every value whose occurrence count in `df[column]` is strictly less than
    `cutoff_value` is replaced with `new_val_name`. Missing values (NaN/None)
    are never counted and never replaced.

    Args:
        df: DataFrame holding the column to bin. It is modified IN PLACE.
        column: Name of the column to bin.
        cutoff_value: Minimum count a value needs in order to be kept as-is.
        new_val_name: Label that replaces every rare value.

    Returns:
        The same `df` object that was passed in, with `column` rewritten.
    """
    counts = df[column].value_counts()

    # value_counts() drops missing values, so skip them here as well;
    # looking up NaN/None in `counts` would raise a KeyError.
    vals_to_replace = [
        value for value in df[column].dropna().unique()
        if counts[value] < cutoff_value
    ]
    print(f"Replace with '{new_val_name}': {vals_to_replace}")

    # Replace all rare values in a single pass over the column.
    if vals_to_replace:
        df[column] = df[column].replace(vals_to_replace, new_val_name)

    return df

In [None]:
# Look at APPLICATION_TYPE value counts for binning:
# the counts show which application types are rare enough to group together.
v_counts = application_df['APPLICATION_TYPE'].value_counts()
v_counts

In [None]:
# Bin every APPLICATION_TYPE seen fewer than 500 times into "Other".
# apply_cutoff_to_column edits the frame it is given and returns it, so
# app_df_cutoffs and application_df refer to the same object afterwards.
app_df_cutoffs = apply_cutoff_to_column(
    df=application_df,
    column="APPLICATION_TYPE",
    cutoff_value=500,
)
# Check that the binning was successful
app_df_cutoffs['APPLICATION_TYPE'].value_counts()

In [None]:
# Bin every CLASSIFICATION seen fewer than 700 times into "Other".
app_df_cutoffs = apply_cutoff_to_column(
    df=app_df_cutoffs,
    column="CLASSIFICATION",
    cutoff_value=700,
)
app_df_cutoffs['CLASSIFICATION'].value_counts()

In [None]:
# Show the value distribution of the remaining categorical columns.
for col in ["AFFILIATION", "USE_CASE", "ORGANIZATION", "INCOME_AMT"]:
    display(application_df[col].value_counts())
# The last expression is rendered as the cell's own output.
application_df["SPECIAL_CONSIDERATIONS"].value_counts()

In [None]:
# Bin every INCOME_AMT bracket seen fewer than 500 times into "5M+".
app_df_cutoffs = apply_cutoff_to_column(
    df=app_df_cutoffs,
    column="INCOME_AMT",
    cutoff_value=500,
    new_val_name="5M+",
)
app_df_cutoffs['INCOME_AMT'].value_counts()

In [None]:
# Bin every AFFILIATION seen fewer than 1000 times into "Other".
app_df_cutoffs = apply_cutoff_to_column(
    df=app_df_cutoffs,
    column="AFFILIATION",
    cutoff_value=1000,
)
app_df_cutoffs['AFFILIATION'].value_counts()

In [None]:
# Drop SPECIAL_CONSIDERATIONS rather than one-hot encoding it: the flag could be
# very broad and is only true for a tiny percentage (<.1%) of organisations.
application_df.drop(columns="SPECIAL_CONSIDERATIONS", inplace=True)

In [None]:
# One-hot encode the categorical columns with `pd.get_dummies`.
categorical_columns = [
    "AFFILIATION",
    "USE_CASE",
    "ORGANIZATION",
    "INCOME_AMT",
    "CLASSIFICATION",
    "APPLICATION_TYPE",
]
application_dummies = pd.get_dummies(application_df, columns=categorical_columns)
application_dummies.head()

In [None]:
# Separate the target (IS_SUCCESSFUL) from the feature columns as arrays.
y = application_dummies['IS_SUCCESSFUL'].values
X = application_dummies.drop(columns='IS_SUCCESSFUL').values

# Split into training and testing sets, stratified on the target so both
# splits keep the same class balance; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [None]:
# Fit a StandardScaler on the training split only; fit() returns the scaler
# itself, so X_scaler and scaler are the same fitted object.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

## Attempt 1
Only the input data/number of features has changed

In [None]:
# Attempt 1 architecture: two ReLU hidden layers, each twice as wide as the
# number of input features, followed by a single sigmoid output unit.
num_features = X_train.shape[1]
print("Features:", num_features)

hidden_units = num_features * 2
nn = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=hidden_units, activation="relu", input_dim=num_features),
    tf.keras.layers.Dense(units=hidden_units, activation="relu"),
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 50 epochs on the scaled training data; keep the History object for plotting.
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=50)

In [None]:
# Score the trained model on the scaled test split.
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Save the model in HDF5 format.
# NOTE(review): assumes the models/ directory already exists — confirm.
nn.save("models/AlphabetSoupCharity_Optimisation_1.h5")

# Plot training accuracy per epoch, with the index starting at epoch 1.
n_epochs = len(fit_model.history['loss'])
history_df = pd.DataFrame(fit_model.history, index=range(1, n_epochs + 1))
history_df.plot(y='accuracy')

## Attempt 2
Try using fewer neurons (fewer than the number of features), with sigmoid activation on every layer after the first hidden layer

In [None]:
# Attempt 2 architecture: three small hidden layers (10 ReLU, 8 sigmoid,
# 6 sigmoid units) followed by a single sigmoid output unit.
NUM_FEATURES = X_train.shape[1]
print("Features:", NUM_FEATURES)

nn2 = tf.keras.models.Sequential()

nn2.add(tf.keras.layers.Dense(units=10, activation="relu", input_dim=NUM_FEATURES))
nn2.add(tf.keras.layers.Dense(units=8, activation="sigmoid"))
nn2.add(tf.keras.layers.Dense(units=6, activation="sigmoid"))
nn2.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in display() would only add a stray "None" to the cell output.
nn2.summary()
nn2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train for 50 epochs on the scaled training data; keep the History object for plotting.
fit_model2 = nn2.fit(x=X_train_scaled, y=y_train, epochs=50)

In [None]:
# Score the trained model on the scaled test split.
model_loss2, model_accuracy2 = nn2.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss2}, Accuracy: {model_accuracy2}")

# Save the model in HDF5 format.
# NOTE(review): assumes the models/ directory already exists — confirm.
nn2.save("models/AlphabetSoupCharity_Optimisation_2.h5")

# Plot training accuracy per epoch, with the index starting at epoch 1.
n_epochs = len(fit_model2.history['loss'])
history_df = pd.DataFrame(fit_model2.history, index=range(1, n_epochs + 1))
history_df.plot(y='accuracy')

## Attempt 3
Many more internal layers

In [None]:
# Attempt 3 architecture: five ReLU hidden layers, each twice as wide as the
# number of input features, followed by a single sigmoid output unit.
NUM_FEATURES = X_train.shape[1]
print("Features:", NUM_FEATURES)

nn3 = tf.keras.models.Sequential()

# First hidden layer also declares the input size.
nn3.add(tf.keras.layers.Dense(units=NUM_FEATURES*2, activation="relu", input_dim=NUM_FEATURES))
# Four more hidden layers of the same width.
for _ in range(4):
    nn3.add(tf.keras.layers.Dense(units=NUM_FEATURES*2, activation="relu"))
nn3.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in display() would only add a stray "None" to the cell output.
nn3.summary()

# Compile nn3 — the model built in this cell — so that the following
# nn3.fit() call trains a compiled model.
nn3.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train for 50 epochs on the scaled training data; keep the History object for plotting.
fit_model3 = nn3.fit(x=X_train_scaled, y=y_train, epochs=50)

In [None]:
# Score the trained model on the scaled test split.
model_loss3, model_accuracy3 = nn3.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss3}, Accuracy: {model_accuracy3}")

# Save the model in HDF5 format.
# NOTE(review): assumes the models/ directory already exists — confirm.
nn3.save("models/AlphabetSoupCharity_Optimisation_3.h5")

# Plot training accuracy per epoch, with the index starting at epoch 1.
n_epochs = len(fit_model3.history['loss'])
history_df = pd.DataFrame(fit_model3.history, index=range(1, n_epochs + 1))
history_df.plot(y='accuracy')

## Attempt 4 - Keras Tuner
- Allow the tuner to choose the hidden-layer activation function from `relu`, `sigmoid`, `tanh`
    - `sigmoid` is still the only option for the final layer
- Allow number of neurons to vary from 6 to ~75
- Allow for 1, 2, or 3 extra hidden layers (not input or output)

In [None]:
import keras_tuner as kt
NUM_FEATURES = X_train.shape[1]
print("Features:", NUM_FEATURES)

def create_model(hp: kt.HyperParameters) -> tf.keras.models.Sequential:
    """Build and compile a binary classifier from tuner-chosen hyperparameters.

    Tuned hyperparameters:
        activation:  activation shared by all hidden layers (relu/sigmoid/tanh).
        first_units: width of the first hidden layer, 6..NUM_FEATURES*2 in steps of 6.
        num_layers:  number of additional hidden layers (1 to 3).
        units_<i>:   width of additional hidden layer i, same range as first_units.

    The output layer is always a single sigmoid unit, and the model is compiled
    with binary cross-entropy loss, the Adam optimizer and an accuracy metric.
    """
    max_units = NUM_FEATURES * 2
    model = tf.keras.models.Sequential()

    # One activation function is shared by every hidden layer.
    hidden_activation = hp.Choice("activation", ["relu", "sigmoid", "tanh"])

    # First hidden layer; it also declares the input size.
    first_units = hp.Int("first_units", 6, max_units, step=6)
    model.add(tf.keras.layers.Dense(units=first_units,
                                    activation=hidden_activation,
                                    input_dim=NUM_FEATURES))

    # Tuner-chosen number of extra hidden layers, each with its own width.
    extra_layers = hp.Int("num_layers", 1, 3)
    for layer_idx in range(extra_layers):
        layer_units = hp.Int('units_' + str(layer_idx), 6, max_units, step=6)
        model.add(tf.keras.layers.Dense(units=layer_units,
                                        activation=hidden_activation))

    # Output layer
    model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    model.compile(optimizer='adam', loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model

# Hyperband search that maximises validation accuracy; overwrite=True discards
# any results saved by a previous run of the tuner.
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=25,
    hyperband_iterations=2,
    overwrite=True,
)

In [None]:
# Run the kerastuner search for best hyperparameters
# NOTE(review): the test split is passed as validation data, so hyperparameters
# are selected on the same data later used to report test accuracy — consider
# carving a separate validation split out of the training data.
tuner.search(X_train_scaled, y_train, epochs=25,
             validation_data=(X_test_scaled, y_test))

In [None]:
# Print the hyperparameter values of the 3 best trials.
top_3_hp = tuner.get_best_hyperparameters(3)
for hparam in top_3_hp:
    print(hparam.values)

# Evaluate the 3 best models against the test dataset.
# Note: this loop rebinds model_loss / model_accuracy on every iteration.
top_3_models = tuner.get_best_models(3)
for model in top_3_models:
    model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test, verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Take the single best model reported by the tuner and train it for 50 epochs.
best_tuned_model = tuner.get_best_models(num_models=1)[0]
fit_model4 = best_tuned_model.fit(x=X_train_scaled, y=y_train, epochs=50)

In [None]:

# Score the tuned model on the scaled test split.
model_loss4, model_accuracy4 = best_tuned_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss4}, Accuracy: {model_accuracy4}")

# Save the model in HDF5 format.
# NOTE(review): assumes the models/ directory already exists — confirm.
best_tuned_model.save("models/AlphabetSoupCharity_Optimisation_4.h5")

# Plot training accuracy per epoch, with the index starting at epoch 1.
n_epochs = len(fit_model4.history['loss'])
history_df = pd.DataFrame(fit_model4.history, index=range(1, n_epochs + 1))
history_df.plot(y='accuracy')

## Overall Results

None of the 4 different attempts at improving accuracy succeeded in passing 75%.

All four settled around 72.5-72.8% which is not better than the original 72.8% found in `AlphabetSoupCharity.ipynb`.

## Attempt 5 - Bin Names

Final attempt: preserve the `NAME` column.

This column could be used to identify already known organizations that have a history with our nonprofit and have received funding before.

In [None]:
# Repeat the preprocessing on a fresh copy of the data, this time keeping NAME.
# NOTE(review): this URL path uses "dl-1-2" while the first load in the notebook
# uses "dla-1-2" — confirm both point at the same file.
application_df_5 = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")

# Drop 'EIN' column only
application_df_5.drop(columns="EIN", inplace=True)

# (column, cutoff, replacement label) for each categorical column to bin.
# NOTE(review): CLASSIFICATION uses a cutoff of 500 here but 700 earlier in
# the notebook — confirm this is intended.
binning_rules = [
    ("APPLICATION_TYPE", 500, "Other"),
    ("CLASSIFICATION", 500, "Other"),
    ("INCOME_AMT", 500, "5M+"),
    ("AFFILIATION", 1000, "Other"),
]
for column_name, cutoff, label in binning_rules:
    application_df_5 = apply_cutoff_to_column(application_df_5, column_name, cutoff, label)

# Drop SPECIAL_CONSIDERATIONS
application_df_5.drop(columns="SPECIAL_CONSIDERATIONS", inplace=True)

# Apply cutoff to NAME column at 100: names seen fewer than 100 times become "Other".
# NAME could be used to identify already good organizations
application_df_5 = apply_cutoff_to_column(application_df_5, "NAME", 100)
application_df_5["NAME"].nunique()

In [None]:
# One-hot encode the categorical columns, now including the binned NAME column.
dummy_columns_5 = [
    "AFFILIATION",
    "USE_CASE",
    "ORGANIZATION",
    "INCOME_AMT",
    "CLASSIFICATION",
    "APPLICATION_TYPE",
    "NAME",
]
application_df_5 = pd.get_dummies(application_df_5, columns=dummy_columns_5)
print("New Application df:", application_df_5.shape)

# Separate the target (IS_SUCCESSFUL) from the feature columns as arrays.
y2 = application_df_5["IS_SUCCESSFUL"].values
X2 = application_df_5.drop(columns="IS_SUCCESSFUL").values

# Stratified train/test split with a fixed seed.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, stratify=y2, random_state=42
)

# Fit the scaler on the training split only, then scale both splits with it.
scaler2 = StandardScaler().fit(X2_train)
X2_train_scaled, X2_test_scaled = (
    scaler2.transform(split) for split in (X2_train, X2_test)
)

X2_train.shape

In [None]:
# Attempt 5 architecture: two ReLU hidden layers twice as wide as the input,
# one ReLU hidden layer as wide as the input, and a single sigmoid output unit.
num_features = X2_train.shape[1]
print("Features:", num_features)

nn5 = tf.keras.models.Sequential()

nn5.add(tf.keras.layers.Dense(units=num_features*2, activation="relu", input_dim=num_features))
nn5.add(tf.keras.layers.Dense(units=num_features*2, activation="relu"))
nn5.add(tf.keras.layers.Dense(units=num_features, activation="relu"))
nn5.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in display() would only add a stray "None" to the cell output.
nn5.summary()

# Compile the model
nn5.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train for 50 epochs on the scaled training data that includes the NAME dummies.
fit_model5 = nn5.fit(x=X2_train_scaled, y=y2_train, epochs=50)

In [None]:
# Score the trained model on the scaled test split.
model_loss5, model_accuracy5 = nn5.evaluate(X2_test_scaled, y2_test, verbose=2)
print(f"Loss: {model_loss5}, Accuracy: {model_accuracy5}")

# Save the model in HDF5 format.
# NOTE(review): assumes the models/ directory already exists — confirm.
nn5.save("models/AlphabetSoupCharity_Optimisation_5.h5")

# Plot training accuracy per epoch, with the index starting at epoch 1.
n_epochs = len(fit_model5.history['loss'])
history_df = pd.DataFrame(fit_model5.history, index=range(1, n_epochs + 1))
history_df.plot(y='accuracy')

This model has passed 75% accuracy.