## Preprocessing

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import sklearn as skl
import pandas as pd
import tensorflow as tf


In [None]:

# Read charity_data.csv from its hosted location and preview the first rows.
charity_csv_url = "https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(charity_csv_url)
application_df.head()

In [None]:
# The identifier columns 'EIN' and 'NAME' are non-beneficial for modelling,
# so remove them from the frame (in place) and preview the result.
application_df.drop(columns=['EIN', 'NAME'], inplace=True)
application_df.head()

In [None]:
# Count the distinct (non-missing) values held by every column.
unique_val = application_df.nunique(axis=0, dropna=True)
unique_val

In [None]:
def apply_cutoff_to_column(df: pd.DataFrame, column: str, cutoff_value: int, new_val_name: str="Other") -> pd.DataFrame:
    """Bin the rare categories of ``column`` into a single replacement label.

    Every value of ``df[column]`` that occurs fewer than ``cutoff_value`` times
    is replaced with ``new_val_name``. Missing values (NaN/None) are neither
    counted nor replaced.

    Note:
        ``df`` is modified **in place**; the returned frame is the very same
        object that was passed in, so every other name bound to it sees the
        binned column as well.

    Args:
        df: Frame holding the column to bin (mutated in place).
        column: Name of the column whose rare values are binned.
        cutoff_value: Minimum number of occurrences a value needs to be kept.
        new_val_name: Label substituted for every rare value.

    Returns:
        The same ``df`` object, with ``column`` binned.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    _value_counts = df[column].value_counts()
    _rare_values = set(_value_counts[_value_counts < cutoff_value].index)

    # `value_counts` skips missing values while `unique` reports them, so the
    # missing entries are dropped before matching; looking a NaN/None up in the
    # counts would raise KeyError. Iterating `unique()` keeps the reported
    # values in order of first appearance.
    _vals_to_replace = [val for val in df[column].dropna().unique() if val in _rare_values]
    print(f"Replace with '{new_val_name}': {_vals_to_replace}")

    # Replace all rare values in a single pass rather than once per value
    if _vals_to_replace:
        df[column] = df[column].replace(_vals_to_replace, new_val_name)

    return df

In [None]:
# Tally how often each APPLICATION_TYPE occurs; the counts guide the binning cutoff.
application_types = application_df['APPLICATION_TYPE']
v_counts = application_types.value_counts()
v_counts

In [None]:
# Bin every application type that occurs fewer than 500 times into "Other"
# (the helper's default label), then re-count to check the binning was successful.
# The helper returns the frame it was given, so `app_df_cutoffs` and
# `application_df` refer to the same object.
app_df_cutoffs = apply_cutoff_to_column(application_df, column="APPLICATION_TYPE", cutoff_value=500)
app_df_cutoffs["APPLICATION_TYPE"].value_counts()

In [None]:
# Bin every CLASSIFICATION value that occurs fewer than 700 times, then re-count.
app_df_cutoffs = apply_cutoff_to_column(app_df_cutoffs, column="CLASSIFICATION", cutoff_value=700)
app_df_cutoffs["CLASSIFICATION"].value_counts()

In [None]:
# Show the value counts of four more columns in order; the final expression
# (SPECIAL_CONSIDERATIONS) is rendered as the cell's own output.
display(*(
    application_df[_name].value_counts()
    for _name in ("AFFILIATION", "USE_CASE", "ORGANIZATION", "INCOME_AMT")
))
application_df["SPECIAL_CONSIDERATIONS"].value_counts()

In [None]:
# INCOME_AMT cutoff = 500; values below the cutoff are merged under the label "5M+"
app_df_cutoffs = apply_cutoff_to_column(
    app_df_cutoffs, column="INCOME_AMT", cutoff_value=500, new_val_name="5M+"
)
app_df_cutoffs["INCOME_AMT"].value_counts()

In [None]:
# Bin every AFFILIATION value that occurs fewer than 1000 times, then re-count.
app_df_cutoffs = apply_cutoff_to_column(app_df_cutoffs, column="AFFILIATION", cutoff_value=1000)
app_df_cutoffs["AFFILIATION"].value_counts()

In [None]:
# Remove SPECIAL_CONSIDERATIONS rather than dummy-encoding it: the flag could be
# very broad and is only true for a tiny percentage (<.1%) of organisations.
# The drop is done in place on `application_df`.
application_df.drop(columns="SPECIAL_CONSIDERATIONS", inplace=True)

In [None]:
# One-hot encode the listed categorical columns with `pd.get_dummies`;
# columns not named here are passed through unchanged.
categorical_columns = [
    "AFFILIATION",
    "USE_CASE",
    "ORGANIZATION",
    "INCOME_AMT",
    "CLASSIFICATION",
    "APPLICATION_TYPE",
]
application_dummies = pd.get_dummies(application_df, columns=categorical_columns)
application_dummies.head()

In [None]:
# Separate the target column from the feature columns, both as arrays
target_column = 'IS_SUCCESSFUL'
y = application_dummies[target_column].values
X = application_dummies.drop(columns=target_column).values

# Split into training and testing sets; the split is seeded and stratified on y
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# Create a StandardScaler instance and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to the training split, then to the test split
X_train_scaled, X_test_scaled = (
    X_scaler.transform(_split) for _split in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

Attempt 1

In [None]:
# Define the model - a deep neural net whose input width and hidden-layer sizes
# are derived from the number of input features.
num_features = X_train.shape[1]
print("Features:", num_features)

# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation (and therefore the reported loss/accuracy) is repeatable on
# Restart & Run All. The value matches the seed used for the train/test split.
tf.keras.utils.set_random_seed(42)

nn = tf.keras.models.Sequential()

# Declare the input shape with an explicit Input object; passing `input_dim`
# to the first Dense layer is deprecated in recent Keras releases.
nn.add(tf.keras.Input(shape=(num_features,)))

# number of neurons should be 2-3 times number of inputs=number of features;
# two hidden layers of that width are used here
nn.add(tf.keras.layers.Dense(units=num_features*2, activation="relu"))
nn.add(tf.keras.layers.Dense(units=num_features*2, activation="relu"))

# there is only 1 output, squashed to (0, 1) by the sigmoid
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


# Check the structure of the model
nn.summary()

In [None]:
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train the model on the scaled training features for 50 epochs;
# the returned object holds the per-epoch training history.
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=50)

In [None]:
# Score the trained model on the scaled test data and report loss and accuracy
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Plot the training accuracy per epoch, with epochs numbered from 1
epoch_count = len(fit_model.history['loss'])
history_df = pd.DataFrame(fit_model.history, index=range(1, epoch_count + 1))
history_df.plot(y='accuracy')

Attempt 2