In [1]:
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
#%matplotlib inline

In [3]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import os
import keras_tuner as kt


### Define functions for use during optimization

In [4]:
def reduce_count_vals(df, colname, threshold):
    """Bin the rare categories of ``df[colname]`` into a single "Other" value.

    Every value that occurs fewer than ``threshold`` times in the column is
    replaced by the string "Other". The column is reassigned on ``df`` itself,
    so the caller's DataFrame is modified.

    Args:
        df: pandas DataFrame that holds the column to bin (modified in place).
        colname: Name of the column whose rare values are binned.
        threshold: Minimum number of occurrences a value needs to be kept.

    Returns:
        The value counts of the column after binning, so the caller can
        inspect the result. (These counts used to be computed and discarded.)
    """
    counts = df[colname].value_counts()
    rare_values = counts[counts < threshold].index

    # One vectorised pass instead of one full-column replace() per rare value.
    is_rare = df[colname].isin(rare_values)
    df[colname] = df[colname].where(~is_rare, "Other")

    # Hand the post-binning counts back so the check is actually visible.
    return df[colname].value_counts()

In [5]:
def do_one_hot(df, y_col):
    """One-hot encode the object-typed columns and split features from target.

    Columns whose dtype is ``object`` are expanded into one float 0/1 column
    per category, named ``<column>_<category>`` and appended after the
    remaining columns. The target column is never encoded.

    The encoding is done with ``pd.get_dummies``, which keeps the frame's own
    index. The earlier implementation merged a freshly indexed encoder output
    back onto ``df`` and therefore lost or misaligned rows whenever ``df`` did
    not carry a default 0..n-1 index; it also relied on scikit-learn and
    pandas call forms that newer releases no longer accept.

    Args:
        df: pandas DataFrame containing the features and the target column.
            The caller's frame is not modified.
        y_col: Name of the target column.

    Returns:
        A tuple ``(features_df, X, y)``: the encoded frame without the target
        column, its values as an array, and the target values as an array.
    """
    # 'object' columns are treated as categorical; the target is left alone.
    obj_cat = [col for col in df.dtypes[df.dtypes == "object"].index if col != y_col]

    # Nothing to encode is a valid input: skip straight to the split.
    if obj_cat:
        df = pd.get_dummies(df, columns=obj_cat, dtype=float)

    # Split the preprocessed data into the features and target arrays.
    y = df[y_col].values
    df = df.drop(columns=[y_col])
    X = df.values
    print(f"merged df.shape()={df.shape}")

    return df, X, y

In [6]:
def do_scatter_plots(df):
    """Draw one scatter plot for every unordered pair of the eight feature columns.

    Each plot is a 6x6 figure produced by ``df.plot.scatter``. Points are
    coloured by ``df['IS_SUCCESSFUL']`` with the 'winter' colormap and sized
    by ``df['ASK_AMT'] * 50``.

    Args:
        df: Object exposing ``plot.scatter`` and column access by name for
            the feature columns, 'IS_SUCCESSFUL' and 'ASK_AMT'.
    """
    features = (
        'APPLICATION_TYPE',
        'AFFILIATION',
        'CLASSIFICATION',
        'USE_CASE',
        'ORGANIZATION',
        'INCOME_AMT',
        'SPECIAL_CONSIDERATIONS',
        'ASK_AMT',
    )

    for x_pos, x_name in enumerate(features):
        # Pair each column only with the columns that follow it, so every
        # pair is plotted once and no column is plotted against itself.
        for y_name in features[x_pos + 1:]:
            df.plot.scatter(
                x=x_name,
                y=y_name,
                title=f"x={x_name} vs. y={y_name}",
                xlabel=x_name,
                ylabel=y_name,
                figsize=(6,6),
                c=df['IS_SUCCESSFUL'],
                s=(df['ASK_AMT']*50),
                colormap='winter',
            )


In [7]:
def do_scale(scaler, X_train=None, X_test=None):
    """Fit ``scaler`` on the training features and apply it to both splits.

    The scaler is fitted on the training data only and the test data is
    transformed with those same fitted parameters. Re-fitting on the test
    split (as this function used to do) scales the two splits differently
    and makes the evaluation on the test set unreliable.

    Args:
        scaler: Object with ``fit_transform`` and ``transform`` methods.
        X_train: Training features. When omitted, the notebook-level
            ``X_train`` variable is used, which is how existing calls of the
            form ``do_scale(scaler)`` keep working.
        X_test: Test features. When omitted, the notebook-level ``X_test``
            variable is used.

    Returns:
        A tuple ``(X_train_scaled, X_test_scaled)``.

    Raises:
        NameError: If a split is neither passed in nor defined at notebook
            level.
    """
    # Backward compatibility: fall back to the notebook's global splits.
    if X_train is None:
        X_train = globals().get("X_train")
    if X_test is None:
        X_test = globals().get("X_test")
    if X_train is None or X_test is None:
        raise NameError(
            "do_scale needs X_train and X_test, either as arguments or as notebook-level variables"
        )

    # Fit on the training split only, then reuse that fit for the test split.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

In [8]:
def build_model(inputs, layers):
    """Build and compile a fully connected binary classifier.

    Args:
        inputs: Number of input features.
        layers: Iterable of dicts, one per Dense layer in order, each with
            the keys 'units' (layer width) and 'act' (activation name).

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).

    Raises:
        ValueError: If ``layers`` is empty.
    """
    layers = list(layers)
    if not layers:
        raise ValueError("build_model requires at least one layer specification")

    nn = tf.keras.models.Sequential()
    # Declaring the input shape once up front means every Dense layer can be
    # added the same way, with no special case for the first one.
    nn.add(tf.keras.Input(shape=(inputs,)))
    for layer in layers:
        nn.add(tf.keras.layers.Dense(units=layer['units'], activation=layer['act']))

    # summary() prints the table itself; wrapping it in print() only added a
    # stray "None" line after it.
    nn.summary()
    nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return nn

In [9]:
def train_nn_model(model, X_train, y_train, n_epochs, checkpoint_dir):
    """Fit ``model`` while periodically saving its weights to ``checkpoint_dir``.

    Args:
        model: Compiled model exposing ``fit``.
        X_train: Training features passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        n_epochs: Number of epochs to train for.
        checkpoint_dir: Directory that receives the weight checkpoints. It is
            created if missing. (This argument used to be ignored in favour
            of a hard-coded "checkpoints_opt" directory.)

    Returns:
        Whatever ``model.fit`` returns.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)

    # "{epoch:02d}" is deliberately NOT an f-string field: it stays in the
    # path as a placeholder and the checkpoint callback substitutes the epoch
    # number when it writes each file.
    checkpoint_path = os.path.join(checkpoint_dir, "weights_2.{epoch:02d}.hdf5")

    # Normally 'save_freq' would be used, but it behaved strangely here and
    # would not save every 5 epochs. The 'period' parameter is deprecated,
    # but it works.
    cp_callback_opt = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        verbose=1,
        save_weights_only=True,
        period=5)

    fit_model = model.fit(X_train, y_train, epochs=n_epochs, callbacks=[cp_callback_opt])
    return fit_model

In [10]:
def run_keras_tuner(X_train, y_train, n_epochs, validation_data):
    """Run a Hyperband hyperparameter search and return the tuner.

    The tuner is built around ``create_tuner_model`` and optimises
    ``val_accuracy``.

    Args:
        X_train: Training features forwarded to ``tuner.search``.
        y_train: Training targets forwarded to ``tuner.search``.
        n_epochs: ``epochs`` value forwarded to ``tuner.search``.
        validation_data: ``validation_data`` value forwarded to ``tuner.search``.

    Returns:
        The ``kt.Hyperband`` tuner after the search has run.
    """
    # Settings for the Hyperband tuner instance.
    hyperband_settings = {
        "objective": "val_accuracy",
        "max_epochs": 50,
        "hyperband_iterations": 2,
        "overwrite": True,
    }
    tuner = kt.Hyperband(create_tuner_model, **hyperband_settings)

    # Search for the best hyperparameters on the supplied data.
    search_settings = {"epochs": n_epochs, "validation_data": validation_data}
    tuner.search(X_train, y_train, **search_settings)
    return tuner

In [11]:
def range_to_int(range_str):
    """Translate a range label into a single integer.

    Known labels map to the integer listed in the table below (the upper end
    of the range; "50M+" maps to 100,000,000). Any value that matches no
    label returns 11.

    Args:
        range_str: The label to translate, e.g. "1-9999" or "1M-5M".

    Returns:
        The integer associated with the label, or 11 when it is not recognised.
    """
    label_values = (
        ("0", 0),
        ("1-9999", 9_999),
        ("10000-24999", 24_999),
        ("25000-99999", 99_999),
        ("100000-499999", 499_999),
        ("500000-1000000", 1_000_000),
        ("500000-1M", 1_000_000),
        ("1M-5M", 5_000_000),
        ("5M-10M", 10_000_000),
        ("10M-50M", 50_000_000),
        ("50M+", 100_000_000),
    )
    for label, value in label_values:
        if range_str == label:
            return value

    # No label matched.
    return 11


In [12]:
def chunk_ask(ask):
    """Convert a numeric ask amount into its range label.

    An amount of exactly 0 gets the label "0". Otherwise the amount is placed
    in the first bin whose exclusive upper bound it is below; anything at or
    above 50,000,000 is labelled "50M+".

    The bounds are chosen so that the edge values named in the labels fall
    inside their own bin: 9999 is "1-9999", 24999 is "10000-24999", and so on.
    (The previous ``ask < 9999`` style comparisons pushed each of those edge
    values into the next bin up.)

    Args:
        ask: The amount to classify.

    Returns:
        The range label as a string.
    """
    if ask == 0:
        return "0"

    # (exclusive upper bound, label), in ascending order.
    bins = (
        (10_000, "1-9999"),
        (25_000, "10000-24999"),
        (100_000, "25000-99999"),
        (500_000, "100000-499999"),
        (1_000_000, "500000-1000000"),
        (5_000_000, "1M-5M"),
        (10_000_000, "5M-10M"),
        (50_000_000, "10M-50M"),
    )
    for upper_bound, label in bins:
        if ask < upper_bound:
            return label
    return "50M+"


In [13]:
def encode_ask(ask):
    """Convert a numeric ask amount into an ordinal bin code from 1 to 9.

    The amount is placed in the first bin whose exclusive upper bound it is
    below, and that bin's 1-based position is returned; anything at or above
    50,000,000 returns 9.

    The bounds are 10,000 / 25,000 / 100,000 / 500,000 rather than
    9,999 / 24,999 / 99,999 / 499,999 so that those four edge values stay in
    the lower bin instead of spilling into the next one.

    Args:
        ask: The amount to encode.

    Returns:
        An int between 1 and 9.
    """
    # Exclusive upper bounds of bins 1..8, in ascending order.
    upper_bounds = (
        10_000,
        25_000,
        100_000,
        500_000,
        1_000_000,
        5_000_000,
        10_000_000,
        50_000_000,
    )
    for code, upper_bound in enumerate(upper_bounds, start=1):
        if ask < upper_bound:
            return code
    return 9


### Preprocess the data, then build, train, and evaluate the optimized model

In [22]:
# Load the charity applications.
application_df = pd.read_csv("Resources/charity_data.csv")

# Every column present in the CSV, listed for reference.
csv_cols = [
    'EIN', 'NAME', 'APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION', 'USE_CASE',
    'ORGANIZATION', 'STATUS', 'INCOME_AMT', 'SPECIAL_CONSIDERATIONS', 'ASK_AMT',
    'IS_SUCCESSFUL',
]

# Columns left out of this optimization attempt.
application_df = application_df.drop(
    columns=['EIN', 'NAME', 'INCOME_AMT', 'SPECIAL_CONSIDERATIONS', 'STATUS']
)

# Bin the rare APPLICATION_TYPE / CLASSIFICATION values.
reduce_count_vals(application_df, colname='APPLICATION_TYPE', threshold=500)
reduce_count_vals(application_df, colname='CLASSIFICATION', threshold=1500)

# Turn ASK_AMT into a range label so it is one-hot encoded with the other
# categorical columns. Alternatives tried: range_to_int on INCOME_AMT and
# encode_ask on ASK_AMT.
application_df = application_df.assign(
    ASK_AMT=application_df["ASK_AMT"].apply(chunk_ask)
)

# Encode, then separate the features from the IS_SUCCESSFUL target.
application_df, X, y = do_one_hot(application_df, 'IS_SUCCESSFUL')

# Train/test split followed by standard scaling (MinMaxScaler is the
# alternative that was tried here).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
X_train_scaled, X_test_scaled = do_scale(StandardScaler())

# One network input per encoded feature column.
nins = len(application_df.columns)
print(f"nins={nins}")

# Layer stack as (units, activation) pairs, in order from input to output.
nn_model = build_model(
    inputs=nins,
    layers=[
        {'units': units, 'act': act}
        for units, act in (
            [(340, 'tanh')] * 2
            + [(170, 'tanh')] * 2
            + [(85, 'tanh')]
            + [(40, 'relu')] * 6
            + [(20, 'sigmoid'), (5, 'sigmoid'), (1, 'sigmoid')]
        )
    ],
)
trained_model = train_nn_model(
    model=nn_model,
    X_train=X_train_scaled,
    y_train=y_train,
    n_epochs=1500,
    checkpoint_dir="checkpoints_opt",
)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Export our model to HDF5 file
nn_model.save("AlphabetSoupCharity_opt4.h5")

  del sys.path[0]


merged df.shape()=(34299, 39)
nins=39
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_36 (Dense)             (None, 340)               13600     
_________________________________________________________________
dense_37 (Dense)             (None, 340)               115940    
_________________________________________________________________
dense_38 (Dense)             (None, 170)               57970     
_________________________________________________________________
dense_39 (Dense)             (None, 170)               29070     
_________________________________________________________________
dense_40 (Dense)             (None, 85)                14535     
_________________________________________________________________
dense_41 (Dense)             (None, 40)                3440      
_________________________________________________________________
dense_42 (Dense)

Epoch 52/1500
Epoch 53/1500
Epoch 54/1500
Epoch 55/1500

Epoch 00055: saving model to checkpoints_opt\weights_2.55.hdf5
Epoch 56/1500
Epoch 57/1500
Epoch 58/1500
Epoch 59/1500
Epoch 60/1500

Epoch 00060: saving model to checkpoints_opt\weights_2.60.hdf5
Epoch 61/1500
Epoch 62/1500
Epoch 63/1500
Epoch 64/1500
Epoch 65/1500

Epoch 00065: saving model to checkpoints_opt\weights_2.65.hdf5
Epoch 66/1500
Epoch 67/1500
Epoch 68/1500
Epoch 69/1500
Epoch 70/1500

Epoch 00070: saving model to checkpoints_opt\weights_2.70.hdf5
Epoch 71/1500
Epoch 72/1500
Epoch 73/1500
Epoch 74/1500
Epoch 75/1500

Epoch 00075: saving model to checkpoints_opt\weights_2.75.hdf5
Epoch 76/1500
Epoch 77/1500
Epoch 78/1500
Epoch 79/1500
Epoch 80/1500

Epoch 00080: saving model to checkpoints_opt\weights_2.80.hdf5
Epoch 81/1500
Epoch 82/1500
Epoch 83/1500
Epoch 84/1500
Epoch 85/1500

Epoch 00085: saving model to checkpoints_opt\weights_2.85.hdf5
Epoch 86/1500
Epoch 87/1500
Epoch 88/1500
Epoch 89/1500
Epoch 90/1500

Epoch

KeyboardInterrupt: 

In [None]:
#%matplotlib inline
#application_df  = pd.read_csv("Resources/charity_data.csv")
#application_df.drop(columns=['EIN', 'NAME'], inplace=True)
#do_scatter_plots(application_df)

In [17]:
# Re-read the CSV so the inspection cells that follow start from the file's
# original columns.
application_df = pd.read_csv("Resources/charity_data.csv")
# Show each column's dtype; the 'object' columns are the ones do_one_hot() encodes.
application_df.dtypes

EIN                        int64
NAME                      object
APPLICATION_TYPE          object
AFFILIATION               object
CLASSIFICATION            object
USE_CASE                  object
ORGANIZATION              object
STATUS                     int64
INCOME_AMT                object
SPECIAL_CONSIDERATIONS    object
ASK_AMT                    int64
IS_SUCCESSFUL              int64
dtype: object

In [18]:
# Count the distinct values in every column.
application_df.nunique()

EIN                       34299
NAME                      19568
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               71
USE_CASE                      5
ORGANIZATION                  4
STATUS                        2
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
ASK_AMT                    8747
IS_SUCCESSFUL                 2
dtype: int64

In [19]:
# Replace each ASK_AMT amount with its chunk_ask() range label, then recount
# the distinct values per column to see the effect.
application_df["ASK_AMT"] = application_df["ASK_AMT"].apply(chunk_ask)
application_df.nunique()

EIN                       34299
NAME                      19568
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               71
USE_CASE                      5
ORGANIZATION                  4
STATUS                        2
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
ASK_AMT                       9
IS_SUCCESSFUL                 2
dtype: int64

In [20]:
# Bin APPLICATION_TYPE values seen fewer than 500 times and CLASSIFICATION
# values seen fewer than 1500 times into "Other", then recount the distinct
# values per column.
reduce_count_vals(application_df, 'APPLICATION_TYPE', 500)
reduce_count_vals(application_df, 'CLASSIFICATION', 1500)
application_df.nunique()

EIN                       34299
NAME                      19568
APPLICATION_TYPE              9
AFFILIATION                   6
CLASSIFICATION                6
USE_CASE                      5
ORGANIZATION                  4
STATUS                        2
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
ASK_AMT                       9
IS_SUCCESSFUL                 2
dtype: int64