<a href="https://colab.research.google.com/github/jasonstonemed/deep-learning-challenge/blob/main/Starter_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv").sample(n=25000, random_state=42)

application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
7346,271598055,ASSOCIATION OF ST AUGUSTINE ALUMNI - ATLANTA INC,T3,Independent,C1000,Preservation,Trust,1,100000-499999,N,50087,1
30868,900109768,AMERICAN SOCIETY OF ADDICTION MEDICINE INC,T3,CompanySponsored,C1000,Preservation,Trust,1,0,N,5000,1
10670,352562499,TOPS CLUB INC,T3,CompanySponsored,C1200,Preservation,Association,1,0,N,5000,0
22408,611766879,TOPS CLUB INC,T3,CompanySponsored,C1200,Preservation,Association,1,0,N,5000,0
119,10874048,INTERNATIONAL RAFTING FEDERATION,T4,Independent,C1000,ProductDev,Trust,1,0,N,5000,1


In [2]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df.drop(columns= ['EIN', 'NAME'])


Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
7346,T3,Independent,C1000,Preservation,Trust,1,100000-499999,N,50087,1
30868,T3,CompanySponsored,C1000,Preservation,Trust,1,0,N,5000,1
10670,T3,CompanySponsored,C1200,Preservation,Association,1,0,N,5000,0
22408,T3,CompanySponsored,C1200,Preservation,Association,1,0,N,5000,0
119,T4,Independent,C1000,ProductDev,Trust,1,0,N,5000,1
...,...,...,...,...,...,...,...,...,...,...
3675,T8,CompanySponsored,C1000,Preservation,Trust,1,0,N,5000,0
15450,T3,Independent,C2000,Preservation,Trust,1,0,N,5000,1
20608,T3,CompanySponsored,C1000,Preservation,Association,1,100000-499999,N,19585,0
17348,T3,CompanySponsored,C2100,Preservation,Trust,1,0,N,5000,0


In [3]:
# Determine the number of unique values in each column.
application_df.nunique()

EIN                       25000
NAME                      14531
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               61
USE_CASE                      5
ORGANIZATION                  4
STATUS                        2
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
ASK_AMT                    6400
IS_SUCCESSFUL                 2
dtype: int64

In [4]:
# Look at APPLICATION_TYPE value counts for binning
application_counts = application_df.APPLICATION_TYPE.value_counts()
application_counts


T3     19718
T4      1122
T6       896
T5       866
T19      760
T8       531
T7       515
T10      389
T9       113
T13       47
T12       19
T2        15
T25        3
T14        2
T29        2
T15        1
T17        1
Name: APPLICATION_TYPE, dtype: int64

In [5]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
application_types_to_replace = list(application_counts[application_counts < 500].index)

# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure binning was successful
application_df['APPLICATION_TYPE'].value_counts()

T3       19718
T4        1122
T6         896
T5         866
T19        760
Other      592
T8         531
T7         515
Name: APPLICATION_TYPE, dtype: int64

In [6]:
# Look at CLASSIFICATION value counts for binning
classification_counts = application_df.CLASSIFICATION.value_counts()
classification_counts


C1000    12671
C2000     4404
C1200     3512
C3000     1398
C2100     1357
         ...  
C8210        1
C1900        1
C2190        1
C2600        1
C1245        1
Name: CLASSIFICATION, Length: 61, dtype: int64

In [7]:
# You may find it helpful to look at CLASSIFICATION value counts >1
classification_counts[classification_counts > 1]



C1000    12671
C2000     4404
C1200     3512
C3000     1398
C2100     1357
C7000      569
C1700      200
C4000      133
C1270       92
C5000       87
C2700       82
C2800       71
C7100       57
C1300       44
C1280       33
C1230       29
C1400       26
C7200       25
C2300       23
C1240       21
C8000       16
C7120       15
C1500       14
C6000       13
C1250       10
C1235        9
C8200        9
C1237        8
C1800        8
C1238        8
C1278        5
C1257        5
C4100        5
C2400        4
C7210        3
C1260        3
C1256        2
C1246        2
C1234        2
C1600        2
C0           2
C1720        2
Name: CLASSIFICATION, dtype: int64

In [8]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
classifications_to_replace = list(classification_counts[classification_counts < 1000].index)

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure binning was successful
application_df['CLASSIFICATION'].value_counts()

C1000    12671
C2000     4404
C1200     3512
Other     1658
C3000     1398
C2100     1357
Name: CLASSIFICATION, dtype: int64

In [9]:
# Convert categorical data to numeric with `pd.get_dummies`
application_df = pd.get_dummies(application_df)
application_df




Unnamed: 0,EIN,STATUS,ASK_AMT,IS_SUCCESSFUL,NAME_1 DAY RANCH RESCUE AND RURAL OKLAHOMA ANIMAL RESOURCE INC,NAME_100 BLACK MEN OF MEMPHIS INC,NAME_100 BLACK MEN OF WEST GEORGIA INC,NAME_13TH BOMB SQUADRON ASSOCIATION,NAME_14TH ARMORED DIVISION,NAME_150TH INTELLIGENCE SQUADRON ENLISTED ADVISORY COUNCIL,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
7346,271598055,1,50087,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
30868,900109768,1,5000,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
10670,352562499,1,5000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
22408,611766879,1,5000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
119,10874048,1,5000,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3675,237214594,1,5000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
15450,462659357,1,5000,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
20608,521242195,1,19585,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
17348,470985271,1,5000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
# Split our preprocessed data into our features and target arrays
y = application_df['IS_SUCCESSFUL'].values
X = application_df.drop(columns=['IS_SUCCESSFUL']).values

# Split the preprocessed data into a training and testing dataset
X_test, X_train, y_test, y_train = train_test_split(X,y, random_state=78)


In [11]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [12]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=5, activation="relu", input_dim=14574))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=5, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 5)                 72875     
                                                                 
 dense_1 (Dense)             (None, 5)                 30        
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 72,911
Trainable params: 72,911
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])


In [14]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=81)


Epoch 1/81
Epoch 2/81
Epoch 3/81
Epoch 4/81
Epoch 5/81
Epoch 6/81
Epoch 7/81
Epoch 8/81
Epoch 9/81
Epoch 10/81
Epoch 11/81
Epoch 12/81
Epoch 13/81
Epoch 14/81
Epoch 15/81
Epoch 16/81
Epoch 17/81
Epoch 18/81
Epoch 19/81
Epoch 20/81
Epoch 21/81
Epoch 22/81
Epoch 23/81
Epoch 24/81
Epoch 25/81
Epoch 26/81
Epoch 27/81
Epoch 28/81
Epoch 29/81
Epoch 30/81
Epoch 31/81
Epoch 32/81
Epoch 33/81
Epoch 34/81
Epoch 35/81
Epoch 36/81
Epoch 37/81
Epoch 38/81
Epoch 39/81
Epoch 40/81
Epoch 41/81
Epoch 42/81
Epoch 43/81
Epoch 44/81
Epoch 45/81
Epoch 46/81
Epoch 47/81
Epoch 48/81
Epoch 49/81
Epoch 50/81
Epoch 51/81
Epoch 52/81
Epoch 53/81
Epoch 54/81
Epoch 55/81
Epoch 56/81
Epoch 57/81
Epoch 58/81
Epoch 59/81
Epoch 60/81
Epoch 61/81
Epoch 62/81
Epoch 63/81
Epoch 64/81
Epoch 65/81
Epoch 66/81
Epoch 67/81
Epoch 68/81
Epoch 69/81
Epoch 70/81
Epoch 71/81
Epoch 72/81
Epoch 73/81
Epoch 74/81
Epoch 75/81
Epoch 76/81
Epoch 77/81
Epoch 78/81
Epoch 79/81
Epoch 80/81
Epoch 81/81


In [15]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

586/586 - 2s - loss: 0.6581 - accuracy: 0.7674 - 2s/epoch - 4ms/step
Loss: 0.6580985188484192, Accuracy: 0.7673599720001221


In [16]:
# Export our model to HDF5 file
nn.save("AlphabetSoupCharity.h5")
