## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
# pandas is already imported with the other dependencies at the top of this cell,
# so it is not imported a second time here.
df_raw = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
df_raw.head(10)

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
5,10556855,MINORITY ORGAN & TISSUE TRANSPLANT & EDUCATION...,T3,Independent,C1200,Preservation,Trust,1,0,N,5000,1
6,10558440,FRIENDS OF ARTS COUNCIL OF GREATER DENHAM SPRI...,T3,Independent,C1000,Preservation,Trust,1,100000-499999,N,31452,1
7,10566033,ISRAEL EMERGENCY ALLIANCE,T3,Independent,C2000,Preservation,Trust,1,10M-50M,N,7508025,1
8,10570430,ARAMCO BRATS INC,T7,Independent,C1000,ProductDev,Trust,1,1-9999,N,94389,1
9,10571689,INTERNATIONAL ASSOCIATION OF FIRE FIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0


In [2]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
# A later cell adds an engineered 'name_counts' column to df_raw in place. Drop it here as
# well (when present) so that re-running this cell always rebuilds the same baseline
# feature set, regardless of which cells have already been executed.
df1 = (
    df_raw
    .drop(columns=['EIN', 'NAME'])
    .drop(columns=['name_counts'], errors='ignore')
)
df1.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [3]:
# Count the distinct values found in every column.
df1.nunique(axis=0)

Unnamed: 0,0
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2
ASK_AMT,8747
IS_SUCCESSFUL,2


In [4]:
# Inspect how often each APPLICATION_TYPE occurs to decide which ones to bin into "Other"
df1['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
T13,66


In [5]:
# Choose a cutoff value and create a list of application types to be replaced
# Cutoff: application types with fewer than 100 records are binned into "Other".
# The list is derived from the counts instead of being typed out by hand, and the counts
# come from df_raw (which is never binned) so re-running this cell yields the same list.
# use the variable name `application_types_to_replace`
application_type_counts = df_raw['APPLICATION_TYPE'].value_counts()
application_types_to_replace = application_type_counts[application_type_counts < 100].index.tolist()

# Replace in dataframe (one vectorized pass instead of one full-column replace per type)
df1['APPLICATION_TYPE'] = df1['APPLICATION_TYPE'].mask(
    df1['APPLICATION_TYPE'].isin(application_types_to_replace), "Other"
)

# Check to make sure replacement was successful
df1['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
Other,120


In [6]:
# Inspect how often each CLASSIFICATION occurs to decide which ones to bin into "Other"
df1['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
...,...
C4120,1
C8210,1
C2561,1
C4500,1


In [7]:
# Show only the CLASSIFICATION values that occur more than once
# Code assisted by chatGPT
df1['CLASSIFICATION'].value_counts().loc[lambda counts: counts > 1]


Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
C7000,777
C1700,287
C4000,194
C5000,116
C1270,114


In [8]:
# Store the list of classifications to be replaced: those with fewer than 300 records.
# Counts are taken from df_raw rather than df1. Once the rare classes in df1 have been
# binned into "Other", recounting df1 would give an empty list and silently disable the
# binning in every cell that reuses `replace_types`.
# (`app_type_totals` holds CLASSIFICATION counts; the name is kept as-is.)
app_type_totals = df_raw['CLASSIFICATION'].value_counts()
replace_types = app_type_totals.loc[app_type_totals < 300].index.tolist()
replace_types

['C1700',
 'C4000',
 'C5000',
 'C1270',
 'C2700',
 'C2800',
 'C7100',
 'C1300',
 'C1280',
 'C1230',
 'C1400',
 'C7200',
 'C2300',
 'C1240',
 'C8000',
 'C7120',
 'C1500',
 'C1800',
 'C6000',
 'C1250',
 'C8200',
 'C1238',
 'C1278',
 'C1235',
 'C1237',
 'C7210',
 'C2400',
 'C1720',
 'C4100',
 'C1257',
 'C1600',
 'C1260',
 'C2710',
 'C0',
 'C3200',
 'C1234',
 'C1246',
 'C1267',
 'C1256',
 'C2190',
 'C4200',
 'C2600',
 'C5200',
 'C1370',
 'C1248',
 'C6100',
 'C1820',
 'C1900',
 'C1236',
 'C3700',
 'C2570',
 'C1580',
 'C1245',
 'C2500',
 'C1570',
 'C1283',
 'C2380',
 'C1732',
 'C1728',
 'C2170',
 'C4120',
 'C8210',
 'C2561',
 'C4500',
 'C2150']

In [9]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
# The list is built in the previous cell as `replace_types` (classes with fewer than 300 records).
classifications_to_replace = replace_types

# Replace in dataframe: a single vectorized pass rather than one full-column replace per class
df1['CLASSIFICATION'] = df1['CLASSIFICATION'].mask(
    df1['CLASSIFICATION'].isin(classifications_to_replace), "Other"
)

# Check to make sure replacement was successful
df1['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
Other,1484
C7000,777


In [10]:
# One-hot encode the categorical columns with `pd.get_dummies`, emitting 0/1 integers
df2 = pd.get_dummies(data=df1, dtype=int)
df2.head(5)

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,108590,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1,5000,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,1,6692,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,1,142590,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [11]:
# Separate the target column from the feature columns
y = df2["IS_SUCCESSFUL"]
X = df2.drop("IS_SUCCESSFUL", axis=1)

# Hold out 25% of the rows for testing; stratify=y maintains the target class
# percentages in both the training and the testing split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
    stratify=y,
)
print(X_train.shape, X_test.shape)

(25724, 45) (8575, 45)


In [12]:
# Create a StandardScaler instance and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

## Compile, Train and Evaluate the Model

In [13]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# The input width is declared with an explicit Input object instead of `input_dim=` on the
# first Dense layer (Keras warns about passing `input_dim` to a layer in a Sequential model),
# and it is read from the scaled training matrix that the model is fitted on.
nn_mod1 = tf.keras.models.Sequential()
nn_mod1.add(tf.keras.Input(shape=(X_train_scaled.shape[1],)))

# First hidden layer
nn_mod1.add(tf.keras.layers.Dense(units=5, activation="relu"))

# Second hidden layer
nn_mod1.add(tf.keras.layers.Dense(units=3, activation="tanh"))

# Output layer: a single sigmoid unit for the binary IS_SUCCESSFUL target
nn_mod1.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_mod1.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy reported per epoch
nn_mod1.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])


In [15]:
# Fit the first model on the scaled training split for 10 epochs
fit_model = nn_mod1.fit(x=X_train_scaled, y=y_train, epochs=10)

Epoch 1/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 9ms/step - accuracy: 0.6072 - loss: 0.6549
Epoch 2/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - accuracy: 0.7083 - loss: 0.5788
Epoch 3/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7281 - loss: 0.5627
Epoch 4/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7311 - loss: 0.5564
Epoch 5/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7274 - loss: 0.5575
Epoch 6/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7288 - loss: 0.5534
Epoch 7/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7292 - loss: 0.5554
Epoch 8/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7307 - loss: 0.5524
Epoch 9/10
[1m804/804[0m [32m━━━━━━━

In [16]:
# Score the first model on the held-out test split and report loss and accuracy
model_loss, model_accuracy = nn_mod1.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - 2ms/step - accuracy: 0.7270 - loss: 0.5581
Loss: 0.55808025598526, Accuracy: 0.7269970774650574


In [17]:
# Improving the Model (Iteration 2)
# More Nodes, 1 More Layer
# The input width is declared with an explicit Input object instead of `input_dim=` on the
# first Dense layer (Keras warns about passing `input_dim` to a layer in a Sequential model),
# and it is read from the scaled training matrix that the model is fitted on.
nn_mod2 = tf.keras.models.Sequential()
nn_mod2.add(tf.keras.Input(shape=(X_train_scaled.shape[1],)))

# First hidden layer
nn_mod2.add(tf.keras.layers.Dense(units=9, activation="relu"))

# Second hidden layer
nn_mod2.add(tf.keras.layers.Dense(units=7, activation="relu"))

# Third hidden layer
nn_mod2.add(tf.keras.layers.Dense(units=5, activation="relu"))

# Output layer: a single sigmoid unit for the binary IS_SUCCESSFUL target
nn_mod2.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_mod2.summary()

In [18]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy reported per epoch
nn_mod2.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [19]:
# Fit the second model on the scaled training split for 10 epochs
fit_model = nn_mod2.fit(x=X_train_scaled, y=y_train, epochs=10)

Epoch 1/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.6162 - loss: 0.6538
Epoch 2/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7260 - loss: 0.5613
Epoch 3/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7316 - loss: 0.5514
Epoch 4/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7341 - loss: 0.5480
Epoch 5/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7274 - loss: 0.5535
Epoch 6/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7298 - loss: 0.5537
Epoch 7/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7335 - loss: 0.5504
Epoch 8/10
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7326 - loss: 0.5459
Epoch 9/10
[1m804/804[0m [32m━━━━━━━━

In [20]:
# Score the second model on the held-out test split and report loss and accuracy
model_loss, model_accuracy = nn_mod2.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - 2ms/step - accuracy: 0.7284 - loss: 0.5540
Loss: 0.5540499687194824, Accuracy: 0.728396475315094


In [21]:
# Iteration 3: Trying Sigmoid activation, more nodes
# The input width is declared with an explicit Input object instead of `input_dim=` on the
# first Dense layer (Keras warns about passing `input_dim` to a layer in a Sequential model),
# and it is read from the scaled training matrix that the model is fitted on.
nn_mod3 = tf.keras.models.Sequential()
nn_mod3.add(tf.keras.Input(shape=(X_train_scaled.shape[1],)))

# First hidden layer
nn_mod3.add(tf.keras.layers.Dense(units=50, activation="sigmoid"))

# Second hidden layer
nn_mod3.add(tf.keras.layers.Dense(units=35, activation="sigmoid"))

# Third hidden layer
nn_mod3.add(tf.keras.layers.Dense(units=15, activation="sigmoid"))

# Output layer: a single sigmoid unit for the binary IS_SUCCESSFUL target
nn_mod3.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_mod3.summary()

In [22]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy reported per epoch
nn_mod3.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [23]:
# Fit the third model on the scaled training split for 100 epochs
fit_model = nn_mod3.fit(x=X_train_scaled, y=y_train, epochs=100)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6413 - loss: 0.6365
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7263 - loss: 0.5771
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7303 - loss: 0.5699
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7316 - loss: 0.5678
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7314 - loss: 0.5622
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7353 - loss: 0.5539
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7299 - loss: 0.5574
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7325 - loss: 0.5519
Epoch 9/100
[1m804/804[0m [32

In [24]:
# Score the third model on the held-out test split and report loss and accuracy
model_loss, model_accuracy = nn_mod3.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - 2ms/step - accuracy: 0.7287 - loss: 0.5569
Loss: 0.5568776726722717, Accuracy: 0.7287463545799255


In [25]:
# Improvements are only marginal
# Improve data engineering

In [26]:
# How often does each EIN occur in the raw data?
df_raw['EIN'].value_counts()

Unnamed: 0_level_0,count
EIN,Unnamed: 1_level_1
10520599,1
626274659,1
630475330,1
630416100,1
630357662,1
...,...
383880377,1
383876652,1
383874900,1
383871370,1


In [27]:
# How often does each organization NAME occur in the raw data?
df_raw['NAME'].value_counts()

Unnamed: 0_level_0,count
NAME,Unnamed: 1_level_1
PARENT BOOSTER USA INC,1260
TOPS CLUB INC,765
UNITED STATES BOWLING CONGRESS INC,700
WASHINGTON STATE UNIVERSITY,492
AMATEUR ATHLETIC UNION OF THE UNITED STATES INC,408
...,...
ST LOUIS SLAM WOMENS FOOTBALL,1
AIESEC ALUMNI IBEROAMERICA CORP,1
WEALLBLEEDRED ORG INC,1
AMERICAN SOCIETY FOR STANDARDS IN MEDIUMSHIP & PSYCHICAL INVESTIGATI,1


In [28]:
# Best guess: some applicants are chapters or franchises of a larger org, and therefore have additional support


In [29]:
# Engineered feature: for every row, the number of rows that share its NAME.
# Note that the column is added to df_raw itself.
df_raw['name_counts'] = (
    df_raw
    .groupby(by='NAME')['NAME']
    .transform('count')
)
df_raw.head(n=20)

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,name_counts
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1,8
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0,1
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1,1
5,10556855,MINORITY ORGAN & TISSUE TRANSPLANT & EDUCATION...,T3,Independent,C1200,Preservation,Trust,1,0,N,5000,1,1
6,10558440,FRIENDS OF ARTS COUNCIL OF GREATER DENHAM SPRI...,T3,Independent,C1000,Preservation,Trust,1,100000-499999,N,31452,1,1
7,10566033,ISRAEL EMERGENCY ALLIANCE,T3,Independent,C2000,Preservation,Trust,1,10M-50M,N,7508025,1,1
8,10570430,ARAMCO BRATS INC,T7,Independent,C1000,ProductDev,Trust,1,1-9999,N,94389,1,1
9,10571689,INTERNATIONAL ASSOCIATION OF FIRE FIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0,91


In [30]:
# Perform similar data cleaning on raw file, this time keeping the engineered `name_counts` column.
# Fail fast if `name_counts` has not been added to df_raw yet; without it this cell would
# silently rebuild the same feature set the earlier models were trained on.
assert 'name_counts' in df_raw.columns, "add the 'name_counts' column to df_raw before running this cell"

df_new = df_raw.drop(columns=['EIN','NAME'])

# Bin the rare categories into "Other" with one vectorized pass per column
df_new['APPLICATION_TYPE'] = df_new['APPLICATION_TYPE'].mask(
    df_new['APPLICATION_TYPE'].isin(application_types_to_replace), "Other"
)
df_new['CLASSIFICATION'] = df_new['CLASSIFICATION'].mask(
    df_new['CLASSIFICATION'].isin(classifications_to_replace), "Other"
)

# One-hot encode the categorical columns
df_new = pd.get_dummies(df_new,dtype=int)

# Split our preprocessed data into our features and target arrays
X = df_new.drop(columns = ["IS_SUCCESSFUL"])
y = df_new.IS_SUCCESSFUL

# Split the preprocessed data into a training and testing dataset
# stratify=y maintains the target class percentages in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)
print(X_train.shape, X_test.shape)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler on the training features only
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

(25724, 46) (8575, 46)


In [32]:
# Iteration 3 with new data
# The input width is declared with an explicit Input object instead of `input_dim=` on the
# first Dense layer (Keras warns about passing `input_dim` to a layer in a Sequential model),
# and it is read from the scaled training matrix that the model is fitted on.
nn_mod3 = tf.keras.models.Sequential()
nn_mod3.add(tf.keras.Input(shape=(X_train_scaled.shape[1],)))

# First hidden layer
nn_mod3.add(tf.keras.layers.Dense(units=50, activation="sigmoid"))

# Second hidden layer
nn_mod3.add(tf.keras.layers.Dense(units=35, activation="sigmoid"))

# Third hidden layer
nn_mod3.add(tf.keras.layers.Dense(units=15, activation="sigmoid"))

# Output layer: a single sigmoid unit for the binary IS_SUCCESSFUL target
nn_mod3.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_mod3.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [33]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy reported per epoch
nn_mod3.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [34]:
# Fit the rebuilt third model on the scaled training split for 100 epochs
fit_model = nn_mod3.fit(x=X_train_scaled, y=y_train, epochs=100)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.6419 - loss: 0.6305
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7304 - loss: 0.5491
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7331 - loss: 0.5438
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7294 - loss: 0.5397
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7310 - loss: 0.5344
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7318 - loss: 0.5325
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7377 - loss: 0.5249
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7341 - loss: 0.5246
Epoch 9/100
[1m804/804[0m [32

In [36]:
# Score the rebuilt third model on the held-out test split and report loss and accuracy
model_loss, model_accuracy = nn_mod3.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - 2ms/step - accuracy: 0.7524 - loss: 0.5050
Loss: 0.5049883723258972, Accuracy: 0.7524198293685913


In [42]:
# Export our model to an HDF5 file; the path is relative, so it is written to the current working directory
nn_mod3.save(filepath="AlphabetSoupCharity.h5")



In [40]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): no visible cell reads from or writes to /content/drive (the model is saved
# to a relative path), and this cell's execution count (40) precedes the save cell's (42)
# although it appears after it. Confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Analysis of Neural Network Experiment

The purpose of this analysis is to use machine learning, specifically deep learning, to create a prediction algorithm for the nonprofit Alphabet Soup. The target of this prediction is whether or not a venture will be successful.

## Overview

The data provided included over 30,000 rows with the features of EIN, NAME, APPLICATION_TYPE, AFFILIATION, CLASSIFICATION, USE_CASE, ORGANIZATION, STATUS, INCOME_AMT, SPECIAL_CONSIDERATIONS, ASK_AMT, and then the success or failure of the venture. While EIN and NAME were initially assumed to just be metadata, recognizing that there was valuable information in the name (as many were repeated and are likely chapters of a larger franchise) improved the model. The neural network included 3 layers and dozens of nodes, the development of which is explored further in the results section.

## Results

To preprocess the data for the ultimate final model, the following was performed:


*   A new column "name_counts" was created to reflect how many times a name recurred
*   The "IS_SUCCESSFUL" column served as the target of the dataset, with a binary output.
*   The features of the dataset were APPLICATION_TYPE, AFFILIATION, CLASSIFICATION, USE_CASE, ORGANIZATION, STATUS, INCOME_AMT, SPECIAL_CONSIDERATIONS, ASK_AMT, name_counts. A standard scaler was applied.
*   With the inclusion of the "name_counts" column, "EIN" and "NAME" could be treated as metadata and dropped from the feature array.

The following considerations were made in compiling, training, and evaluating the model:

*   The final version used three layers with sigmoid activations and 50, 35, and 15 neurons respectively. This was done after seeing only marginal improvement in changing the number of neurons in early versions of the model by single digits. A general "funnel shape" that reduced the number of neurons for each layer was chosen.
*   Adding additional layers and neurons alone was not able to achieve the desired target performance; this came about through better data engineering and deriving value from the column that had previously been dropped.
*   Multiple changes were made between models: in the number of layers, the number of neurons, and the type of activation. Ultimately, data engineering, not model engineering, proved to be the real solution.

## Summary
The model ultimately achieved an accuracy of 75.2%, which is still too low to recommend for productionalization. However, this did prove to be a useful exercise in neural network configuration. A more realistic solution to the problem would have been classical supervised learning, as this is simply a classification problem. This would provide less complexity and be less computationally intensive. Next steps would be to recreate this experiment for different classical learning models, including a logistic regression, K nearest neighbors, a random forest, and a boosted classifier like AdaBoost or gradient boosting.

