## Preprocessing

In [7]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [8]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
new_df = application_df.drop(['EIN','NAME'], axis=1)
new_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [9]:
# Determine the number of unique values in each column.
sorted = new_df.nunique().sort_values(ascending=False)
sorted

Unnamed: 0,0
ASK_AMT,8747
CLASSIFICATION,71
APPLICATION_TYPE,17
INCOME_AMT,9
AFFILIATION,6
USE_CASE,5
ORGANIZATION,4
STATUS,2
SPECIAL_CONSIDERATIONS,2
IS_SUCCESSFUL,2


In [10]:
# Look at APPLICATION_TYPE value counts to identify and replace with "Other"
app_type_counts = new_df['APPLICATION_TYPE'].value_counts()
app_type_counts

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
T13,66


In [11]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`

application_types_to_replace = []
for app_type, count in app_type_counts.items():
    if count <= 500:
        application_types_to_replace.append(app_type)
# Replace in dataframe
for app in application_types_to_replace:
    new_df['APPLICATION_TYPE'] = new_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure replacement was successful
new_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
Other,276


In [12]:
# Look at CLASSIFICATION value counts to identify and replace with "Other"
classification_type_counts = new_df['CLASSIFICATION'].value_counts()
classification_type_counts

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
...,...
C4120,1
C8210,1
C2561,1
C4500,1


In [13]:
# You may find it helpful to look at CLASSIFICATION value counts >1

new_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
...,...
C4120,1
C8210,1
C2561,1
C4500,1


In [14]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
classification_types_to_replace = []
for class_type, count in classification_type_counts.items():
    if count < 1000:
        classification_types_to_replace.append(class_type)

# Replace in dataframe
for classification in classification_types_to_replace:
    new_df['CLASSIFICATION'] = new_df['CLASSIFICATION'].replace(classification,"Other")

# Check to make sure replacement was successful
new_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,2261
C3000,1918
C2100,1883


In [15]:
# Convert categorical data to numeric with `pd.get_dummies
new_df_cat = new_df.dtypes[new_df.dtypes == "object"].index.tolist()

encoded_df = pd.get_dummies(new_df, columns=['APPLICATION_TYPE',
 'AFFILIATION',
 'CLASSIFICATION',
 'USE_CASE',
 'ORGANIZATION',
 'INCOME_AMT',
 'SPECIAL_CONSIDERATIONS'],dtype=int)


In [16]:
# Split our preprocessed data into our features and target arrays
y = encoded_df["IS_SUCCESSFUL"].values
X = encoded_df.drop(["IS_SUCCESSFUL"], axis=1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [17]:
encoded_df.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,108590,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1,5000,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,1,6692,1,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,1,142590,1,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [18]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [19]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
print(len(X_train[0]))

43


In [21]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [22]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6903 - loss: 0.6297
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7281 - loss: 0.5623
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7266 - loss: 0.5597
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7258 - loss: 0.5587
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7294 - loss: 0.5525
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7324 - loss: 0.5518
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7257 - loss: 0.5571
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7305 - loss: 0.5525
Epoch 9/100
[1m804/804[0m [32

In [23]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 1s - 3ms/step - accuracy: 0.7240 - loss: 0.5573
Loss: 0.5573158860206604, Accuracy: 0.7239649891853333


In [24]:
# Export our model to HDF5 file
nn.save("AlphabetSoupCharity.h5")



## **Report**

The purpose of the analysis is to provide a tool that harnesses the power of machine learning in evaluating applicants for funding in response to a request from a nonprofit foundation.

**Data Processing**:
- Target: The "IS_SUCCESSFUL" columnn is selected for the target variable as it indicates whether the organization was found to be eligible for funding.
- Features : In the first model  "APPLICATION_TYPE", "AFFILIATION", "CLASSIFICATION", "USE_CASE", "ORGANIZATION", "STATUS", "INCOME_AMT", 'SPECIAL_CONSIDERATIONS', "ASK_AMT" columns were selected as features.
- Columns to be removed: The "EIN"
 and the "NAME"columns, containing identifiers and organization names for each row of data, do not constitute either targets or features and should be excluded from the dataframe.

 **Compiling, Training, and Evaluating the Model**
**Number of Neurons, Hidden Layers and Activation Functions Used in the Model**
 - Initial configuration before optimization:


number of input features = 43
hidden nodes layer1 =  8
hidden nodes layer2 = 5
hidden layer 1 activation function= relu
hidden layer 2 activation function= relu
output layer activation function= sigmoid
number of epochs = 100

Loss: 0.5573158860206604, Accuracy: 0.7239649891853333

- Optimization

 Before Optimization I removed the "Status" and "Special Considerations" columns from the dataframe to reduce the number of features. This adjustment resulted in 40 features.

 The configurations used in each attempt are given below:

**1st attempt**:

number of input features = 40
hidden nodes layer1 =  1
hidden nodes layer2 = 1
hidden layer 1 activation function= relu
output layer activation function= sigmoid
number of epochs = 50
accuracy: 0.7130 - loss: 0.5896

**2nd attempt**:

number of input features = 40
hidden nodes layer1 =  1
hidden nodes layer2 = 1
hidden layer 1 activation function= relu
**hidden layer 2 activation function= relu**
output layer activation function= sigmoid
number of epochs = 50

accuracy: accuracy: 0.6928 - loss: 0.6032

The accuracy dropped when I added the second layer with one neuron.

**3rd attempt**:

number of input features = 40
**hidden nodes layer1 =  3**
**hidden nodes layer2 = 6**
hidden layer 1 activation function= relu
hidden layer 2 activation function= relu
output layer activation function= sigmoid
number of epochs = 50

accuracy: 0.7210 - loss: 0.5615

Adding neurons to each hidden layer improved accuracy.

**4th attempt**:

number of input features = 40
hidden nodes layer1 =  3
hidden nodes layer2 = 6
hidden layer 1 activation function= relu
hidden layer 2 activation function= relu
output layer activation function= sigmoid

**number of epochs = 100**

accuracy: 0.7265 - loss: 0.5537

**Conclusion**

While the none of the combinations rendered an accuracy of higher than 75%, the following factors were found to affect accuracy: the number of neurons in each hidden layer and the number epochs. The highest accuracy was achieved in the fourth attempt with an increased number of neurons as well as a higher number of epochs. 

Due to the low level of accuracy achieved despite numerous attempts a different machine learning model such as supervised learning might be a better fit for this problem. While it is impossible to guarantee higher accuracy before testing alternative models on the dataset, the random forest model might result in improved accuracy for this problem. 


















