<a href="https://colab.research.google.com/github/kilgorjn/CS_5300_AI/blob/main/CS_5300_Abalone_Phase_III.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Phase III 

In [4]:
import urllib.request
import os
import pandas as pd
from sklearn import preprocessing
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import timeit


if 'abalone.data' not in os.listdir():
  !wget https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data
if 'abalone.names' not in os.listdir():
  !wget https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.names


feature_names = ['Sex','Length','Diameter','Height','Whole Weight','Shucked Weight','Viscera Weight','Shell Weight']
target_name = 'Rings'
names = feature_names + [target_name]
df = pd.read_csv('abalone.data', names=names) 

#convert the 'Sex' feature from nomimal to numerical 
df['Sex'] = df['Sex'].astype('category')
df['Sex'] = df['Sex'].cat.codes

#Convert the 'Rings' column to a binary 'Age', where 0 means 'young', and 1 means 'old'
df['Age'] = pd.DataFrame(np.where(df[target_name]<=9,0,1),columns=['Age'])
print(df.head())

#drop the 'Rings' feature, it is no longer needed
df.drop(labels='Rings', inplace=True, axis=1)
df.describe()  #un-normalized data


for column in ['Length','Diameter','Height','Whole Weight','Shucked Weight','Viscera Weight','Shell Weight']:
  df[column] = (df[column]-df[column].min())/(df[column].max()-df[column].min())
df.describe()  #normalized data (all columns except Sex and Age)


   Sex  Length  Diameter  Height  ...  Viscera Weight  Shell Weight  Rings  Age
0    2   0.455     0.365   0.095  ...          0.1010         0.150     15    1
1    2   0.350     0.265   0.090  ...          0.0485         0.070      7    0
2    0   0.530     0.420   0.135  ...          0.1415         0.210      9    0
3    2   0.440     0.365   0.125  ...          0.1140         0.155     10    1
4    1   0.330     0.255   0.080  ...          0.0395         0.055      7    0

[5 rows x 10 columns]


Unnamed: 0,Sex,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,1.052909,0.606746,0.593078,0.123466,0.292808,0.241,0.237121,0.236503,0.498204
std,0.82224,0.162288,0.16679,0.037015,0.173681,0.149269,0.144324,0.138717,0.500057
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.506757,0.495798,0.10177,0.155658,0.124412,0.122449,0.128052,0.0
50%,1.0,0.635135,0.621849,0.123894,0.282451,0.225286,0.22449,0.231689,0.0
75%,2.0,0.72973,0.714286,0.146018,0.40765,0.33692,0.332456,0.326358,1.0
max,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
#normalize the data in-place
for column in ['Length','Diameter','Height','Whole Weight','Shucked Weight','Viscera Weight','Shell Weight']:
  df[column] = (df[column]-df[column].min())/(df[column].max()-df[column].min())
df.describe()  #normalized data (all columns except Sex and Age)


Unnamed: 0,Sex,Length,Diameter,Height,Whole Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,1.052909,0.606746,0.593078,0.123466,0.292808,0.241,0.237121,0.236503,0.498204
std,0.82224,0.162288,0.16679,0.037015,0.173681,0.149269,0.144324,0.138717,0.500057
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.506757,0.495798,0.10177,0.155658,0.124412,0.122449,0.128052,0.0
50%,1.0,0.635135,0.621849,0.123894,0.282451,0.225286,0.22449,0.231689,0.0
75%,2.0,0.72973,0.714286,0.146018,0.40765,0.33692,0.332456,0.326358,1.0
max,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Split the data into training, validation,  and test sets

In [6]:
np.random.seed(0)  #set a seed to reproduce train and test sets over multiple runs
# msk = np.random.rand(len(df)) < 0.8

print('before shuffle')
print(df.head())
df = df.sample(frac=1, axis=0).reset_index(drop=True)
print('after shuffle')
print(df.head())


train_size = round(df.shape[0]*.7)
val_size = round(df.shape[0]*.2)
test_size = round(df.shape[0]*.1)

print(f'train_size = {train_size}:  val_size = {val_size}  test_size={test_size}')

df_x_train = df[0:train_size]
df_y_train = df_x_train['Age']
df_x_train = df_x_train.drop('Age', axis=1)

df_x_val = df[train_size:train_size+val_size]
df_y_val = df_x_val['Age']
df_x_val = df_x_val.drop('Age', axis=1)


df_x_test = df[train_size+val_size:train_size+val_size+test_size]
df_y_test = df_x_test['Age']
df_x_test = df_x_test.drop('Age', axis=1)


print(f'df shape: {df.shape}')
print(f'training shape(X): {df_x_train.shape}')
print(f'training shape(Y): {df_y_train.shape}')

print(f'validation shape(X): {df_x_val.shape}')
print(f'validation shape(Y): {df_y_val.shape}')


print(f'testing shape(X): {df_x_test.shape}')
print(f'testing shape(Y): {df_y_test.shape}')

print(df_x_train.head())
print(df_x_val.head())
print(df_x_test.head())


before shuffle
   Sex    Length  Diameter  ...  Viscera Weight  Shell Weight  Age
0    2  0.513514  0.521008  ...        0.132324      0.147982    1
1    2  0.371622  0.352941  ...        0.063199      0.068261    0
2    0  0.614865  0.613445  ...        0.185648      0.207773    0
3    2  0.493243  0.521008  ...        0.149440      0.152965    1
4    1  0.344595  0.336134  ...        0.051350      0.053313    0

[5 rows x 9 columns]
after shuffle
   Sex    Length  Diameter  ...  Viscera Weight  Shell Weight  Age
0    2  0.641892  0.621849  ...        0.319289      0.332337    1
1    1  0.574324  0.579832  ...        0.187623      0.191330    0
2    2  0.736486  0.714286  ...        0.491771      0.314898    1
3    1  0.195946  0.184874  ...        0.015142      0.018435    0
4    2  0.770270  0.747899  ...        0.493746      0.374689    1

[5 rows x 9 columns]
train_size = 2924:  val_size = 835  test_size=418
df shape: (4177, 9)
training shape(X): (2924, 8)
training shape(Y): (2924

#  Create Utility Functions
This cell just has utility functions for use evaluating different architectures

In [7]:




METRICS = [
      # keras.metrics.TruePositives(name='tp'),
      # keras.metrics.FalsePositives(name='fp'),
      # keras.metrics.TrueNegatives(name='tn'),
      # keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      # # keras.metrics.Precision(name='precision'),
      # keras.metrics.Recall(name='recall'),
      # keras.metrics.AUC(name='auc'),
]

###########################################################
# Build a model based on the provided architecture
###########################################################
def build_model(architecture:[], hidden_activation='relu', output_activation='sigmoid'):
  model = Sequential()
  model.add(Input(shape=df_x_train.shape[1]))
  # the architecture is a list of node_counts.  
  # Create a hidden layer w/ the specified 
  # number of nodes for each layer in the architecture.
  for nodes in architecture:
    model.add(Dense(nodes, activation=hidden_activation))  
  model.add(Dense(1, activation=output_activation))

  model.compile(
      optimizer=keras.optimizers.RMSprop(),
      loss = keras.losses.BinaryCrossentropy(),
      metrics = METRICS
      )
  return model





#########################################
# Function to train a model and return the training history.
# batch_size and epochs have defaults, but can be overridden
#########################################
def train_model(model, train_x, train_y, val_x, val_y, identifier, batch_size=64, epochs=256, use_early_stop=False):
  # print('--------Training model: ')
  # print(model.summary())
  print(f'training model {identifier}')

  callback_earlystop = EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=1)
  callbacks = []
  if use_early_stop:
    callbacks.append(callback_earlystop)

  return model.fit(x=train_x, 
                    y=train_y, 
                    batch_size=batch_size, 
                    epochs=epochs, 
                    validation_data=(val_x, val_y),
                    verbose=0,
                    callbacks=callbacks
                    )

def evaluate_model(model:tf.keras.Model, x_data, y_data):
  return model.evaluate(x_data, y_data, batch_size=128)
  


# Phase III



## Recursive Feature Elimination

In [None]:
def recursive_feature_elimination():
  print(feature_names)
  for feature in feature_names:
    print(feature)
    val_accuracies = {}
    accuracies = {}
    # create new train and validation dataframes with only a single feature
    df_rde_x_train = df_x_train[feature].to_numpy()
    df_rde_x_val = df_x_val[feature].to_numpy()

    print(df_rde_x_train.shape)
  
    model = Sequential()
    model.add(Input(shape=1))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
      optimizer=keras.optimizers.RMSprop(),
      loss = keras.losses.BinaryCrossentropy(),
      metrics = METRICS)    

    history = model.fit(x=df_rde_x_train, 
      y=df_y_train, 
      batch_size=64, 
      epochs=64, 
      validation_data=(df_rde_x_val, df_y_val),
      verbose=0,
      callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=1)])
    val_accuracies[feature]=history.history['val_accuracy'][-1]
    accuracies[feature]=history.history['accuracy'][-1]
    print(feature,',',history.history['val_accuracy'][-1],',',history.history['accuracy'][-1])
  return (val_accuracies, accuracies)



(val_accuracies, accuracies) = recursive_feature_elimination()
print(val_accuracies)
print(accuracies)

['Sex', 'Length', 'Diameter', 'Height', 'Whole Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']
Sex
(2924,)
Epoch 00030: early stopping
Sex , 0.6898203492164612 , 0.7058823704719543
Length
(2924,)
Epoch 00037: early stopping
Length , 0.6970059871673584 , 0.7250341773033142
Diameter
(2924,)
Epoch 00055: early stopping
Diameter , 0.7101796269416809 , 0.7308481335639954
Height
(2924,)
Epoch 00020: early stopping
Height , 0.7209580540657043 , 0.7482900023460388
Whole Weight
(2924,)
Epoch 00032: early stopping
Whole Weight , 0.7101796269416809 , 0.7322161197662354
Shucked Weight
(2924,)
Shucked Weight , 0.682634711265564 , 0.6925444602966309
Viscera Weight
(2924,)
Epoch 00031: early stopping
Viscera Weight , 0.71257483959198 , 0.7393980622291565
Shell Weight
(2924,)
Epoch 00046: early stopping
Shell Weight , 0.7341317534446716 , 0.7609438896179199
{'Shell Weight': 0.7341317534446716}
{'Shell Weight': 0.7609438896179199}


From the data above, the features have been sorted by validation accuracy (small to large).  The sorted features are: 
'Shucked Weight' ,'Sex' ,'Length' ,'Viscera Weight' ,'Whole Weight' ,'Diameter' ,'Height' ,'Shell Weight'

# Iteratively remove 1 feature at a time

In [None]:



features_sorted_small_to_large = ['Shucked Weight' ,'Sex' ,'Length' ,'Viscera Weight' ,'Whole Weight' ,'Diameter' ,'Height' ,'Shell Weight']

print(df_x_train.head())

df_rm_x_train = df_x_train.copy()
df_rm_x_train = df_rm_x_train[features_sorted_small_to_large]

df_rm_x_val = df_x_val.copy()
df_rm_x_val = df_rm_x_val[features_sorted_small_to_large]

# 
print(df_rm_x_train.head())

feature_removed = None
for feature in features_sorted_small_to_large:

  
  model = Sequential()
  model.add(Input(shape=df_rm_x_train.shape[1]))
  model.add(Dense(50, activation='relu'))
  model.add(Dense(50, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))

  model.compile(
    optimizer=keras.optimizers.RMSprop(),
    loss = keras.losses.BinaryCrossentropy(),
    metrics = METRICS)    

  history = model.fit(x=df_rm_x_train, 
    y=df_y_train, 
    batch_size=64, 
    epochs=64, 
    validation_data=(df_rm_x_val, df_y_val),
    verbose=0,
    callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=1)])

  print(df_rm_x_train.columns.values)
  print(f'After removing {feature_removed}',',',history.history['val_accuracy'][-1],',',history.history['accuracy'][-1])

  df_rm_x_train.drop([feature], axis=1, inplace=True)
  df_rm_x_val.drop([feature], axis=1, inplace=True)
  feature_removed = feature

# print(df_rm_x_train.head())
# print(df_x_val.head())






   Sex    Length  Diameter  ...  Shucked Weight  Viscera Weight  Shell Weight
0    2  0.641892  0.621849  ...        0.185945        0.319289      0.332337
1    1  0.574324  0.579832  ...        0.174849        0.187623      0.191330
2    2  0.736486  0.714286  ...        0.353732        0.491771      0.314898
3    1  0.195946  0.184874  ...        0.013786        0.015142      0.018435
4    2  0.770270  0.747899  ...        0.452253        0.493746      0.374689

[5 rows x 8 columns]
   Shucked Weight  Sex    Length  ...  Diameter    Height  Shell Weight
0        0.185945    2  0.641892  ...  0.621849  0.137168      0.332337
1        0.174849    1  0.574324  ...  0.579832  0.106195      0.191330
2        0.353732    2  0.736486  ...  0.714286  0.137168      0.314898
3        0.013786    1  0.195946  ...  0.184874  0.048673      0.018435
4        0.452253    2  0.770270  ...  0.747899  0.154867      0.374689

[5 rows x 8 columns]
Epoch 00060: early stopping
['Shucked Weight' 'Sex' 'Len

# Remove the 'Length' feature and compare

In [11]:
#Remove just the 'length' feature and compare the all features.

features_sorted_small_to_large = ['Shucked Weight' ,'Sex' ,'Length' ,'Viscera Weight' ,'Whole Weight' ,'Diameter' ,'Height' ,'Shell Weight']
features_to_remove = ['Length']

print(df_x_train.head())

df_rm_x_train = df_x_train.copy()
df_rm_x_train = df_rm_x_train[features_sorted_small_to_large]

df_rm_x_val = df_x_val.copy()
df_rm_x_val = df_rm_x_val[features_sorted_small_to_large]

# 
print(df_rm_x_train.head())

model = Sequential()
model.add(Input(shape=df_rm_x_train.shape[1]))
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(
  optimizer=keras.optimizers.RMSprop(),
  loss = keras.losses.BinaryCrossentropy(),
  metrics = METRICS)    

history = model.fit(x=df_rm_x_train, 
  y=df_y_train, 
  batch_size=64, 
  epochs=64, 
  validation_data=(df_rm_x_val, df_y_val),
  verbose=1,
  callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=1)])



#drop the Length feature and retrain


df_rm_x_train.drop(['Length'], axis=1, inplace=True)
df_rm_x_val.drop(['Length'], axis=1, inplace=True)


model = Sequential()
model.add(Input(shape=df_rm_x_train.shape[1]))
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(
  optimizer=keras.optimizers.RMSprop(),
  loss = keras.losses.BinaryCrossentropy(),
  metrics = METRICS)    

history = model.fit(x=df_rm_x_train, 
  y=df_y_train, 
  batch_size=64, 
  epochs=64, 
  validation_data=(df_rm_x_val, df_y_val),
  verbose=1,
  callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=1)])







   Sex    Length  Diameter  ...  Shucked Weight  Viscera Weight  Shell Weight
0    2  0.641892  0.621849  ...        0.185945        0.319289      0.332337
1    1  0.574324  0.579832  ...        0.174849        0.187623      0.191330
2    2  0.736486  0.714286  ...        0.353732        0.491771      0.314898
3    1  0.195946  0.184874  ...        0.013786        0.015142      0.018435
4    2  0.770270  0.747899  ...        0.452253        0.493746      0.374689

[5 rows x 8 columns]
   Shucked Weight  Sex    Length  ...  Diameter    Height  Shell Weight
0        0.185945    2  0.641892  ...  0.621849  0.137168      0.332337
1        0.174849    1  0.574324  ...  0.579832  0.106195      0.191330
2        0.353732    2  0.736486  ...  0.714286  0.137168      0.314898
3        0.013786    1  0.195946  ...  0.184874  0.048673      0.018435
4        0.452253    2  0.770270  ...  0.747899  0.154867      0.374689

[5 rows x 8 columns]
Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
E