<br>

####  Modeling with both proximity and conditionals:  Neural Nets!

In [1]:
#
#   We demonstrate _both_ classification + regression for bitwise functions:

#     + function #1:  MAJ, the "majority" function
#                     three bits input, the most-appearing bit is the output 

#     + function #2:  XOR, the "xor" or "odd # of 1's" function 
#                     three bits input, output is their sum%2 
#                     that is, 1 if there is an odd # of 1's, 0 if an even # of 1's
#   
#   From here, we'll use NNets for the births and iris datasets
#
#

In [2]:
# libraries!
import numpy as np      # numpy is Python's "array" library
import pandas as pd     # Pandas is Python's "data" library ("dataframe" == spreadsheet)

In [22]:
# let's read in our data...
# 
# for read_csv, use header=0 when row 0 is a header row
# 
filename = 'xor_cleaned.csv'           # the XOR ("odd # of 1's") truth table
# filename = 'maj_cleaned.csv'         # swap in this line to model MAJ instead
df = pd.read_csv(filename, header=0)   # header=0: row 0 holds the column names; pass encoding="latin1" (or similar) if the file needs it
print(f"{filename} : file read into a pandas dataframe.")

xor_cleaned.csv : file read into a pandas dataframe.


In [23]:
#
# let's look at our pandas dataframe  
#
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   bit1       8 non-null      int64
 1   bit2       8 non-null      int64
 2   bit3       8 non-null      int64
 3   outputbit  8 non-null      int64
dtypes: int64(4)
memory usage: 384.0 bytes


In [25]:
#
# let's keep our column names in variables, for reference
#
COLUMNS = df.columns   # a pandas Index: use it just like a Python list of strings
print(f"COLUMNS is {COLUMNS}\n")
print(f"COLUMNS[0] is {COLUMNS[0]}\n")

# a dictionary to look up any column's position by its name,
# e.g. COL_INDEX['bit1'] is 0
COL_INDEX = {col_name: position for position, col_name in enumerate(COLUMNS)}
print(f"COL_INDEX is {COL_INDEX}")


COLUMNS is Index(['bit1', 'bit2', 'bit3', 'outputbit'], dtype='object')

COLUMNS[0] is bit1

COL_INDEX is {'bit1': 0, 'bit2': 1, 'bit3': 2, 'outputbit': 3}


In [26]:
# all of scikit-learn's ML routines need numbers, not strings
#   ... even for categories/classifications (like species!)
#   so, we will convert the category names ('zero' and 'one') to numbers:

SPECIES = ['zero','one']   # int to str: SPECIES[0] is 'zero', SPECIES[1] is 'one'
SPECIES_INDEX = {label: idx for idx, label in enumerate(SPECIES)}  # str to int

def convert_species(speciesname):
    """ look up the integer category for a category name ('zero' or 'one');
        an unknown name raises a KeyError
    """
    index = SPECIES_INDEX[speciesname]
    return index

# Let's try it out on every known category...
for name in SPECIES:
    print(f"{name} maps to {convert_species(name)}")

zero maps to 0
one maps to 1


In [27]:
#
# let's convert our dataframe to a numpy array, named A
#    Our ML library, scikit-learn operates entirely on numpy arrays.
#
# dataframe -> numpy array -> floating-point values, in one chained step
A = df.to_numpy().astype('float64')
print(A)

[[0. 0. 0. 0.]
 [0. 0. 1. 1.]
 [0. 1. 0. 1.]
 [0. 1. 1. 0.]
 [1. 0. 0. 1.]
 [1. 0. 1. 0.]
 [1. 1. 0. 0.]
 [1. 1. 1. 1.]]


In [28]:
#
# Let's make sure things are floats! (Important for NNets!)
# 

# A was already converted to float64 when it was created from the dataframe,
# so this repeat does not change any values; it is kept as an explicit safeguard.
A = A.astype('float64')   # and make things floating-point
print(A)

[[0. 0. 0. 0.]
 [0. 0. 1. 1.]
 [0. 1. 0. 1.]
 [0. 1. 1. 0.]
 [1. 0. 0. 1.]
 [1. 0. 1. 0.]
 [1. 1. 0. 0.]
 [1. 1. 1. 1.]]


In [29]:
#
# Let's split into features and labels (species/categories):
#

X_all = A[:,0:3].copy()   # features: columns 0-2 (bit1, bit2, bit3). We make a copy so we don't change A
y_all = A[:,3].copy()     # labels: column 3 (outputbit), also copied

def ascii_table(X,y):
    """ print one line per example: its input row, a '?' in the
        prediction column (no model yet), and the desired output y[i]
    """
    header = f"{'input ':>18s} -> {'pred':<5s} {'des.':<5s}"
    print(header)
    for i in range(len(y)):
        features, desired = X[i,:], y[i]
        # !s formats the row with str() before right-aligning it
        print(f"{features!s:>18s} -> {'?':<5s} {desired:<5.0f}")
        
ascii_table(X_all,y_all)

            input  -> pred  des. 
        [0. 0. 0.] -> ?     0    
        [0. 0. 1.] -> ?     1    
        [0. 1. 0.] -> ?     1    
        [0. 1. 1.] -> ?     0    
        [1. 0. 0.] -> ?     1    
        [1. 0. 1.] -> ?     0    
        [1. 1. 0.] -> ?     0    
        [1. 1. 1.] -> ?     1    


In [32]:
#
# we can scramble the remaining data if we want to...
#    we don't want to, at the moment...
#

SCRAMBLE = False   # easy to change...

if SCRAMBLE:
    # the number of rows comes from the labels array, y_all
    NUM_ROWS = len(y_all)
    indices = np.random.permutation(NUM_ROWS)  # this scrambles the data each time
    X_all = X_all[indices]   # apply the same ordering to the features ...
    y_all = y_all[indices]   # ... and to the labels, so each row keeps its label
# when SCRAMBLE is False, X_all and y_all are simply left as they are

ascii_table(X_all,y_all)

            input  -> pred  des. 
        [0. 0. 0.] -> ?     0    
        [0. 0. 1.] -> ?     1    
        [0. 1. 0.] -> ?     1    
        [0. 1. 1.] -> ?     0    
        [1. 0. 0.] -> ?     1    
        [1. 0. 1.] -> ?     0    
        [1. 1. 0.] -> ?     0    
        [1. 1. 1.] -> ?     1    


In [33]:
#
# here, we _cheat_ by letting the full dataset 
# be _both_ the training and testing sets.  (There are too few otherwise!)
#
# training set: a copy of the whole dataset
X_train, y_train = X_all.copy(), y_all.copy()

# testing set: a second, independent copy of the very same rows
X_test, y_test = X_all.copy(), y_all.copy()

ascii_table(X_train,y_train)    # same as above

            input  -> pred  des. 
        [0. 0. 0.] -> ?     0    
        [0. 0. 1.] -> ?     1    
        [0. 1. 0.] -> ?     1    
        [0. 1. 1.] -> ?     0    
        [1. 0. 0.] -> ?     1    
        [1. 0. 1.] -> ?     0    
        [1. 1. 0.] -> ?     0    
        [1. 1. 1.] -> ?     1    


In [34]:
#
# for NNets, it's important to keep the feature values near 0, say -1. to 1. or so
#    This is done through the "StandardScaler" in scikit-learn
# 
from sklearn.preprocessing import StandardScaler

USE_SCALER = True   # this variable is important! It tracks if we need to use the scaler...

# One constructor call covers both cases:
#   USE_SCALER True  -> find the mean and use the st. deviation (StandardScaler's defaults)
#   USE_SCALER False -> "don't find the mean, don't use the st deviation...":
#                       no scaling at all, but we still create it to be consistent
apply_scaling = (USE_SCALER == True)
scaler = StandardScaler(copy=True, with_mean=apply_scaling, with_std=apply_scaling)

# we "train the scaler" with the training data (needed even when it does no scaling)
scaler.fit(X_train)

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Here are our scaled training and testing sets:
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# the predicted/desired labels are not scaled: the scaler is not used on them
y_train_scaled, y_test_scaled = y_train, y_test

ascii_table(X_train_scaled,y_train_scaled)

#
# Note that the zeros have become -1's
# and the 1's have stayed 1's
#

            input  -> pred  des. 
     [-1. -1. -1.] -> ?     0    
     [-1. -1.  1.] -> ?     1    
     [-1.  1. -1.] -> ?     1    
     [-1.  1.  1.] -> ?     0    
     [ 1. -1. -1.] -> ?     1    
     [ 1. -1.  1.] -> ?     0    
     [ 1.  1. -1.] -> ?     0    
        [1. 1. 1.] -> ?     1    


In [35]:
#
# import our NNet library (within scikit-learn)
#
from sklearn.neural_network import MLPClassifier

#
# Here's where you can change the number of layers, neurons, and other parameters:
#
# Build the classifier: two hidden layers (6 neurons, then 7) between the 3 inputs and the output.
nn_classifier = MLPClassifier(hidden_layer_sizes=(6,7),  # 3 input -> 6 -> 7 -> 1 output
                    max_iter=500,      # maximum number of training iterations (epochs)
                    activation="tanh", # the "activation function" input -> output
                    solver='sgd',      # the algorithm for optimizing weights
                    verbose=True,      # False to "mute" the training
                    shuffle=True,      # reshuffle the training epochs?
                    random_state=None, # None: results vary run to run; set an integer for reproducibility
                    learning_rate_init=.1,       # learning rate: % of error to backprop
                    learning_rate = 'adaptive')  # soften feedback as it converges

# documentation:
# scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html 
#     Try verbose / activation "relu" / other network sizes ...

# Train on the scaled features; the labels are used as-is (they are not scaled).
print("\n\n++++++++++  TRAINING:  begin  +++++++++++++++\n\n")
nn_classifier.fit(X_train_scaled, y_train_scaled)
print("\n++++++++++  TRAINING:   end  +++++++++++++++")
print(f"The analog prediction error (the loss) is {nn_classifier.loss_}")



++++++++++  TRAINING:  begin  +++++++++++++++


Iteration 1, loss = 0.80358163
Iteration 2, loss = 0.74313317
Iteration 3, loss = 0.68564791
Iteration 4, loss = 0.64628144
Iteration 5, loss = 0.62693421
Iteration 6, loss = 0.61899236
Iteration 7, loss = 0.61152413
Iteration 8, loss = 0.59750454
Iteration 9, loss = 0.57530704
Iteration 10, loss = 0.54724162
Iteration 11, loss = 0.51702695
Iteration 12, loss = 0.48740955
Iteration 13, loss = 0.45900222
Iteration 14, loss = 0.43070689
Iteration 15, loss = 0.40102397
Iteration 16, loss = 0.36916273
Iteration 17, loss = 0.33532431
Iteration 18, loss = 0.30030252
Iteration 19, loss = 0.26498150
Iteration 20, loss = 0.23014835
Iteration 21, loss = 0.19661043
Iteration 22, loss = 0.16531936
Iteration 23, loss = 0.13723116
Iteration 24, loss = 0.11297793
Iteration 25, loss = 0.09268563
Iteration 26, loss = 0.07606552
Iteration 27, loss = 0.06262039
Iteration 28, loss = 0.05180967
Iteration 29, loss = 0.04313583
Iteration 30, loss = 0.03617481

In [36]:
#
# how did it do on the data?    (Here, remember, training == testing: we're cheating!)   
#

def ascii_table_for_classifier(Xsc,y,nn,scaler):
    """ print a table comparing nn's predicted classes with the desired ones

        Xsc:    the scaled feature rows that nn predicts from
        y:      the desired label for each row of Xsc
        nn:     a fitted classifier offering predict and predict_proba
        scaler: the scaler behind Xsc; its inverse_transform recovers
                the unscaled inputs, which are what get printed

        Incorrect rows also show the predicted class probabilities;
        a final line reports how many predictions were correct.
    """
    all_preds = nn.predict(Xsc)            # one predicted class per row
    all_probs = nn.predict_proba(Xsc)      # one row of class probabilities per input row
    Xpr = scaler.inverse_transform(Xsc)    # Xpr is the "X to print": unscaled data!
    num_correct = 0

    print(f"{'input ':>18s} -> {'pred':^6s} {'des.':^6s}")
    for i in range(len(y)):
        pred, desired = all_preds[i], y[i]
        if pred == desired:
            result = "  correct"
            num_correct += 1
        else:
            result = "  incorrect: " + str(all_probs[i,:])
        # Xpr = Xsc  # if you want to see the scaled versions
        print(f"{Xpr[i,:]!s:>18s} -> {pred:^6.0f} {desired:^6.0f} {result:^10s}")

    print(f"\ncorrect predictions: {num_correct} out of {len(y)}")
    
#
# let's see how it did on the test data (also the training data!)
#
ascii_table_for_classifier(X_test_scaled,
                           y_test_scaled,
                           nn_classifier,
                           scaler)   


            input  ->  pred   des. 
        [0. 0. 0.] ->   0      0      correct 
        [0. 0. 1.] ->   1      1      correct 
        [0. 1. 0.] ->   1      1      correct 
        [0. 1. 1.] ->   0      0      correct 
        [1. 0. 0.] ->   1      1      correct 
        [1. 0. 1.] ->   0      0      correct 
        [1. 1. 0.] ->   0      0      correct 
        [1. 1. 1.] ->   1      1      correct 

correct predictions: 8 out of 8


In [37]:
#
# More rarely, we will want to see the neurons' weights and other details:
#

nn = nn_classifier  # a shorter alias for the fitted classifier
print("\n\n+++++ parameters, weights, etc. +++++\n")
print("\nweights/coefficients:\n")
# nn.coefs_ holds one weight matrix per layer-to-layer connection
for layer_weights in nn.coefs_:
    print(layer_weights)
print(f"\nintercepts: {nn.intercepts_}")
print(f"\nall parameters: {nn.get_params()}")



+++++ parameters, weights, etc. +++++


weights/coefficients:

[[ 1.21382934 -1.53652386  1.34796333 -0.5727733  -1.15148375 -0.86964985]
 [ 1.52802173  1.7472659  -0.03630473 -0.92608976 -0.24494094 -0.31015794]
 [-1.39840348 -1.84887153 -0.15291766  0.78774776 -0.15461596 -0.1718161 ]]
[[ 0.58721342  1.26616396  0.15941477 -0.19511234  1.39773185 -0.44373931
  -0.72713351]
 [-0.99518225 -1.01183639 -0.91222726  0.22893017 -1.59768739  0.46979773
   0.75892451]
 [ 0.17829857 -0.86614084 -0.97028741 -0.67555495 -0.83876894  0.53608387
   0.12439031]
 [ 0.13516531  0.0488237  -0.96040747 -0.1290118  -0.49391318  0.24074678
   0.38373951]
 [ 0.03807796  0.27267073 -0.13207998 -0.2187538   0.69205944 -0.2272655
  -0.54530881]
 [ 0.79100444  0.27032176  0.29656529 -0.1658389   0.6867491  -0.09457431
   0.25564488]]
[[ 1.1514851 ]
 [ 1.93992299]
 [ 1.28994139]
 [-0.10097596]
 [ 2.75222561]
 [-0.91049168]
 [-1.06650915]]

intercepts: [array([ 0.01208356,  0.01901037, -0.41008127, -0.131676

In [38]:
#
# we have a predictive model!  Let's try it out...
#
def make_prediction( Features, nn, scaler ):
    """ predict a class for one row of (unscaled) feature values

        Prints the input features and nn's class probabilities, then
        returns the result of nn.predict for the single scaled row.
    """
    print("input features are", Features)
    # wrap Features in a list so the array is a single row, then scale it with scaler
    scaled_row = scaler.transform(np.array( [Features] ))
    print("nn.predict_proba == ", nn.predict_proba(scaled_row))   # probabilities of each
    return nn.predict(scaled_row)  # max!
    
# our features -- note that the inputs don't have to be bits!
Features = [ 1, 0, 1 ]      # whatever we'd like to test - need not be binary at all!
prediction = make_prediction(Features, nn_classifier, scaler)
print(f"prediction: {prediction}")   # just takes the max

input features are [1, 0, 1]
nn.predict_proba ==  [[0.99660997 0.00339003]]
prediction: [0.]


In [39]:
#
# MLPRegressor predicts _floating-point_ outputs
#

from sklearn.neural_network import MLPRegressor

# Build the regressor with the same layer sizes, activation, solver and learning-rate
# settings as the classifier above; only max_iter differs (200 here).
nn_regressor = MLPRegressor(hidden_layer_sizes=(6,7), 
                    max_iter=200,          # maximum number of training epochs
                    activation="tanh",     # the activation function
                    solver='sgd',          # the optimizer
                    verbose=True,          # do we want to watch as it trains?
                    shuffle=True,          # shuffle each epoch?
                    random_state=None,     # None: results vary run to run; set an integer for reproducibility
                    learning_rate_init=.1, # how much of each error to back-propagate
                    learning_rate = 'adaptive')  # how to handle the learning_rate

# Train on the scaled features; the target values are used as-is (they are not scaled).
print("\n\n++++++++++  TRAINING:  begin  +++++++++++++++\n\n")
nn_regressor.fit(X_train_scaled, y_train_scaled)
print("++++++++++  TRAINING:   end  +++++++++++++++")

print(f"The (squared) prediction error (the loss) is {nn_regressor.loss_}")
print(f"And, its square root: {nn_regressor.loss_ ** 0.5}")



++++++++++  TRAINING:  begin  +++++++++++++++


Iteration 1, loss = 0.40816949
Iteration 2, loss = 0.22557962
Iteration 3, loss = 0.15897929
Iteration 4, loss = 0.14989194
Iteration 5, loss = 0.15522071
Iteration 6, loss = 0.15797370
Iteration 7, loss = 0.15308857
Iteration 8, loss = 0.14175494
Iteration 9, loss = 0.12827512
Iteration 10, loss = 0.11664322
Iteration 11, loss = 0.10867651
Iteration 12, loss = 0.10417694
Iteration 13, loss = 0.10193327
Iteration 14, loss = 0.10056177
Iteration 15, loss = 0.09895995
Iteration 16, loss = 0.09648523
Iteration 17, loss = 0.09295917
Iteration 18, loss = 0.08855738
Iteration 19, loss = 0.08364507
Iteration 20, loss = 0.07861295
Iteration 21, loss = 0.07374884
Iteration 22, loss = 0.06916507
Iteration 23, loss = 0.06479600
Iteration 24, loss = 0.06046830
Iteration 25, loss = 0.05601735
Iteration 26, loss = 0.05139166
Iteration 27, loss = 0.04668579
Iteration 28, loss = 0.04208702
Iteration 29, loss = 0.03777911
Iteration 30, loss = 0.03386973

In [40]:
#
# how did it do? We're making progress ... by regressing:
#
def ascii_table_for_regressor(Xsc,y,nn,scaler):
    """ print a table comparing nn's numeric predictions with the desired values

        Xsc:    the scaled feature rows that nn predicts from
        y:      the desired value for each row of Xsc
        nn:     a fitted regressor offering predict
        scaler: the scaler behind Xsc; its inverse_transform recovers
                the unscaled inputs, which are what get printed

        Each row shows the absolute difference |desired - predicted|;
        a final line reports the average of those differences.
    """
    all_preds = nn.predict(Xsc)
    Xpr = scaler.inverse_transform(Xsc)  # Xpr is the "X to print": unscaled data!
    num_rows = len(y)
    total_error = 0.0                    # running sum of the absolute differences

    print(f"{'input ':>18s} ->  {'pred':^6s}  {'desr':^6s}  {'absdiff':^10s}")
    for i in range(num_rows):
        pred, desired = all_preds[i], y[i]
        result = abs(desired - pred)
        total_error += result
        # Xpr = Xsc   # if you'd like to see the scaled values
        print(f"{Xpr[i,:]!s:>18s} ->  {pred:<+6.3f}  {desired:<+6.3f}  {result:^10.3f}")

    print(f"\naverage absolute error: {total_error/len(y)}")
    
#
# let's see how it did on the test data (also the training data!)
#
ascii_table_for_regressor(X_test_scaled,
                          y_test_scaled,
                          nn_regressor,
                          scaler)   # this is our own f'n, above


            input  ->   pred    desr    absdiff  
        [0. 0. 0.] ->  -0.033  +0.000    0.033   
        [0. 0. 1.] ->  +1.044  +1.000    0.044   
        [0. 1. 0.] ->  +1.013  +1.000    0.013   
        [0. 1. 1.] ->  -0.008  +0.000    0.008   
        [1. 0. 0.] ->  +1.004  +1.000    0.004   
        [1. 0. 1.] ->  -0.004  +0.000    0.004   
        [1. 1. 0.] ->  +0.005  +0.000    0.005   
        [1. 1. 1.] ->  +0.985  +1.000    0.015   

average absolute error: 0.015785068630739894


In [41]:
#
# other details we might want to see for the regressor... (less often)
#
nn = nn_regressor  # a shorter alias for the fitted regressor
print("\n\n+++++ parameters, weights, etc. +++++\n")
print("\nweights/coefficients:\n")
# nn.coefs_ holds one weight matrix per layer-to-layer connection
for layer_weights in nn.coefs_:
    print(layer_weights)
print(f"\nintercepts: {nn.intercepts_}")
print(f"\nall parameters: {nn.get_params()}")



+++++ parameters, weights, etc. +++++


weights/coefficients:

[[ 0.67234027 -0.15541633 -0.48714159 -0.29596586 -0.6629438   0.84078248]
 [ 0.79441176 -0.00594859 -0.23616109  0.30008869 -0.94297981  0.72750852]
 [-0.65395898  0.63055687  0.18673944 -0.48559702  0.77141119  0.20476742]]
[[ 5.72456371e-01 -2.45249926e-01 -8.76576481e-01  6.88029105e-02
  -8.71301179e-02  2.09262955e-01 -7.53743516e-02]
 [ 4.83404322e-01  1.51198668e-01  9.27433727e-01  5.16873045e-01
  -1.99173435e-01  3.20704077e-04 -1.72875316e-01]
 [ 7.06327326e-02  3.26529000e-02  3.59277292e-01  2.84301763e-03
  -3.87523910e-01  1.40775314e-01 -8.38017819e-01]
 [-6.88574265e-01  1.05831474e-01  9.69980318e-02 -4.59445920e-01
  -5.88829726e-01 -5.74059078e-01  2.05473072e-01]
 [ 2.93135316e-01  1.48655930e-01  9.61826462e-01 -4.71280819e-01
   3.87037866e-02  1.17475796e-01 -1.18068391e-01]
 [ 6.48279326e-01 -8.10026091e-01 -4.10578364e-01 -6.26260476e-01
   2.16497832e-01 -3.66611124e-01 -1.10505814e-02]]
[[ 0.3

In [42]:
#
# again, we have a predictive model. This time, it's a regressor.  Let's try it out...
#

def make_prediction( Features, nn, scaler ):
    """ use a fitted NNet (regressor or classifier) to make a prediction for one row

        Features: a flat sequence of (unscaled) feature values
        nn:       a fitted model with a predict method; if it also offers
                  predict_proba (a classifier), the class probabilities are printed
        scaler:   a fitted scaler; its transform is applied before predicting
        returns:  whatever nn.predict returns for the single scaled row
    """
    print("input features are", Features)
    row = np.array( [Features] )  # a list-of-lists-style input is needed
    row = scaler.transform(row)   # scale!
    # This definition replaces the earlier, classifier-oriented make_prediction,
    # so keep that version's probability printout for models that support it.
    if hasattr(nn, "predict_proba"):
        print("nn.predict_proba == ", nn.predict_proba(row))
    prediction = nn.predict(row)
    return prediction
    
# our features
Features = [ 1.0, 0.0, 1.0 ]     # can vary these to be anything!
prediction = make_prediction(Features, nn_regressor, scaler)
print(f"prediction: {prediction}")

input features are [1.0, 0.0, 1.0]
prediction: [-0.0041837]
