In [None]:
import os
from collections import defaultdict
from time import clock

import mlrose as ml
import numpy as np
import pandas as pd
import sklearn.model_selection as ms
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.utils import compute_sample_weight


In [None]:
# GET THE DATA ============

# wine data
file_path = "./data/"
wine = pd.read_csv(file_path + 'wines.csv', sep=",")

# Break out predicting and target variable data.
# The axis is passed by keyword: the positional form drop('quality', 1)
# is deprecated in newer pandas releases and no longer accepted by pandas 2.x.
wineX = wine.drop('quality', axis=1).copy().values
wineY = wine['quality'].copy().values


In [None]:
# sanity check: dimensions of the feature matrix and of the target vector
tuple(arr.shape for arr in (wineX, wineY))

In [None]:
# DIVIDE INTO TRAIN AND TEST SETS
# 30% of the rows are held out for testing; the split is stratified on the target
wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(
    wineX,
    wineY,
    test_size=0.3,
    random_state=0,
    stratify=wineY,
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# scale the data
# Features and target each get their own scaler: refitting a single shared
# scaler on the target would throw away the feature ranges it had just learned,
# leaving `scaler` unusable for transforming further feature data.
scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

# fit on the training split only, then apply the same ranges to the test split
X_train = scaler.fit_transform(wine_trgX)
X_test = scaler.transform(wine_tstX)
# the target is reshaped to a single column and min-max scaled as well
# NOTE(review): scaling a classification label only makes sense if the labels
# are already 0/1 -- confirm against the contents of wines.csv
y_train = y_scaler.fit_transform(wine_trgY.reshape(-1, 1))
y_test = y_scaler.transform(wine_tstY.reshape(-1, 1))


In [None]:
# confirm the dimensions of the scaled train/test features and targets
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# peek at the first few scaled targets instead of displaying the whole array
y_train[:5]

In [None]:
# find parameter values to iterate through
#alphas = [10**-x for x in np.arange(1,4.01,1)]
alphas = [0.1, 0.01]
# number of feature columns (only referenced by the commented-out grid below)
dw = wineX.shape[1]
#hiddens_wine = [[h,]*l for l in [1,2] for h in [dw,dw//2,round(dw/4),dw*2]]
hiddens = [[12], [6], [6,6]]

# one entry per hyper-parameter; each value is the list of candidates to try.
# Key order is kept as-is because it fixes the order of the generated combinations.
paramdict = {
    'activation': ['relu'],
    'learning_rate': alphas,
    'hidden_nodes': hiddens,
    'max_iters': [100],
    'clip_max': [3,5],
    'max_attempts': [10],
    'pop_size': [100, 300],
    'mutation_prob': [0.1, 0.3, 0.8],
}


In [None]:
# implementing a brute force "homemade" GridSearch
import mlrose as ml
from itertools import combinations
from sklearn.metrics import accuracy_score
import itertools

# One result tuple per experiment; turned into a DataFrame after the search.
Genetic_Reg = []

# Create the results folder up front so the to_csv at the end cannot fail
# after every (slow) experiment has already been run.
os.makedirs('./output', exist_ok=True)

# get all the different combinations of parameters
keys, values = zip(*paramdict.items())
params = [dict(zip(keys, v)) for v in itertools.product(*values)]

# every experiment uses the genetic algorithm optimiser
algo = 'genetic_alg'

# The unpacked names below (act, node, learn, times, clip, ...) remain
# notebook-level variables after the loop, exactly as before.
for i, combo in enumerate(params):
    act = combo['activation']
    node = combo['hidden_nodes']
    learn = combo['learning_rate']
    times = combo['max_iters']
    clip = combo['clip_max']
    attempt = combo['max_attempts']
    popsize = combo['pop_size']
    mutants = combo['mutation_prob']

    print('Starting experiment {} of {}...'.format(i,len(params)))

    # reseed before each experiment so every combination starts from the same RNG state
    np.random.seed(55)

    nn_rose = ml.NeuralNetwork(hidden_nodes = node, activation = act,
                               algorithm = algo, max_iters = times, pop_size = popsize,
                               mutation_prob = mutants, bias = False, is_classifier = True,
                               learning_rate = learn, early_stopping = True,
                               clip_max = clip, max_attempts = attempt)

    # start the clock - training
    st = clock()
    # fit the model
    nn_rose.fit(X_train, y_train)

    # Predict labels for train set and assess accuracy
    # (the timed span covers the fit plus this prediction and scoring)
    y_train_pred = nn_rose.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    # stop train time, save time elapsed
    train_time = clock()-st

    print('Training Accuracy of experiment {} is {}'.format(i,round(train_accuracy, 6)))
    print('Time to train = {}'.format(train_time))

    # start the clock - testing
    st = clock()
    # Predict labels for test set and assess accuracy
    y_test_pred = nn_rose.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    # stop test clock and save time elapsed
    test_time = clock()-st
    print('Test Accuracy of experiment {} is {}'.format(i,round(test_accuracy, 6)))
    print('Time to test = {}'.format(test_time))

    print('--------')

    results = (i, train_time, test_time, train_accuracy, test_accuracy, act, node, learn,
               times, clip, attempt, popsize, mutants, nn_rose.loss, nn_rose.node_list)
    Genetic_Reg.append(results)

# save the results and export, naming the columns at construction time
regTable = pd.DataFrame(
    Genetic_Reg,
    columns=['Run_no', 'train_time', 'test_time', 'train_accuracy', 'test_accuracy', 'activation',
             'hidden_nodes', 'learning_rate', 'max_iter', 'clip_max', 'max_attempts', 'pop_size',
             'mutation_prob','loss', 'node_list'])
regTable.to_csv('./output/{}_{}_reg.csv'.format("Genetic","wine"),index=False)
                        

In [None]:
# Manual process - examine CSV file for best model
# see graphs

# regTable['test_accuracy'].max()
# bestacc = regTable.loc[regTable['test_accuracy'].idxmax()]

In [None]:
# GENETIC - Scatterplots (the bar plots may work better for some of these)
# Time vs Accuracy for different values of learn rate, population, and mutation prob ===================

import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap

# style
plt.style.use('seaborn-darkgrid')
# create a color palette
palette = plt.get_cmap('Set1')


models = ['Genetic']
datasets = ['wine']
# result columns used for colouring, the matching image-name fragments,
# and the human-readable labels used in the plot titles (kept in step by position)
params = ['learning_rate', 'pop_size', 'mutation_prob', 'hidden_nodes', 'clip_max', 'max_attempts']
img_name = ['learn_rate', 'pop_size', 'mutate_prob', 'hidden_nodes', 'clip_max', 'max_attempts']
axis = ["Learning Rate", 'Population Size', "Mutation Probability", "Hidden Layers", "Clip", "Max Attempts"]

for model in models:
    for ds in datasets:
        # get the data -- the file is the same for every parameter, so read it once
        file = './output/'+model+'_'+ds+'_reg.csv'
        reg = pd.read_csv(file, sep=",")

        for param, img_suffix, axis_label in zip(params, img_name, axis):
            # Use the 'hue' argument to provide a factor variable
            sns.lmplot(x="train_time", y="test_accuracy", data=reg, fit_reg=False, hue=param,
                       legend=False)

            plt.title('Train Time & Accuracy of GA Models by '+axis_label+' ', loc='center', fontsize=12, fontweight=0, color='darkblue')
            plt.xlabel('Train Time per Iteration (in seconds)')
            plt.ylabel('Test Accuracy')
            plt.ylim(ymax = .9, ymin =0)
            # single legend call; an earlier 'lower right' legend was always replaced by this one
            plt.legend(loc='best', ncol=2, frameon=True)
            plt.xticks(rotation=0)
            plt.savefig('./images/'+model+'_time_vs_acc _'+img_suffix+'.png')
            plt.show()


In [None]:
# look at accuracy with 'best' parameters, using different max iterations
def mlrose_LC_iters(X, Y, clfName, clip_max=5):
    """Train the GA-optimised network for a range of iteration caps and save the results.

    The data is split 80/20 (random_state=55) and min-max scaled once; a fresh
    network is then fitted for every value in the iteration list. For each value
    the elapsed time and accuracy on the training and test sets are recorded and
    printed.

    Parameters
    ----------
    X : array of feature rows.
    Y : 1-D array of targets (reshaped to a single column before scaling).
    clfName : str
        Label used in the printed progress lines and in the output file name.
    clip_max : optional
        Passed to the network as `clip_max`. This used to be read from a
        notebook-level variable `clip` left behind by the grid-search loop; the
        default of 5 is the value that variable holds once the grid search has
        run to completion (its last parameter combination uses clip_max=5).
        NOTE(review): the final-model cell hard-codes clip_max=3 -- confirm
        which value is intended.

    Returns
    -------
    None. Results are written to './output/<clfName>_iters_LC_2ndRun.csv',
    indexed by iteration count with columns train, train_acc, test, test_acc.
    """
    # make sure the destination exists before doing any expensive training
    os.makedirs('./output', exist_ok=True)

    out = defaultdict(dict)
    iterations = [10, 20, 30, 40, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700]

    # The split uses a fixed random_state and the scaling is deterministic, so
    # both are identical for every iteration count and only need to run once.
    X_train, X_test, y_train, y_test = ms.train_test_split(X, Y, test_size=0.2, random_state=55)
    # scale the data (ranges learned from the training part only)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    y_scaler = MinMaxScaler()
    y_train = y_scaler.fit_transform(y_train.reshape(-1, 1))
    y_test = y_scaler.transform(y_test.reshape(-1, 1))

    for iters in iterations:
        np.random.seed(55)
        nn_rose_iters = ml.NeuralNetwork(hidden_nodes = [6],
                                activation = 'relu',
                                algorithm = 'genetic_alg',
                                max_iters = iters,
                                pop_size = 100,
                                mutation_prob = 0.3,
                                bias = False,
                                is_classifier = True,
                                learning_rate = 0.1,
                                early_stopping = False,
                                clip_max = clip_max,
                                max_attempts = 10)

        # start the clock - training
        st = clock()
        np.random.seed(55)
        nn_rose_iters.fit(X_train, y_train)
        # Predict labels for train set and assess accuracy
        # (the timed span covers the fit plus this prediction and scoring)
        y_train_pred = nn_rose_iters.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_train_pred)
        # stop train time, save time elapsed
        out['train'][iters] = clock()-st
        out['train_acc'][iters] = train_accuracy

        # start the clock - testing
        st = clock()
        # Predict labels for test set and assess accuracy; predict is called
        # once only, so the recorded time is not inflated by a discarded call
        y_test_pred = nn_rose_iters.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        # stop test clock and save
        out['test'][iters] = clock()-st
        out['test_acc'][iters] = test_accuracy

        print(clfName,iters)
        print(test_accuracy)
        print(train_accuracy)
        print('------------')
    out = pd.DataFrame(out)
    out.to_csv('./output/{}_iters_LC_2ndRun.csv'.format(clfName))
    return

In [None]:
# run the iteration sweep on the full (unsplit, unscaled) wine arrays
mlrose_LC_iters(X=wineX, Y=wineY, clfName='Genetic')

In [None]:
# graph run time and accuracy by iterations
models = ['Genetic']
datasets = ['wine'] #, 'credit']

# line appearance for the train and test series (shared by both charts)
train_style = dict(marker='o', markerfacecolor='blue', markersize=6, color='skyblue', linewidth=2)
test_style = dict(marker='o', markerfacecolor='red', color='orange', linewidth=2)

for model in models:
    for s in datasets:
        # get data: the per-iteration results saved as <model>_iters_LC_2ndRun.csv
        file_path3 = "./output/"
        file_name = model + '_iters_LC_2ndRun.csv'
        timing = pd.read_csv(file_path3 + file_name, sep=",")
        # the unnamed first CSV column is exposed under the name 'iters'
        timing['iters'] = timing['Unnamed: 0']

        # (train column, test column, title, y-axis label, image file)
        charts = [
            ('train', 'test',
             "Genetic Algorithm, Run Time by Iterations",
             "Run Time (in Seconds)",
             'images/' + model + '_time_iterations_zoom.png'),
            ('train_acc', 'test_acc',
             "Genetic Algorithm, Accuracy by Iterations",
             "Accuracy",
             'images/' + model + '_acc_iterations_zoom.png'),
        ]

        for train_col, test_col, heading, y_label, image_file in charts:
            plt.plot('iters', train_col, data=timing, **train_style)
            plt.plot('iters', test_col, data=timing, **test_style)
            plt.title(heading, loc='center', fontsize=12, fontweight=0, color='darkblue')
            plt.xlabel("Iterations")
            plt.ylabel(y_label)
            plt.legend()
            plt.savefig(image_file)
            plt.show()
        
        
        

In [None]:
# best parameters defined for genetic algorithm
np.random.seed(55)
nn_rose_best = ml.NeuralNetwork(hidden_nodes = [6],
                                activation = 'relu',
                                algorithm = 'genetic_alg',
                                max_iters = 30, # 30 determined as best accuracy with least train time
                                pop_size = 100,
                                mutation_prob = 0.3,
                                bias = False,
                                is_classifier = True,
                                learning_rate = 0.1,
                                early_stopping = False,
                                # Explicit value instead of the `clip` variable left behind by the
                                # grid-search loop; 5 is what that variable holds after a full
                                # search (its last combination uses clip_max=5).
                                # NOTE(review): the final-model cell uses clip_max=3 -- confirm
                                # which value is intended.
                                clip_max = 5,
                                max_attempts = 10)
                

In [None]:
# define timing curve function

from time import clock
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict


def mlrose_TimingCurve(X,Y,clfName,dataset):
    """Refit the notebook-level `nn_rose_best` model on different train/test splits.

    For each test fraction from 0.1 to 0.9 the data is split (random_state=55),
    min-max scaled, and `nn_rose_best` is fitted on the training part. Elapsed
    time and accuracy are recorded for the training and the test part.

    Parameters
    ----------
    X : array of feature rows.
    Y : 1-D array of targets (reshaped to a single column before scaling).
    clfName : str
        Used in the progress print and in the output file name.
    dataset : str
        Used in the progress print and in the output file name.

    Returns
    -------
    None. Results are written to './output/<clfName>_<dataset>_timing_2ndRun.csv',
    indexed by test fraction with columns train, train_acc, test, test_acc.
    """
    # make sure the destination exists before doing any expensive training
    os.makedirs('./output', exist_ok=True)

    out = defaultdict(dict)
    for frac in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
        X_train, X_test, y_train, y_test = ms.train_test_split(X, Y, test_size=frac, random_state=55)
        # scale the data (ranges learned from the training part only)
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        y_scaler = MinMaxScaler()
        y_train = y_scaler.fit_transform(y_train.reshape(-1, 1))
        y_test = y_scaler.transform(y_test.reshape(-1, 1))

        # start the clock - training
        st = clock()
        np.random.seed(55)
        nn_rose_best.fit(X_train,y_train)
        # Predict labels for train set and assess accuracy
        # (the timed span covers the fit plus this prediction and scoring)
        y_train_pred = nn_rose_best.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_train_pred)
        # stop train time, save time elapsed
        out['train'][frac] = clock()-st
        out['train_acc'][frac] = train_accuracy

        # start the clock - testing
        st = clock()
        # Predict labels for test set and assess accuracy; predict is called
        # once only, so the recorded time is not inflated by a discarded call
        y_test_pred = nn_rose_best.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        # stop test clock and save
        out['test'][frac] = clock()-st
        out['test_acc'][frac] = test_accuracy

        print(clfName,dataset,frac)
    out = pd.DataFrame(out)
    out.to_csv('./output/{}_{}_timing_2ndRun.csv'.format(clfName,dataset))
    return

In [None]:
# run the train/test-fraction sweep on the full (unsplit, unscaled) wine arrays
mlrose_TimingCurve(X=wineX, Y=wineY, clfName='Genetic', dataset='wine')

In [None]:
# graph the timing curve
# Graph timing curve results by method ===========================================

# multiple line plot
# style
plt.style.use('seaborn-darkgrid')

# create a color palette
palette = plt.get_cmap('Set1')


# Create a loop and save the resulting graphs
models = ['Genetic']
datasets = ['wine']

# line appearance for the train and test series (shared by all charts)
train_style = dict(marker='o', markerfacecolor='blue', markersize=6, color='skyblue', linewidth=2)
test_style = dict(marker='o', markerfacecolor='red', color='orange', linewidth=2)

for model in models:
    for s in datasets:
        # get data -- the file name matches what mlrose_TimingCurve writes
        # ('<model>_<dataset>_timing_2ndRun.csv'); the previous name without
        # the '_2ndRun' suffix pointed at a file this notebook never creates
        file_path3 = "./output/"
        file_name = model+'_'+s+'_timing_2ndRun.csv'

        timing = pd.read_csv(file_path3+file_name, sep=",")
        # the unnamed first CSV column is exposed under the name 'fraction'
        timing['fraction'] = timing['Unnamed: 0']
        # convert frac of data to sample size
        # NOTE(review): 6497 is hard-coded as the total row count -- confirm it matches wines.csv
        timing['test samples'] = timing['fraction'].apply(lambda x: round(x*6497))

        # (x column, train column, test column, title, x label, y label, image file)
        charts = [
            # timing by fraction of data
            ('fraction', 'train', 'test',
             "Genetic Algorithm Timing Curve, Wine Data",
             "Data Fraction Saved for Testing", "Time (in Seconds)",
             'images/GeneticAlgo_TimingCurve_2ndrun.png'),
            # accuracy by fractions of data
            ('fraction', 'train_acc', 'test_acc',
             "Genetic Algorithm, Learning Curve by Data Fraction",
             "Data Fraction Saved for Testing", "Accuracy",
             'images/GeneticAlgo_LC_datafrac_2ndrun.png'),
            # accuracy by sample size
            ('test samples', 'train_acc', 'test_acc',
             "Genetic Algorithm, Learning Curve by Test Set Size (n=6497)",
             "Samples in Test Set", "Accuracy",
             'images/GeneticAlgo_LC_samplesize_2ndrun.png'),
        ]

        for x_col, train_col, test_col, heading, x_label, y_label, image_file in charts:
            plt.plot(x_col, train_col, data=timing, **train_style)
            plt.plot(x_col, test_col, data=timing, **test_style)
            plt.title(heading, loc='center', fontsize=12, fontweight=0, color='darkblue')
            plt.xlabel(x_label)
            plt.ylabel(y_label)
            plt.legend()
            plt.savefig(image_file)
            plt.show()

        

In [None]:
# run algorithm with best parameters, best iters, and best test/train fraction

# best iters = 30
# best test frac = 0.3

def mlrose_GA_finalmodel(X,Y,clfName,dataset):
    """Fit and evaluate the final GA-optimised network once and save the result.

    The data is split 70/30 (random_state=55) and min-max scaled; a network with
    the hard-coded final hyper-parameters is fitted on the training part. Elapsed
    time and accuracy are recorded for the training and the test part, and the
    accuracies are printed.

    Parameters
    ----------
    X : array of feature rows.
    Y : 1-D array of targets (reshaped to a single column before scaling).
    clfName : str
        Used in the progress print and in the output file name.
    dataset : str
        Accepted for symmetry with the other helpers; not used in the body.

    Returns
    -------
    None. A one-row table (train_time, train_acc, test_time, test_acc) is
    written to './output/<clfName>_final.csv'.
    """
    # make sure the destination exists before doing any expensive training
    os.makedirs('./output', exist_ok=True)

    out = defaultdict(dict)
    # split the data according to best data frac
    X_train, X_test, y_train, y_test = ms.train_test_split(X, Y, test_size=0.3, random_state=55)
    # scale the data (ranges learned from the training part only)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    y_scaler = MinMaxScaler()
    y_train = y_scaler.fit_transform(y_train.reshape(-1, 1))
    y_test = y_scaler.transform(y_test.reshape(-1, 1))

    np.random.seed(55)
    nn_rose_final = ml.NeuralNetwork(hidden_nodes = [6],
                            activation = 'relu',
                            algorithm = 'genetic_alg',
                            max_iters = 30,
                            pop_size = 100,
                            mutation_prob = 0.3,
                            bias = False,
                            is_classifier = True,
                            learning_rate = 0.1,
                            early_stopping = False,
                            clip_max = 3,
                            max_attempts = 10)

    # start the clock - training
    st = clock()
    np.random.seed(55)
    nn_rose_final.fit(X_train,y_train)
    # Predict labels for train set and assess accuracy
    # (the timed span covers the fit plus this prediction and scoring)
    y_train_pred = nn_rose_final.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    # stop train time, save time elapsed
    out['train_time'][0] = clock()-st
    out['train_acc'][0] = train_accuracy

    # start the clock - testing
    st = clock()
    # Predict labels for test set and assess accuracy; predict is called
    # once only, so the recorded time is not inflated by a discarded call
    y_test_pred = nn_rose_final.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    # stop test clock and save
    out['test_time'][0] = clock()-st
    out['test_acc'][0] = test_accuracy

    print(clfName, "final model")
    print(test_accuracy)
    print(train_accuracy)
    print('------------')
    out = pd.DataFrame(out)
    out.to_csv('./output/{}_final.csv'.format(clfName))
    return


In [None]:
# fit and score the final model on the full (unsplit, unscaled) wine arrays
mlrose_GA_finalmodel(X=wineX, Y=wineY, clfName='Genetic', dataset='wine')