In [None]:
"""
Edited to fit needs of my data & analysis
Jamie Andrews
Feb 3 2019

"""
"""
Created on Fri Jan 20 13:55:38 2017
@author: JTay
"""
import os
from collections import defaultdict
from time import perf_counter

import numpy as np
import pandas as pd
import sklearn.model_selection as ms
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.tree import DecisionTreeClassifier as dtclf
from sklearn.utils import compute_sample_weight

try:
    from time import clock
except ImportError:  # time.clock was removed in Python 3.8
    clock = perf_counter


def balanced_accuracy(truth,pred):
    """Accuracy in which every sample is weighted by sklearn's 'balanced' class weights.

    Parameters
    ----------
    truth : array-like of true labels (also used to derive the weights).
    pred  : array-like of predicted labels, aligned with ``truth``.

    Returns
    -------
    The value of ``accuracy_score`` computed with those per-sample weights.
    """
    return accuracy_score(truth,pred,sample_weight=compute_sample_weight('balanced',truth))

# Module-level scorer wrapping balanced_accuracy; passed as `scoring=` to the
# grid searches and learning curves defined in this file.
scorer = make_scorer(balanced_accuracy)
    
def _basic_results(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,folds,small_sizes):
    """Shared implementation behind basicResults_wine and basicResults_credit.

    Runs a grid search over ``params`` with ``folds``-fold cross-validation,
    scores the refit best estimator on the test split and computes a learning
    curve for it. Everything is written under ./output:

    * ``{clf_type}_{dataset}_reg.csv``      - the full ``cv_results_`` table
    * ``test results.csv``                  - one appended line per call
    * ``{clf_type}_{dataset}_LC_train.csv`` - learning-curve training scores
    * ``{clf_type}_{dataset}_LC_test.csv``  - learning-curve validation scores

    ``small_sizes`` is the list of absolute training sizes placed in front of
    the 10%..60% fractions of the training set.

    Returns the fitted GridSearchCV object.
    Raises ValueError when ``clf_type`` or ``dataset`` is None.
    """
    np.random.seed(55)
    if clf_type is None or dataset is None:
        # The original bare `raise` had no active exception to re-raise and so only
        # produced an uninformative "RuntimeError: No active exception to reraise".
        raise ValueError('clf_type and dataset are required: they name the files written to ./output')
    # Create the folder up front so a long grid search cannot end in a missing-directory error.
    os.makedirs('./output',exist_ok=True)

    cv = ms.GridSearchCV(clfObj,n_jobs=1,param_grid=params,refit=True,verbose=10,cv=folds,scoring=scorer)
    cv.fit(trgX,trgY)
    pd.DataFrame(cv.cv_results_).to_csv('./output/{}_{}_reg.csv'.format(clf_type,dataset),index=False)

    test_score = cv.score(tstX,tstY)
    # NOTE(review): best_params_ is written as a dict repr, which itself contains commas,
    # so this line has more than four comma-separated fields. Left unchanged because
    # readers of this file are outside this view.
    with open('./output/test results.csv','a') as f:
        f.write('{},{},{},{}\n'.format(clf_type,dataset,test_score,cv.best_params_))

    N = trgY.shape[0]
    train_sizes = list(small_sizes)+[int(N*x/10) for x in range(1,7)]
    curve = ms.learning_curve(cv.best_estimator_,trgX,trgY,cv=folds,
                              train_sizes=train_sizes,verbose=10,scoring=scorer)
    # curve = (train sizes actually used, training scores, validation scores)
    curve_train_scores = pd.DataFrame(index = curve[0],data = curve[1])
    curve_test_scores  = pd.DataFrame(index = curve[0],data = curve[2])
    curve_train_scores.to_csv('./output/{}_{}_LC_train.csv'.format(clf_type,dataset))
    curve_test_scores.to_csv('./output/{}_{}_LC_test.csv'.format(clf_type,dataset))
    return cv


def basicResults_wine(clfObj,trgX,trgY,tstX,tstY,params,clf_type=None,dataset=None):
    """Grid search, test score and learning curve for the wine data.

    Uses 3-fold cross-validation (kept small so each fold still has enough
    samples) and starts the learning curve at 100 and 250 training samples.
    ``clf_type`` and ``dataset`` are required despite their None defaults;
    a ValueError is raised if either is missing. Returns the fitted GridSearchCV.
    """
    return _basic_results(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,
                          folds=3,small_sizes=[100,250])


def basicResults_credit(clfObj,trgX,trgY,tstX,tstY,params,clf_type=None,dataset=None):
    """Grid search, test score and learning curve for the credit data.

    Uses 5-fold cross-validation (there is more data to sample from) and starts
    the learning curve at 250, 500 and 1000 training samples.
    ``clf_type`` and ``dataset`` are required despite their None defaults;
    a ValueError is raised if either is missing. Returns the fitted GridSearchCV.
    """
    return _basic_results(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,
                          folds=5,small_sizes=[250,500,1000])
    
def _iteration_curve(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,folds):
    """Shared implementation behind iterationLC_wine and iterationLC_credit.

    1. Runs a ``folds``-fold grid search over ``params`` and saves ``cv_results_``
       to ``./output/ITER_base_{clf_type}_{dataset}.csv``.
    2. Sweeps the values of the FIRST parameter in ``params`` only: for each value
       the estimator is configured, fitted on the training split and scored with
       balanced accuracy on both the training and the test split. The table is
       saved to ``./output/ITERtestSET_{clf_type}_{dataset}.csv``.

    ``clfObj`` itself is modified (``set_params`` + ``fit``) during step 2 and is
    left configured with the last swept value.

    Returns the fitted GridSearchCV object.
    Raises ValueError when ``clf_type`` or ``dataset`` is None.
    """
    np.random.seed(55)
    if clf_type is None or dataset is None:
        # The original bare `raise` had no active exception to re-raise and so only
        # produced an uninformative "RuntimeError: No active exception to reraise".
        raise ValueError('clf_type and dataset are required: they name the files written to ./output')
    # Create the folder up front so a long grid search cannot end in a missing-directory error.
    os.makedirs('./output',exist_ok=True)

    cv = ms.GridSearchCV(clfObj,n_jobs=1,param_grid=params,refit=True,verbose=10,cv=folds,scoring=scorer)
    cv.fit(trgX,trgY)
    pd.DataFrame(cv.cv_results_).to_csv('./output/ITER_base_{}_{}.csv'.format(clf_type,dataset),index=False)

    d = defaultdict(list)
    name, values = list(params.items())[0]
    for value in values:
        d['param_{}'.format(name)].append(value)
        clfObj.set_params(**{name:value})
        # Fit once and score that same model on both splits. The original refitted
        # before predicting on the test set, doubling the cost and (for an estimator
        # without a fixed seed) comparing two different models.
        clfObj.fit(trgX,trgY)
        d['train acc'].append(balanced_accuracy(trgY,clfObj.predict(trgX)))
        d['test acc'].append(balanced_accuracy(tstY,clfObj.predict(tstX)))
        print(value)
    pd.DataFrame(d).to_csv('./output/ITERtestSET_{}_{}.csv'.format(clf_type,dataset),index=False)
    return cv


def iterationLC_wine(clfObj,trgX,trgY,tstX,tstY,params,clf_type=None,dataset=None):
    """Train/test accuracy as a function of one swept parameter, wine data (3-fold CV).

    ``clf_type`` and ``dataset`` are required despite their None defaults;
    a ValueError is raised if either is missing. Returns the fitted GridSearchCV.
    """
    return _iteration_curve(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,folds=3)


def iterationLC_credit(clfObj,trgX,trgY,tstX,tstY,params,clf_type=None,dataset=None):
    """Train/test accuracy as a function of one swept parameter, credit data (5-fold CV).

    ``clf_type`` and ``dataset`` are required despite their None defaults;
    a ValueError is raised if either is missing. Returns the fitted GridSearchCV.
    """
    return _iteration_curve(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,folds=5)
    
def add_noise(y,frac=0.1):
    """Return a copy of ``y`` in which a fraction ``frac`` of the entries is replaced by ``1 - value``.

    The global NumPy RNG is re-seeded with 456 on every call, so the same
    positions are chosen each time for a given length and ``frac``. ``y``
    itself is not modified.
    """
    np.random.seed(456)
    total = y.shape[0]
    # int(total*frac) distinct positions, drawn without replacement
    flip_at = np.random.choice(np.arange(total),size=int(total*frac),replace=False)
    noisy = y.copy()
    noisy[flip_at] = 1-noisy[flip_at]
    return noisy
    
    
def makeTimingCurve(X,Y,clf,clfName,dataset):
    """Time ``clf.fit`` and ``clf.predict`` for a range of train/test splits.

    For each ``frac`` in 0.1 .. 0.9 the data is split with ``test_size=frac``
    (so the training set shrinks as ``frac`` grows), ``clf`` is fitted on the
    training part and used to predict the test part. Elapsed times are stored
    under 'train' and 'test', keyed by ``frac``, and written to
    ``./output/{clfName}_{dataset}_timing.csv``.

    Timing uses ``time.perf_counter`` because ``time.clock`` was removed in
    Python 3.8. ``clf`` is refitted in place on every iteration. Returns None.
    """
    os.makedirs('./output',exist_ok=True)
    out = defaultdict(dict)
    for frac in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
        X_train, X_test, y_train, y_test = ms.train_test_split(X, Y, test_size=frac, random_state=42)
        # Seed before starting the timer so only the fit itself is measured.
        np.random.seed(55)
        st = perf_counter()
        clf.fit(X_train,y_train)
        out['train'][frac]= perf_counter()-st
        st = perf_counter()
        clf.predict(X_test)
        out['test'][frac]= perf_counter()-st
        print(clfName,dataset,frac)
    out = pd.DataFrame(out)
    out.to_csv('./output/{}_{}_timing.csv'.format(clfName,dataset))
    return


In [None]:
# %load ANN.py
"""
Created on Fri Jan 20 14:23:40 2017
@author: JTay
"""

import numpy as np
from sklearn.neural_network import MLPClassifier
import sklearn.model_selection as ms
import pandas as pd
from helpers_jamie import  basicResults_wine,makeTimingCurve,iterationLC_wine, iterationLC_credit, basicResults_credit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# for assignment 2

import sklearn
import numpy as np
import mlrose as ml
import sklearn.model_selection as ms



In [None]:
# GET THE DATA ============
# Both CSV files are read from ./data/ relative to the working directory.

# wine data
file_path ="./data/"
wine = pd.read_csv (file_path+'wines.csv', sep =",")
# credit data
file_path2 ="./data/"
credit = pd.read_csv (file_path2+'credit.csv', sep =",")

# Break out predicting and target variable data.
# `axis` is passed by keyword: the positional form drop(label, 1) was deprecated
# in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
wineX = wine.drop('quality',axis=1).copy().values
wineY = wine['quality'].copy().values

creditX = credit.drop('default',axis=1).copy().values
creditY = credit['default'].copy().values



In [None]:
# DIVIDE INTO TRAIN AND TEST SETS
# 30% of each dataset is held out for testing. The split is stratified on the
# target and uses a fixed random_state so it is repeatable.
wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(
    wineX,
    wineY,
    test_size=0.3,
    random_state=0,
    stratify=wineY,
)

credit_trgX, credit_tstX, credit_trgY, credit_tstY = ms.train_test_split(
    creditX,
    creditY,
    test_size=0.3,
    random_state=0,
    stratify=creditY,
)


In [None]:
# Check the data: display the shapes of the feature matrices and target vectors
# (credit first, then wine) to confirm rows line up between X and Y.
creditX.shape, creditY.shape, wineX.shape, wineY.shape

In [None]:
# build the pipelines

# Credit: standardise the features, then fit the MLP.
pipeC = Pipeline(steps=[
    ('Scale',StandardScaler()),
    ('MLP',MLPClassifier(max_iter=2000,early_stopping=True,random_state=55)),
])

# Wine: standardise, apply one random-forest feature-selection pass, then fit the MLP.
# NOTE(review): the selection step is named 'Hill Climb' although it is a
# SelectFromModel step; the name is kept because it is part of the pipeline's
# parameter namespace. Three further passes (random_state 2-4, formerly
# 'Cull2'..'Cull4') are disabled.
pipeW = Pipeline(steps=[
    ('Scale',StandardScaler()),
    ('Hill Climb',SelectFromModel(RandomForestClassifier(random_state=1),threshold='median')),
    ('MLP',MLPClassifier(max_iter=2000,early_stopping=True,random_state=55)),
])




In [None]:
# find parameter values to iterate through

# Credit: alpha = 10**-x for x = -1, -0.5, ..., 5; layer widths are based on the
# number of input columns d (d, d//2, 2d), each tried at depths 1, 2 and 3.
alphas = [10**-x for x in np.arange(-1,5.01,1/2)]
d = creditX.shape[1]
hiddens_credit = [(width,)*depth for depth in [1,2,3] for width in [d,d//2,d*2]]

# Wine: same scheme with x running up to 9 and an extra width of round(dw/4).
alphasW = [10**-x for x in np.arange(-1,9.01,1/2)]
dw = wineX.shape[1]
hiddens_wine = [(width,)*depth for depth in [1,2,3] for width in [dw,dw//2,round(dw/4),dw*2]]

# Grids are keyed by '<pipeline step>__<parameter>' so they address the 'MLP' step.
params_credit = {
    'MLP__activation':['relu','logistic'],
    'MLP__alpha':alphas,
    'MLP__hidden_layer_sizes':hiddens_credit,
}
params_wine = {
    'MLP__activation':['relu','logistic'],
    'MLP__alpha':alphasW,
    'MLP__hidden_layer_sizes':hiddens_wine,
}



In [None]:
# Scratch check: rebuild the wine hidden-layer grid and display it next to the
# credit grid. Only the final expression of a cell is displayed, so the bare
# intermediate expressions that had no effect were dropped.
dw = wineX.shape[1]
hiddens_wine = [(width,)*depth for depth in [1,2,3] for width in [dw,dw//2,round(dw/4),dw*2]]
hiddens_credit, hiddens_wine

In [None]:
# Grid-search the wine pipeline over params_wine to find the best fit.
# Results are written under ./output with the 'ANN' / 'wine' tags.
wine_clf = basicResults_wine(
    pipeW,
    wine_trgX, wine_trgY,
    wine_tstX, wine_tstY,
    params_wine,
    clf_type='ANN',
    dataset='wine',
)
 




In [None]:
# Grid-search the credit pipeline over params_credit to find the best fit.
# Results are written under ./output with the 'ANN' / 'credit' tags.
credit_clf = basicResults_credit(
    pipeC,
    credit_trgX, credit_trgY,
    credit_tstX, credit_tstY,
    params_credit,
    clf_type='ANN',
    dataset='credit',
)

In [None]:
# Final parameters: the best combination found by each grid search.
wine_final_params, credit_final_params = wine_clf.best_params_, credit_clf.best_params_

In [None]:
# Show the selected parameters for both datasets (wine first, then credit).
print(wine_final_params, credit_final_params )

In [None]:
# "OF" parameter sets: copies of the final parameters with the MLP's alpha
# forced to 0. The final-parameter dicts themselves are left untouched.
# NOTE(review): "OF" presumably stands for overfit (no L2 penalty) - confirm.
credit_OF_params = dict(credit_final_params, MLP__alpha=0)
wine_OF_params = dict(wine_final_params, MLP__alpha=0)

# Hard-coded parameter sets kept for reference (presumably from an earlier run - confirm):
#wine_final_params = {'MLP__hidden_layer_sizes': (500,), 'MLP__activation': 'logistic', 'MLP__alpha': 10.0}
#credit_final_params ={'MLP__hidden_layer_sizes': (28, 28, 28), 'MLP__activation': 'logistic', 'MLP__alpha': 0.0031622776601683794}



In [None]:
# Timing Curve - wine
# Configure the wine pipeline with its final parameters, switch early stopping
# off, then time fit/predict on the full wine data.
pipeW.set_params(**wine_final_params).set_params(**{'MLP__early_stopping':False})
makeTimingCurve(wineX,wineY,pipeW,'ANN','wine')

In [None]:
# Timing Curve - credit
# Configure the credit pipeline with its final parameters, switch early stopping
# off, then time fit/predict on the full credit data.
pipeC.set_params(**credit_final_params).set_params(**{'MLP__early_stopping':False})
makeTimingCurve(creditX,creditY,pipeC,'ANN','credit')

In [None]:
# Learning Curve - wine
# Sweep MLP__max_iter over the powers of two 1..2048 and then 2100..3000 in
# steps of 100, using the final wine parameters with early stopping off.
pipeW.set_params(**wine_final_params).set_params(**{'MLP__early_stopping':False})
iterationLC_wine(
    pipeW,
    wine_trgX, wine_trgY,
    wine_tstX, wine_tstY,
    {'MLP__max_iter':[2**x for x in range(12)]+list(range(2100,3001,100))},
    'ANN',
    'wine',
)

In [None]:
# Learning Curve - credit
# Sweep MLP__max_iter over the powers of two 1..2048 and then 2100..3000 in
# steps of 100, using the final credit parameters with early stopping off.
pipeC.set_params(**credit_final_params).set_params(**{'MLP__early_stopping':False})
iterationLC_credit(
    pipeC,
    credit_trgX, credit_trgY,
    credit_tstX, credit_tstY,
    {'MLP__max_iter':[2**x for x in range(12)]+list(range(2100,3001,100))},
    'ANN',
    'credit',
)


In [None]:
# Learning Curve of OF Params - wine
# Same max_iter sweep as for the final parameters, but with the alpha=0
# parameter set; outputs are tagged 'ANN_OF'.
pipeW.set_params(**wine_OF_params).set_params(**{'MLP__early_stopping':False})
iterationLC_wine(
    pipeW,
    wine_trgX, wine_trgY,
    wine_tstX, wine_tstY,
    {'MLP__max_iter':[2**x for x in range(12)]+list(range(2100,3001,100))},
    'ANN_OF',
    'wine',
)



In [None]:
# Learning Curve of OF params - credit
# Same max_iter sweep as for the final parameters, but with the alpha=0
# parameter set; outputs are tagged 'ANN_OF'.
pipeC.set_params(**credit_OF_params).set_params(**{'MLP__early_stopping':False})
iterationLC_credit(
    pipeC,
    credit_trgX, credit_trgY,
    credit_tstX, credit_tstY,
    {'MLP__max_iter':[2**x for x in range(12)]+list(range(2100,3001,100))},
    'ANN_OF',
    'credit',
)
