In [None]:
# bring in python file
# %load helper.py

"""
Originally Created on Fri Jan 20 13:55:38 2017
@author: JTay
source = https://github.com/JonathanTay/CS-7641-assignment-1

Altered by Jamie Andrews
February 2019
CS7641 Assignment 1

"""
import numpy as np
from time import clock
import sklearn.model_selection as ms
import pandas as pd
from collections import defaultdict
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.utils import compute_sample_weight
from sklearn.tree import DecisionTreeClassifier as dtclf


def balanced_accuracy(truth,pred):
    """Return the accuracy of ``pred`` against ``truth`` using balanced sample weights.

    The per-sample weights come from ``compute_sample_weight('balanced', truth)``
    and are handed to ``accuracy_score`` as ``sample_weight``.
    """
    balanced_weights = compute_sample_weight('balanced', truth)
    weighted_accuracy = accuracy_score(truth, pred, sample_weight=balanced_weights)
    return weighted_accuracy


# Scorer built from balanced_accuracy; passed as ``scoring=`` by the helpers below.
scorer = make_scorer(balanced_accuracy)
    
def _basic_results(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,n_folds,fixed_sizes):
    """Grid-search ``clfObj``, record its held-out score and save a learning curve.

    Shared implementation behind ``basicResults_wine`` and
    ``basicResults_credit``; the two wrappers differ only in ``n_folds`` and
    ``fixed_sizes``.

    Parameters
    ----------
    clfObj : estimator handed to ``GridSearchCV``.
    trgX, trgY : training features / labels.
    tstX, tstY : held-out features / labels, scored once with the refit best model.
    params : parameter grid for the search.
    clf_type, dataset : labels used to build the output file names.
    n_folds : number of cross-validation folds for the search and the learning curve.
    fixed_sizes : absolute training-set sizes placed before the 10%..60% sizes.

    Files written under ``./output`` (``open``/``to_csv`` do not create the directory):
      * ``{clf_type}_{dataset}_reg.csv`` - the full ``cv_results_`` table
      * ``test results.csv`` - one appended line: clf_type, dataset, test score, best params
      * ``{clf_type}_{dataset}_LC_train.csv`` and ``..._LC_test.csv`` - learning-curve scores

    Returns
    -------
    The fitted ``GridSearchCV`` object.

    Raises
    ------
    ValueError
        If ``clf_type`` or ``dataset`` is ``None``.
    """
    np.random.seed(55)
    if clf_type is None or dataset is None:
        # The previous bare ``raise`` had no active exception to re-raise and
        # therefore only produced an uninformative RuntimeError.
        raise ValueError('clf_type and dataset must both be provided; they name the output files')

    cv = ms.GridSearchCV(clfObj,n_jobs=1,param_grid=params,refit=True,verbose=10,cv=n_folds,scoring=scorer)
    cv.fit(trgX,trgY)
    regTable = pd.DataFrame(cv.cv_results_)
    regTable.to_csv('./output/{}_{}_reg.csv'.format(clf_type,dataset),index=False)

    test_score = cv.score(tstX,tstY)
    with open('./output/test results.csv','a') as f:
        f.write('{},{},{},{}\n'.format(clf_type,dataset,test_score,cv.best_params_))

    # Learning curve of the best estimator: the fixed sizes, then 10%..60% of the training rows.
    N = trgY.shape[0]
    train_sizes = list(fixed_sizes)+[int(N*x/10) for x in range(1,7)]
    curve = ms.learning_curve(cv.best_estimator_,trgX,trgY,cv=n_folds,
                              train_sizes=train_sizes,verbose=10,scoring=scorer)
    curve_train_scores = pd.DataFrame(index = curve[0],data = curve[1])
    curve_test_scores  = pd.DataFrame(index = curve[0],data = curve[2])
    curve_train_scores.to_csv('./output/{}_{}_LC_train.csv'.format(clf_type,dataset))
    curve_test_scores.to_csv('./output/{}_{}_LC_test.csv'.format(clf_type,dataset))
    return cv


def basicResults_wine(clfObj,trgX,trgY,tstX,tstY,params,clf_type=None,dataset=None):
    """Run the grid search / test score / learning curve workflow for the wine data.

    Uses 3-fold cross-validation and fixed learning-curve sizes of 100 and 250.
    Returns the fitted ``GridSearchCV``; raises ``ValueError`` when ``clf_type``
    or ``dataset`` is missing.
    """
    # cv set to 3 for wine data to ensure n is sufficiently large within each fold
    return _basic_results(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,
                          n_folds=3,fixed_sizes=[100,250])


def basicResults_credit(clfObj,trgX,trgY,tstX,tstY,params,clf_type=None,dataset=None):
    """Run the grid search / test score / learning curve workflow for the credit data.

    Uses 5-fold cross-validation and fixed learning-curve sizes of 250, 500 and 1000.
    Returns the fitted ``GridSearchCV``; raises ``ValueError`` when ``clf_type``
    or ``dataset`` is missing.
    """
    # cv = 5 for credit data because there is more data to sample from
    return _basic_results(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,
                          n_folds=5,fixed_sizes=[250,500,1000])
    
def _iteration_lc(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,n_folds):
    """Grid-search one parameter, then record train/test accuracy for each of its values.

    Shared implementation behind ``iterationLC_wine`` and
    ``iterationLC_credit``; the wrappers differ only in ``n_folds``.

    Only the FIRST entry of ``params`` is swept in the per-value loop. For
    every value the estimator is re-configured with ``set_params``, fitted
    once on the training data and scored with ``balanced_accuracy`` on both
    the training and the test set. Note that ``clfObj`` itself is mutated and
    is left holding the last value.

    Files written under ``./output`` (the directory is not created here):
      * ``ITER_base_{clf_type}_{dataset}.csv`` - the ``cv_results_`` table
      * ``ITERtestSET_{clf_type}_{dataset}.csv`` - parameter value, train acc, test acc

    Returns the fitted ``GridSearchCV`` object.
    Raises ``ValueError`` if ``clf_type`` or ``dataset`` is ``None``.
    """
    np.random.seed(55)
    if clf_type is None or dataset is None:
        # The previous bare ``raise`` had no active exception to re-raise and
        # therefore only produced an uninformative RuntimeError.
        raise ValueError('clf_type and dataset must both be provided; they name the output files')

    cv = ms.GridSearchCV(clfObj,n_jobs=1,param_grid=params,refit=True,verbose=10,cv=n_folds,scoring=scorer)
    cv.fit(trgX,trgY)
    regTable = pd.DataFrame(cv.cv_results_)
    regTable.to_csv('./output/ITER_base_{}_{}.csv'.format(clf_type,dataset),index=False)

    d = defaultdict(list)
    name,values = list(params.items())[0]
    for value in values:
        d['param_{}'.format(name)].append(value)
        clfObj.set_params(**{name:value})
        # Fit once and score both sets with the same model; the earlier second
        # fit doubled the cost and could score train and test on different fits.
        clfObj.fit(trgX,trgY)
        d['train acc'].append(balanced_accuracy(trgY,clfObj.predict(trgX)))
        d['test acc'].append(balanced_accuracy(tstY,clfObj.predict(tstX)))
        print(value)  # progress indicator
    d = pd.DataFrame(d)
    d.to_csv('./output/ITERtestSET_{}_{}.csv'.format(clf_type,dataset),index=False)
    return cv


def iterationLC_wine(clfObj,trgX,trgY,tstX,tstY,params,clf_type=None,dataset=None):
    """Per-value train/test accuracy sweep for the wine data (3-fold grid search).

    Returns the fitted ``GridSearchCV``; raises ``ValueError`` when ``clf_type``
    or ``dataset`` is missing.
    """
    return _iteration_lc(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,n_folds=3)


def iterationLC_credit(clfObj,trgX,trgY,tstX,tstY,params,clf_type=None,dataset=None):
    """Per-value train/test accuracy sweep for the credit data (5-fold grid search).

    Returns the fitted ``GridSearchCV``; raises ``ValueError`` when ``clf_type``
    or ``dataset`` is missing.
    """
    return _iteration_lc(clfObj,trgX,trgY,tstX,tstY,params,clf_type,dataset,n_folds=5)
    
def add_noise(y,frac=0.1):
    """Return a copy of ``y`` in which a random subset of labels is replaced by ``1 - label``.

    ``int(n * frac)`` distinct positions are drawn after re-seeding NumPy's
    global generator with 456, so repeated calls pick the same positions.
    ``y`` itself is left untouched.
    """
    np.random.seed(456)
    n_samples = y.shape[0]
    n_flips = int(n_samples * frac)
    flip_positions = np.random.choice(np.arange(n_samples), size=n_flips, replace=False)
    noisy = y.copy()
    noisy[flip_positions] = 1 - noisy[flip_positions]
    return noisy
    
    
def makeTimingCurve(X,Y,clf,clfName,dataset):
    """Time ``clf.fit`` and ``clf.predict`` across a range of train/test splits.

    For each ``frac`` in 0.1 .. 0.9 the data is split with ``test_size=frac``
    (so the model is fitted on the remaining ``1 - frac`` of the rows and
    predicts on ``frac`` of them). Elapsed seconds are stored per ``frac`` in
    the ``train`` and ``test`` columns and written to
    ``./output/{clfName}_{dataset}_timing.csv``.

    Returns ``None``.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is a documented
    # replacement for measuring elapsed intervals.
    from time import perf_counter

    out = defaultdict(dict)
    for frac in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
        X_train, X_test, y_train, _ = ms.train_test_split(X, Y, test_size=frac, random_state=42)
        np.random.seed(55)  # seed before starting the timer so only fit() is measured
        st = perf_counter()
        clf.fit(X_train,y_train)
        out['train'][frac]= perf_counter()-st
        st = perf_counter()
        clf.predict(X_test)
        out['test'][frac]= perf_counter()-st
        print(clfName,dataset,frac)
    out = pd.DataFrame(out)
    out.to_csv('./output/{}_{}_timing.csv'.format(clfName,dataset))
    return
        
        
    
    
    
class dtclf_pruned(dtclf):
    """Decision tree classifier that post-prunes itself against a held-out validation split.

    ``fit`` holds out 20% of the data it is given, grows the tree on the
    remaining 80% and then calls ``prune``. ``alpha`` controls how much
    validation-score loss a prune may cost (see ``prune``); ``alpha <= -1``
    disables pruning.

    NOTE(review): the constructor arguments ``min_impurity_split`` / ``presort``
    and the ``X_idx_sorted`` argument of ``fit`` are forwarded unchanged to the
    base class, so they must exist in the installed scikit-learn - confirm.
    """

    def remove_subtree(self,root):
        '''Turn ``root`` into a leaf and clear the child links of every node below it.

        Must be called while ``root`` still points at its children; otherwise
        the traversal cannot reach the descendants.
        '''
        tree = self.tree_
        visited,stack= set(),[root]
        # Iterative depth-first walk collecting root and all of its descendants.
        while stack:
            v = stack.pop()
            visited.add(v)
            left =tree.children_left[v]
            right=tree.children_right[v]
            if left >=0:
                stack.append(left)
            if right >=0:
                stack.append(right)
        for node in visited:
            tree.children_left[node] = -1
            tree.children_right[node] = -1
        return

    def prune(self):
        '''Collapse internal nodes whose removal keeps the validation score high enough.

        Candidates (nodes that have children) are visited from the highest
        node index down. A candidate is collapsed into a leaf when the score
        on ``valX``/``valY`` with that node cut off is at least
        ``(1 - alpha) * bestScore``; ``bestScore`` is then updated to the new
        score. Returns ``self``.
        '''
        C = 1-self.alpha
        if self.alpha <= -1: # Early exit
            return self
        tree = self.tree_
        bestScore = self.score(self.valX,self.valY)
        candidates = np.flatnonzero(tree.children_left>=0)
        for candidate in reversed(candidates): # Go backwards/leaves up
            if tree.children_left[candidate]==tree.children_right[candidate]: # leaf node. Ignore
                continue
            left = tree.children_left[candidate]
            right = tree.children_right[candidate]
            # Temporarily cut the candidate off and score the tree without its subtree.
            tree.children_left[candidate]=tree.children_right[candidate]=-1
            score = self.score(self.valX,self.valY)
            # Always restore the links first. remove_subtree() walks from the
            # candidate through its children; if the links were still -1 it
            # would see a leaf, leave the descendants' links in place and make
            # numNodes() count unreachable nodes.
            tree.children_left[candidate]=left
            tree.children_right[candidate]=right
            if score >= C*bestScore:
                bestScore = score
                self.remove_subtree(candidate)
        assert (self.tree_.children_left>=0).sum() == (self.tree_.children_right>=0).sum()
        return self

    def fit(self,X,Y,sample_weight=None,check_input=True, X_idx_sorted=None):
        '''Grow the tree on 80% of ``X``/``Y`` and prune it against the other 20%.

        A single stratified shuffle split (``test_size=0.2``,
        ``random_state=123``) separates the validation rows. When
        ``sample_weight`` is ``None`` every row gets weight 1. The split data
        and weights are kept on the instance (``trgX``, ``trgY``, ``trgWts``,
        ``valX``, ``valY``, ``valWts``). Returns ``self``.

        NOTE(review): rows are selected with integer index arrays, so ``X``,
        ``Y`` and ``sample_weight`` are assumed to be NumPy arrays - confirm.
        '''
        if sample_weight is None:
            sample_weight = np.ones(X.shape[0])
        self.trgX = X.copy()
        self.trgY = Y.copy()
        self.trgWts = sample_weight.copy()
        sss = ms.StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=123)
        # n_splits=1, so this loop body runs exactly once.
        for train_index, test_index in sss.split(self.trgX,self.trgY):
            self.valX = self.trgX[test_index]
            self.valY = self.trgY[test_index]
            self.trgX = self.trgX[train_index]
            self.trgY = self.trgY[train_index]
            self.valWts = sample_weight[test_index]
            self.trgWts = sample_weight[train_index]
        super().fit(self.trgX,self.trgY,self.trgWts,check_input,X_idx_sorted)
        self.prune()
        return self

    def __init__(self,
                 criterion="gini",
                 splitter="best",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None,
                 min_impurity_split=1e-7,
                 class_weight=None,
                 presort=False,
                 alpha = 0):
        '''Forward the tree hyper-parameters to the base class and store ``alpha``.'''
        super(dtclf_pruned, self).__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            class_weight=class_weight,
            random_state=random_state,
            min_impurity_split=min_impurity_split,
            presort=presort)
        self.alpha = alpha

    def numNodes(self):
        '''Return how many nodes of the fitted tree still have a left child.'''
        assert (self.tree_.children_left>=0).sum() == (self.tree_.children_right>=0).sum()
        return  (self.tree_.children_left>=0).sum()

In [None]:

"""
Originally Created on Fri Jan 20 2017
@author: JTay
source = https://github.com/JonathanTay/CS-7641-assignment-1

Altered by Jamie Andrews
February 2019
CS7641 Assignment 1

"""

import numpy as np
import sklearn.model_selection as ms
import pandas as pd
from helpers import  basicResults,makeTimingCurve,iterationLC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import euclidean_distances
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel


In [None]:
class primalSVM_RBF(BaseEstimator, ClassifierMixin):
    '''SVM-style classifier: hinge-loss SGD fitted on an RBF kernel matrix.

    Estimator layout follows
    http://scikit-learn.org/stable/developers/contributing.html

    alpha      -- regularisation strength passed to SGDClassifier
    gamma_frac -- multiplier on the median squared pairwise distance; the RBF
                  gamma is 1 / (median * gamma_frac)
    max_iter   -- iteration limit passed to SGDClassifier
    '''

    def __init__(self, alpha=1e-9,gamma_frac=0.1,max_iter=2000):
        self.alpha = alpha
        self.gamma_frac = gamma_frac
        self.max_iter = max_iter

    def fit(self, X, y):
        '''Compute the training kernel matrix and fit the SGD classifier on it.'''
        # Validate the shapes of X and y
        X, y = check_X_y(X, y)

        # Kernel width derived from the median squared distance between training rows
        median_sq_dist = np.median(euclidean_distances(X, squared=True))
        self.gamma = 1/(median_sq_dist*self.gamma_frac)
        train_kernels = rbf_kernel(X, None, self.gamma)

        # Keep what predict() needs to build kernels against the training rows
        self.X_ = X
        self.classes_ = unique_labels(y)
        self.kernels_ = train_kernels
        self.y_ = y

        sgd_settings = dict(loss='hinge', penalty='l2', alpha=self.alpha,
                            l1_ratio=0, fit_intercept=True, verbose=False,
                            average=False, learning_rate='optimal',
                            class_weight='balanced', max_iter=self.max_iter,
                            random_state=55)
        self.clf = SGDClassifier(**sgd_settings)
        self.clf.fit(self.kernels_, self.y_)

        # Return the classifier
        return self

    def predict(self, X):
        '''Predict labels from the RBF kernel between ``X`` and the stored training rows.'''
        # Make sure fit() has been called
        check_is_fitted(self, ['X_', 'y_','clf','kernels_'])
        # Input validation
        X = check_array(X)
        return self.clf.predict(rbf_kernel(X, self.X_, self.gamma))
    



In [None]:

# # GET THE DATA
# wine data
file_path ="./data/"
wine = pd.read_csv(file_path+'wines.csv', sep=",")
# credit data
file_path2 ="./data/"
credit = pd.read_csv(file_path2+'credit.csv', sep=",")

# Limit the credit data set to 9,999 randomly selected rows. The seed is fixed
# so the same subset is drawn on every run, in line with the seeded splits and
# estimators used in the rest of the notebook.
credit = credit.sample(n=9999, random_state=0)
len(credit)



In [None]:
# DIVIDE INTO TRAIN AND TEST SETS

# Features are every column except the target; the target columns are
# 'default' (credit) and 'quality' (wine). ``axis`` is passed by keyword
# because newer pandas no longer accepts it positionally in ``drop``.
#credit = pd.read_hdf('datasets.hdf','credit')
creditX = credit.drop('default',axis=1).copy().values
creditY = credit['default'].copy().values

#wine = pd.read_hdf('datasets.hdf','wine')
wineX = wine.drop('quality',axis=1).copy().values
wineY = wine['quality'].copy().values

# 70/30 stratified splits with a fixed seed
credit_trgX, credit_tstX, credit_trgY, credit_tstY = ms.train_test_split(creditX, creditY, test_size=0.3, random_state=0,stratify=creditY)
wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(wineX, wineY, test_size=0.3, random_state=0,stratify=wineY)


In [None]:
# Number of training rows per data set; used below to size the SGD iteration budgets.
N_credit, N_wine = credit_trgX.shape[0], wine_trgX.shape[0]

# Regularisation strengths 10**-1 down to 10**-4 in steps of half a decade.
alphas = [10**-exponent for exponent in np.arange(1, 4.01, 0.5)]



In [None]:

### Linear SVM ###

import warnings
with warnings.catch_warnings():
    # DeprecationWarnings are silenced only for the code inside this block.
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # Both pipelines standardise the features and then fit a linear SVM via
    # SGD (hinge loss, L2 penalty, balanced class weights). The optional
    # SelectFromModel(RandomForestClassifier) "Cull1".."Cull4" feature-selection
    # steps are deliberately not part of either pipeline.
    pipeW = Pipeline(steps=[
        ('Scale', StandardScaler()),
        ('SVM', SGDClassifier(loss='hinge', l1_ratio=0, penalty='l2',
                              class_weight='balanced', random_state=55)),
    ])
    pipeC = Pipeline(steps=[
        ('Scale', StandardScaler()),
        ('SVM', SGDClassifier(loss='hinge', l1_ratio=0, penalty='l2',
                              class_weight='balanced', random_state=55)),
    ])

    # Search over alpha; max_iter is a single value derived from the training-set size.
    params_credit = dict(SVM__alpha=alphas, SVM__max_iter=[int((1e6/N_credit)/.8)+1])
    params_wine = dict(SVM__alpha=alphas, SVM__max_iter=[int((1e6/N_wine)/.8)+1])

    wine_clf = basicResults_wine(
        pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
        params_wine, 'SVM_Lin', 'wine')
    credit_clf = basicResults_credit(
        pipeC, credit_trgX, credit_trgY, credit_tstX, credit_tstY,
        params_credit, 'SVM_Lin', 'credit')
    


In [None]:
# LINEAR SVM CONTINUED...

# Best hyper-parameters found by the two linear-SVM grid searches
wine_final_params = wine_clf.best_params_
credit_final_params = credit_clf.best_params_

# One dict per line: wine first, then credit
print(wine_final_params, credit_final_params, sep='\n')



In [None]:
# LINEAR SVM CONTINUED...

# Hand-picked "OF" parameter sets, chosen after looking at the final params
# (NOTE(review): "OF" presumably stands for over-fit - confirm).
# Final params from an earlier run, kept for reference:
#   wine_final_params   = {'SVM__alpha': 0.01, 'SVM__max_iter': 110}
#   credit_final_params = {'SVM__alpha': 0.0031622776601683794, 'SVM__max_iter': 60}
wine_OF_params = dict(SVM__max_iter=275, SVM__alpha=0.001)
credit_OF_params = dict(SVM__max_iter=179, SVM__alpha=0.1)
print(credit_OF_params, wine_OF_params)

In [None]:
# Timing Curve - Wine
# Configure the wine pipeline with its best grid-search params, then time
# fit/predict over the full wine data set.
pipeW.set_params(**wine_final_params)
makeTimingCurve(wineX, wineY, pipeW, 'SVM_Lin', 'wine')


In [None]:
# Timing Curve - Credit
# Configure the credit pipeline with its best grid-search params, then time
# fit/predict over the full credit data set.
pipeC.set_params(**credit_final_params)
makeTimingCurve(creditX, creditY, pipeC, 'SVM_Lin', 'credit')


In [None]:
# Learning Curve - Wine
# Uses the wine-specific helper (3-fold CV), like every other wine iteration
# curve in this notebook, rather than the generic iterationLC from helpers.
pipeW.set_params(**wine_final_params)
iterationLC_wine(pipeW,wine_trgX,wine_trgY,wine_tstX,wine_tstY,
                 {'SVM__max_iter':np.arange(1,500,50)},'SVM_Lin','wine')


In [None]:
# Learning Curve - Credit
# Uses the credit-specific helper (5-fold CV), like every other credit
# iteration curve in this notebook, rather than the generic iterationLC from helpers.
pipeC.set_params(**credit_final_params)
iterationLC_credit(pipeC,credit_trgX,credit_trgY,credit_tstX,credit_tstY,
                   {'SVM__max_iter':np.arange(1,300,50)},'SVM_Lin','credit')


In [None]:
# Wine: iteration curve for the hand-picked "OF" parameter set
pipeW.set_params(**wine_OF_params)
iterationLC_wine(
    pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
    {'SVM__max_iter': np.arange(1, 500, 50)},
    'SVM_LinOF', 'wine')


In [None]:
# Credit: iteration curve for the hand-picked "OF" parameter set
pipeC.set_params(**credit_OF_params)
iterationLC_credit(
    pipeC, credit_trgX, credit_trgY, credit_tstX, credit_tstY,
    {'SVM__max_iter': np.arange(1, 300, 50)},
    'SVM_LinOF', 'credit')


### END LINEAR SVM ###

In [None]:
### BEGIN Radial Basis Function SVM  ###

import warnings
with warnings.catch_warnings():
    # The filter only lasts while this block is open, so the grid searches
    # have to run inside it. Previously the block closed right after this
    # call and the filter never applied to any of the work below.
    warnings.filterwarnings("ignore",category=DeprecationWarning)

    # wine has the smaller sample size, so it searches larger gamma_frac values.
    # NOTE: primalSVM_RBF uses gamma = 1/(gamma_frac * median squared distance),
    # so a larger gamma_frac means a smaller RBF gamma.
    gamma_fracsW = np.arange(0.2,2.1,0.4)
    gamma_fracsC = np.arange(0.05,1.01,0.3)

    # Standardise, then fit the kernelised SVM; no feature selection is needed.
    pipeW = Pipeline([('Scale',StandardScaler()),
                      ('SVM',primalSVM_RBF())])

    pipeC = Pipeline([('Scale',StandardScaler()),
                      ('SVM',primalSVM_RBF())])

    # Search over alpha and gamma_frac; max_iter is a single value derived
    # from the training-set size.
    params_credit = {'SVM__alpha':alphas,'SVM__max_iter':[int((1e6/N_credit)/.8)+1],'SVM__gamma_frac':gamma_fracsC}
    params_wine = {'SVM__alpha':alphas,'SVM__max_iter':[int((1e6/N_wine)/.8)+1],'SVM__gamma_frac':gamma_fracsW}

    wine_clf = basicResults_wine(pipeW,wine_trgX,wine_trgY,wine_tstX,wine_tstY,params_wine,'SVM_RBF','wine')
    credit_clf = basicResults_credit(pipeC,credit_trgX,credit_trgY,credit_tstX,credit_tstY,params_credit,'SVM_RBF','credit')




In [None]:
# Best hyper-parameters found by the two RBF-SVM grid searches
wine_final_params = wine_clf.best_params_      # wine
credit_final_params = credit_clf.best_params_  # credit

# One dict per line: wine first, then credit
print(wine_final_params, credit_final_params, sep='\n')


In [None]:
# "OF" parameter sets: a copy of each data set's final params with alpha
# overridden by a hand-picked value.
# wine (alternative value noted previously: 0.00031622776601683794)
wine_OF_params = dict(wine_final_params, SVM__alpha=0.001)

# credit
credit_OF_params = dict(credit_final_params, SVM__alpha=0.01)

In [None]:
# Timing Curve - wine
# Configure the wine RBF pipeline with its best grid-search params, then time
# fit/predict over the full wine data set.
pipeW.set_params(**wine_final_params)
makeTimingCurve(wineX, wineY, pipeW, 'SVM_RBF', 'wine')


In [None]:
# Timing Curve - credit
# Configure the credit RBF pipeline with its best grid-search params, then
# time fit/predict over the full credit data set.
pipeC.set_params(**credit_final_params)
makeTimingCurve(creditX, creditY, pipeC, 'SVM_RBF', 'credit')



In [None]:
# Learning Curve - wine
# Sweep max_iter for the wine RBF pipeline configured with its final params.
pipeW.set_params(**wine_final_params)
iterationLC_wine(
    pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
    {'SVM__max_iter': np.arange(1, 500, 50)},
    'SVM_RBF', 'wine')
           

In [None]:
# Learning Curve - Credit
# Sweep max_iter for the credit RBF pipeline configured with its final params.
pipeC.set_params(**credit_final_params)
iterationLC_credit(
    pipeC, credit_trgX, credit_trgY, credit_tstX, credit_tstY,
    {'SVM__max_iter': np.arange(1, 301, 50)},
    'SVM_RBF', 'credit')



In [None]:
# Wine: iteration curve for the "OF" parameter set
pipeW.set_params(**wine_OF_params)
iterationLC_wine(
    pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
    {'SVM__max_iter': np.arange(1, 501, 50)},
    'SVM_RBF_OF', 'wine')


In [None]:
# Credit: iteration curve for the "OF" parameter set
pipeC.set_params(**credit_OF_params)
iterationLC_credit(
    pipeC, credit_trgX, credit_trgY, credit_tstX, credit_tstY,
    {'SVM__max_iter': np.arange(1, 301, 50)},
    'SVM_RBF_OF', 'credit')

### END RBF SVM ####