In [12]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split,KFold,RepeatedKFold
from sklearn.metrics import accuracy_score,log_loss
from sklearn.neural_network import multilayer_perceptron,MLPClassifier
from sklearn.model_selection import GridSearchCV,PredefinedSplit
from sklearn.feature_selection import SelectKBest,chi2,SelectFromModel
from sklearn.svm import SVC

In [2]:
warnings.filterwarnings('ignore')
np.random.seed(2)
pd.set_option('display.max_colwidth',100)

<b> Q1 [3.5pts]</b>: <br/>Implement the logistic regression (i.e. 1 layer neural network with a
single sigmoidal output) algorithm yourself in Python, use adaptive learning rate
and momentum for training. Train and test 10 times, each time, start from different
random initial weights and use a random subset of 80% of the training data and also
start with a different initial learning rate (e.g. 0.0001, 0.005, 0.001, 0.01 etc.) and
momentum (0.9, 0.95, 0.99). Report the total training and test errors for each of the
10 runs. Report also the initial learning rate and the momentum you used.
Note that you need to replace the 7s in the target column with 0s. 

In [3]:
data_t=pd.read_csv("optdigits.tra",header=None)
data_t=data_t[((data_t.iloc[:,-1]==1) | (data_t.iloc[:,-1]==7))]
data_t.replace({7:0},inplace=True)
Dimensions=64
models=[]
data_p=[]
kFold=KFold(n_splits=10,shuffle=True,random_state=2)
for train_index,test_index in kFold.split(data_t):
    data_p.append((data_t.iloc[train_index,:-1].values,data_t.iloc[train_index,-1].values,data_t.iloc[test_index,:-1].values,data_t.iloc[test_index,-1].values))

In [4]:
class Logistic_Regression:
    def __init__(self,lr_rate,momentum,dimensions,p,q):
        self.lr_rate=lr_rate
        self.momentum=momentum
        self.dimensions=dimensions
        self.weights=np.random.uniform(low=0,high=0.0005,size=(dimensions,1))
        self.p=p
        self.q=q
    def Print_Model_Params(self,i):
        print("*"*70)
        print("Run:{0}\nInitial learning rate:{1}\nInitial momentum:{2}".format(i,self.lr_rate,self.momentum))
    def sigmoid_function(self,x):
        return 1/(1+np.exp(-x))
    def cross_entropy(self,y_true,y_pred):
        return log_loss(y_true,y_pred)
    def train(self,x_train,y_train,n_iter=50):
        err=[]
        converged=False
        while converged==False and n_iter!=0:
            error=0
            n_iter-=1
            predictions=[]
            temp=np.zeros(shape=(self.dimensions))
            d_weights=np.zeros(shape=(self.dimensions))
            for i in range(x_train.shape[0]):
                O=0
                for j in range(x_train.shape[1]):
                    O+=(self.weights[j]*x_train[i,j])
                y_pred=self.sigmoid_function(O)
                for j in range(x_train.shape[1]):
                    d_weights[j]=(self.lr_rate*(y_train[i]-y_pred)*x_train[i,j])+(self.momentum*d_weights[j])
                predictions.append(y_train[i]-y_pred)
            isDecreasing=True
            err.append(self.cross_entropy(y_train,np.array(predictions)))
            curr_error=err[-1]
            for i in range(len(err)-1):
                if err[i]<=curr_error:
                    isDecreasing=False
                    break
            if isDecreasing:
                self.lr_rate+=self.p
            else:
                self.lr_rate-=(self.q*self.lr_rate)
            for j in range(self.dimensions):
                self.weights[j]+=(d_weights[j])
            try:
                if err[-2]==err[-1]:
                    converged=True
            except IndexError:
                continue
        self.evaluate("Training",y_train,self.predict(x_train))
    def predict(self,x):
        y_pred=self.sigmoid_function(np.sum(x*self.weights.T,axis=1))
        y_pred=np.array(y_pred>0.5,dtype=np.int16)
        return y_pred
    def evaluate(self,string,y_true,y_pred):
        print("{0} error rate for this run :{1}".format(string,1-accuracy_score(y_true,y_pred)))

In [5]:
for i in range(10):
    lr_rate=np.random.uniform(low=0.0001,high=0.001)
    momentum=np.random.uniform(low=0.9,high=0.95)
    model=Logistic_Regression(lr_rate,momentum,Dimensions,0.0001,0.0002)
    x_train,y_train,x_test,y_test=data_p[i]
    model.Print_Model_Params(i)
    model.train(x_train,y_train,10)
    models.append(model)
    model.evaluate("Test",y_test,model.predict(x_test))

**********************************************************************
Run:0
Initial learning rate:0.0004923954119278034
Initial momentum:0.9012963115913946
Training error rate for this run :0.015759312320916874
Test error rate for this run :0.012820512820512775
**********************************************************************
Run:1
Initial learning rate:0.000420961853010483
Initial momentum:0.90228394826227
Training error rate for this run :0.017191977077363862
Test error rate for this run :0.0
**********************************************************************
Run:2
Initial learning rate:0.0002934319457176257
Initial momentum:0.9142801866955451
Training error rate for this run :0.015759312320916874
Test error rate for this run :0.012820512820512775
**********************************************************************
Run:3
Initial learning rate:0.0009250431507620943
Initial momentum:0.9351132120409862
Training error rate for this run :0.012893982808022897
Test error rate for

<b>Q2 [2pts]:</b><br/> Using the 10 runs in Q1, for each feature compute feature importance as
the average of the absolute value of the logistic regression model weight connected
to that specific feature.<br/>
Eliminate 10%, 25%, 50% of the least important features and train and test again
with the same training instances (but with the features you selected) as in Q1.
Report and training and test errors for each of the 10 runs and three different levels
of feature selection.<br/><br/>
Did feature selection help?<br/>
Yes,having irrelevant features in your data can decrease the accuracy of the models and make your model learn based on irrelevant features.<br/>

Which features were eliminated for 10% elimination and is there a reason why they
were eliminated?<br/>
Features having more number of indices zeros are eliminate becouse having a lot of zeros does not necessarily mean you have zero inflation.


In [6]:
Average_weights=[]
for model in models:
    Average_weights.append(np.abs(model.weights))
Average_weights=np.array(Average_weights)
Average_weights=Average_weights.sum(axis=0)

In [7]:
Eliminating_ft_10=Average_weights.argsort(axis = 0)[:int(0.9*Dimensions)]
Eliminating_ft_25=Average_weights.argsort(axis = 0)[:int(0.75*Dimensions)]
Eliminating_ft_50=Average_weights.argsort(axis = 0)[:int(0.5*Dimensions)]

In [8]:
for i in range(10):
    lr_rate=np.random.uniform(low=0.0001,high=0.001)
    momentum=np.random.uniform(low=0.9,high=0.99)
    model=Logistic_Regression(lr_rate,momentum,Eliminating_ft_10.shape[0],0.0001,0.0002)
    x_train,y_train,x_test,y_test=data_p[i]
    model.Print_Model_Params(i)
    model.train(x_train[:,Eliminating_ft_10].reshape(x_train.shape[0],Eliminating_ft_10.shape[0]),y_train,10)
    model.evaluate("Test",y_test,model.predict(x_test[:,Eliminating_ft_10].reshape(x_test.shape[0],Eliminating_ft_10.shape[0])))

**********************************************************************
Run:0
Initial learning rate:0.0008699886819074396
Initial momentum:0.9180176709412956
Training error rate for this run :0.3194842406876791
Test error rate for this run :0.3589743589743589
**********************************************************************
Run:1
Initial learning rate:0.0008310286747364229
Initial momentum:0.9042416709506133
Training error rate for this run :0.2664756446991404
Test error rate for this run :0.2435897435897436
**********************************************************************
Run:2
Initial learning rate:0.0009318874924911064
Initial momentum:0.9488630617509457
Training error rate for this run :0.33954154727793695
Test error rate for this run :0.2948717948717948
**********************************************************************
Run:3
Initial learning rate:0.00034884516528538263
Initial momentum:0.9648582591187477
Training error rate for this run :0.26361031518624645
Test error

In [9]:
for i in range(10):
    lr_rate=np.random.uniform(low=0.0001,high=0.001)
    momentum=np.random.uniform(low=0.9,high=0.99)
    model=Logistic_Regression(lr_rate,momentum,Eliminating_ft_25.shape[0],0.0001,0.0002)
    x_train,y_train,x_test,y_test=data_p[i]
    model.Print_Model_Params(i)
    model.train(x_train[:,Eliminating_ft_25].reshape(x_train.shape[0],Eliminating_ft_25.shape[0]),y_train,10)
    model.evaluate("Test",y_test,model.predict(x_test[:,Eliminating_ft_25].reshape(x_test.shape[0],Eliminating_ft_25.shape[0])))

**********************************************************************
Run:0
Initial learning rate:0.0007468674923132592
Initial momentum:0.9101830306927032
Training error rate for this run :0.47851002865329517
Test error rate for this run :0.5
**********************************************************************
Run:1
Initial learning rate:0.0008951181896967383
Initial momentum:0.9762157598538008
Training error rate for this run :0.42836676217765046
Test error rate for this run :0.4358974358974359
**********************************************************************
Run:2
Initial learning rate:0.0003913870158216833
Initial momentum:0.912420232790289
Training error rate for this run :0.4842406876790831
Test error rate for this run :0.42307692307692313
**********************************************************************
Run:3
Initial learning rate:0.00010884087127594955
Initial momentum:0.9037609004038181
Training error rate for this run :0.21060171919770776
Test error rate for this

In [10]:
for i in range(10):
    lr_rate=np.random.uniform(low=0.0001,high=0.001)
    momentum=np.random.uniform(low=0.9,high=0.99)
    model=Logistic_Regression(lr_rate,momentum,Eliminating_ft_50.shape[0],0.0001,0.0002)
    x_train,y_train,x_test,y_test=data_p[i]
    model.Print_Model_Params(i)
    model.train(x_train[:,Eliminating_ft_50].reshape(x_train.shape[0],Eliminating_ft_50.shape[0]),y_train,10)
    model.evaluate("Test",y_test,model.predict(x_test[:,Eliminating_ft_50].reshape(x_test.shape[0],Eliminating_ft_50.shape[0])))

**********************************************************************
Run:0
Initial learning rate:0.0009303725582888083
Initial momentum:0.9170287747345163
Training error rate for this run :0.498567335243553
Test error rate for this run :0.46153846153846156
**********************************************************************
Run:1
Initial learning rate:0.0005293172721071327
Initial momentum:0.9121856035892043
Training error rate for this run :0.44269340974212035
Test error rate for this run :0.47435897435897434
**********************************************************************
Run:2
Initial learning rate:0.0005286262664736112
Initial momentum:0.9190968524630521
Training error rate for this run :0.4455587392550143
Test error rate for this run :0.5256410256410257
**********************************************************************
Run:3
Initial learning rate:0.000544734367997459
Initial momentum:0.9425738981319378
Training error rate for this run :0.33810888252148996
Test error 

<b>Q3 [2pts]:</b> <br/> Use the scikit-learn neural network implementation to train a neural
network and test it using the same instances as in Q1. Increase the test error as
much as you can through selection of:<br/>
-different number of hidden layers and units,<br/>
-L1 or L2 regularization/weight decay,<br/>
-different optimization algorithms,<br/>
-feature selection, etc.

In [11]:
params={
    'hidden_layer_sizes':[(10,20),(50,30),(100,10,2),(30,10,2)],
    'solver':['adam','sgd'],
    'alpha':[0.0001,0.00001,0.001,0.1]
}
sk_models=[]
w_parameters=[]
scores=[]
k_best=[]
tree_selection=[]
for i,(x_train,y_train,x_test,y_test) in enumerate(data_p):
    print("\nRun {0}".format(i))
    model=GridSearchCV(MLPClassifier(),param_grid=params,n_jobs=-1,cv=RepeatedKFold(n_splits=2,n_repeats=1,random_state=2),verbose=3,scoring='neg_log_loss')
    model.fit(x_train,y_train)
    w_t_error=model.cv_results_['mean_test_score']
    w_parameters.append(model.cv_results_['params'][np.argmin(w_t_error)])
    test_model=MLPClassifier(**w_parameters[-1])
    test_model.fit(x_train,y_train)
    w_t_error=model.cv_results_['mean_test_score']
    
    scores.append(1-accuracy_score(y_test,test_model.predict(x_test)))
    selector=SelectKBest(chi2,k=45)
    x_train_=selector.fit_transform(x_train,y_train)
    x_test_=selector.transform(x_test)
    test_model.fit(x_train_,y_train)
    k_best.append(1-accuracy_score(y_test,test_model.predict(x_test_)))
    
    selector=SelectFromModel(SVC(kernel='linear'))
    x_train_=selector.fit_transform(x_train,y_train)
    x_test_=selector.transform(x_test)
    test_model.fit(x_train_,y_train)
    tree_selection.append(1-accuracy_score(y_test,test_model.predict(x_test_)))
    
    sk_models.append(model)
scores=np.array(scores)
w_parameters=np.array(w_parameters)
df=pd.DataFrame({"Test scores":scores,"params":w_parameters,"Test error for K Best feature selection":k_best,"Test error for Tree based selection":tree_selection})
print(df)




Run 0
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    5.1s finished



Run 1
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    3.5s finished



Run 2
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    4.0s finished



Run 3
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    4.2s finished



Run 4
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    4.0s finished



Run 5
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    4.0s finished



Run 6
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    4.7s finished



Run 7
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    3.9s finished



Run 8
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    4.0s finished



Run 9
Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    4.2s finished


   Test scores  \
0     0.000000   
1     0.500000   
2     0.000000   
3     0.000000   
4     0.012821   
5     0.025641   
6     0.000000   
7     0.000000   
8     0.012987   
9     0.000000   

                                                                    params  \
0    {'alpha': 0.001, 'hidden_layer_sizes': (100, 10, 2), 'solver': 'sgd'}   
1  {'alpha': 0.0001, 'hidden_layer_sizes': (100, 10, 2), 'solver': 'adam'}   
2     {'alpha': 1e-05, 'hidden_layer_sizes': (30, 10, 2), 'solver': 'sgd'}   
3     {'alpha': 0.001, 'hidden_layer_sizes': (30, 10, 2), 'solver': 'sgd'}   
4    {'alpha': 1e-05, 'hidden_layer_sizes': (30, 10, 2), 'solver': 'adam'}   
5    {'alpha': 0.0001, 'hidden_layer_sizes': (30, 10, 2), 'solver': 'sgd'}   
6   {'alpha': 0.001, 'hidden_layer_sizes': (100, 10, 2), 'solver': 'adam'}   
7      {'alpha': 0.1, 'hidden_layer_sizes': (100, 10, 2), 'solver': 'sgd'}   
8   {'alpha': 0.0001, 'hidden_layer_sizes': (30, 10, 2), 'solver': 'adam'}   
9   {'alpha': 0.0001,