In [62]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

In [309]:
# Seed reused by every split and classifier in this notebook.
random_state = 8

# Load the EEG eye-state table and show its dimensions and contents.
data = pd.read_csv('EEG_Eye_State.csv')
print(data.shape)
print(data)

# Features are every column except the label; the label column becomes a
# float64 array.
X = data.drop(columns='Eye_detection')
y = data['Eye_detection'].to_numpy(dtype=np.float64, copy=True)
print(y)


(14980, 15)
           AF3       F7       F3      FC5       T7       P7       O1       O2  \
0      4329.23  4009.23  4289.23  4148.21  4350.26  4586.15  4096.92  4641.03   
1      4324.62  4004.62  4293.85  4148.72  4342.05  4586.67  4097.44  4638.97   
2      4327.69  4006.67  4295.38  4156.41  4336.92  4583.59  4096.92  4630.26   
3      4328.72  4011.79  4296.41  4155.90  4343.59  4582.56  4097.44  4630.77   
4      4326.15  4011.79  4292.31  4151.28  4347.69  4586.67  4095.90  4627.69   
...        ...      ...      ...      ...      ...      ...      ...      ...   
14975  4281.03  3990.26  4245.64  4116.92  4333.85  4614.36  4074.87  4625.64   
14976  4276.92  3991.79  4245.13  4110.77  4332.82  4615.38  4073.33  4621.54   
14977  4277.44  3990.77  4246.67  4113.85  4333.33  4615.38  4072.82  4623.59   
14978  4284.62  3991.79  4251.28  4122.05  4334.36  4616.41  4080.51  4628.72   
14979  4287.69  3997.44  4260.00  4121.03  4333.33  4616.41  4088.72  4638.46   

            P8 

In [154]:
# Baseline: train and evaluate an MLP on the raw (unscaled) features.
# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)
# Train a multi-layer perceptron with two hidden layers of 100 units each.
clf0 = MLPClassifier(hidden_layer_sizes=(100,100),random_state=random_state, verbose=False, max_iter=1000)
clf0.fit(X_train, y_train)
# Accuracy on the held-out split. accuracy_score takes (y_true, y_pred); the
# value is the same either way, but the order now matches the API.
y_pred = clf0.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy on raw : ', acc*100)


Accuracy on raw :  57.510013351134845


In [155]:
# Feature scaling: standardize every feature column.
# NOTE(review): the scaling statistics are computed over all rows, including
# the ones that end up in the test split, so test data influences the
# preprocessing. Consider fitting the scaler on the training split only.
# Kept as-is here because later cells consume `scaled_X` directly.
scaled_X = preprocessing.scale(X)
# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size = 0.2, random_state = random_state)
# Train a multi-layer perceptron with two hidden layers of 100 units each.
clf0 = MLPClassifier(hidden_layer_sizes=(100,100),random_state=random_state, verbose=False, max_iter=1000)
clf0.fit(X_train, y_train)
# Accuracy on the held-out split. accuracy_score takes (y_true, y_pred); the
# value is the same either way, but the order now matches the API.
y_pred = clf0.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy on raw scaled : ', acc*100)


Accuracy on raw scaled :  90.82109479305741


In [296]:
################ split data #######################
# train: ~0.23 total
# pool: ~0.50 total
# valid: ~0.07 total
# test: 0.20 total
# ################################################# 

def split_data(scaled_X, y, noise_probability = 0.0, test_size = 0.2, valid_size = 0.0834, pool_size = 0.6873, seed = None):
    """Split the data into train / pool / valid / test and inject label noise.

    Each label row is expanded to three columns:
    column 0 is the (possibly flipped) label, column 1 is 1.0 where the
    label was flipped by this function and 0.0 otherwise, and column 2 is
    initialised to 0 (other cells accumulate noise log-odds there).

    Labels are flipped only in the train and pool splits; the validation and
    test splits keep their original labels.

    Args:
        scaled_X: feature matrix, one row per sample.
        y: array of binary labels (0/1), one per sample.
        noise_probability: probability with which each train/pool label is
            flipped.
        test_size: fraction of all samples held out as the test split.
        valid_size: fraction of the remaining samples held out as the
            validation split.
        pool_size: fraction of what is left after the validation split that
            goes to the pool; the rest is the initial train split.
        seed: seed for the splits and the noise mask. Defaults to the
            notebook-level `random_state`.

    With the default fractions the splits come out at roughly
    0.23 train / 0.50 pool / 0.07 valid / 0.20 test of the total.

    Returns:
        X_train, X_pool, X_valid, X_test, y_train, y_pool, y_valid, y_test
    """
    if seed is None:
        seed = random_state

    noise_gt = np.zeros_like(y)
    noise_proba = np.zeros_like(y)
    y_with_noise = np.stack((y,noise_gt,noise_proba),axis=1)

    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y_with_noise, test_size = test_size, random_state = seed)

    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = valid_size, random_state = seed)

    # Draw the flip mask from a local legacy RandomState: it yields the same
    # values as seeding the global NumPy RNG, without resetting global state.
    rng = np.random.RandomState(seed)
    # Adding noise to train and pool; validation and test splits stay clean.
    y_train_noise = rng.random_sample(y_train.shape[0]) < noise_probability
    # |flip - label| toggles the 0/1 label exactly where flip is True.
    y_train[:,0] = np.abs(y_train_noise - y_train[:,0])
    y_train[:,1] = y_train_noise.astype(np.float32)

    X_train, X_pool, y_train, y_pool = train_test_split(X_train, y_train, test_size = pool_size, random_state = seed)

    print ("---------")
    print(f"total: {y.size}\ntrain: {y_train.shape} -> {y_train.shape[0]/y.size:.2f}x \npool: {y_pool.shape} -> {y_pool.shape[0]/y.size:.2f}x")
    print(f"valid: {y_valid.shape[0]} -> {y_valid.shape[0]/y.size:.2f}x \ntest: {y_test.shape[0]} -> {y_test.shape[0]/y.size:.2f}x")
    print ("---------")

    return X_train, X_pool, X_valid, X_test, y_train, y_pool, y_valid, y_test


In [295]:

# Build the noisy splits here so this summary also works on a fresh kernel run
# from top to bottom; before this point y_train is the 1-D label vector of the
# baseline cells and has no noise-flag column.
X_train, X_pool, X_valid, X_test, y_train, y_pool, y_valid, y_test = split_data(scaled_X, y, noise_probability=0.3)
# Number of flipped labels in the initial train split, and the split's shape.
print(np.sum(y_train[:,1]))
print(y_train.shape)


1005.0
(3434, 3)


In [293]:
# Create the train/pool/valid/test splits; each train/pool label is flipped
# with probability 0.3.
(X_train, X_pool, X_valid, X_test,
 y_train, y_pool, y_valid, y_test) = split_data(scaled_X, y, noise_probability=0.3)

---------
total: 14980
train: (3434, 3) -> 0.23x 
pool: (7550, 3) -> 0.50x
valid: 1000 -> 0.07x 
test: 2996 -> 0.20x
---------


In [297]:


def find_most_ambigious(y_proba_pred, ambigious_amount =1, method='least_confidence') -> np.ndarray:
    """Return the row indexes of the most ambiguous predictions.

    Only two classes are supported: ambiguity is measured as the absolute
    difference between the two class probabilities, and the rows with the
    smallest difference are returned first.

    Args:
        y_proba_pred: predicted probabilities, one row per sample with the
            two class probabilities in columns 0 and 1 (array or nested list).
        ambigious_amount (int, optional): how many indexes to return.
            Defaults to 1.
        method (str, optional): selection method; only 'least_confidence'
            is implemented. Defaults to 'least_confidence'.

    Returns:
        np.ndarray: indexes of the most ambiguous rows, most ambiguous first.
        For an unknown method a message is printed and an empty integer
        array is returned, so using the result as an index selects nothing.
    """
    if method != 'least_confidence':
        print("method is not defined. Use 'least_confidence'")
        # Same type as the normal path, so callers can index with it safely.
        return np.array([], dtype=np.intp)

    # Accept nested lists as well as arrays.
    y_proba_pred = np.asarray(y_proba_pred)
    difference = np.abs(y_proba_pred[:,0]-y_proba_pred[:,1])
    return np.argsort(difference)[:ambigious_amount]

def train_one_iter_active_learning(X_train, y_train, X_pool, y_pool, X_test, y_test, model, ambigious_amount=1  , method='least_confidence'):
    """Run one active-learning step: query the pool, grow the train set, refit.

    The model scores the pool, the `ambigious_amount` most ambiguous pool
    samples are moved from the pool to the training set, the model is fitted
    on the enlarged training set and scored on the test set.

    Args:
        X_train, y_train: current training features and label rows; column 0
            of `y_train` is used as the training label.
        X_pool, y_pool: candidate samples and their label rows.
        X_test, y_test: evaluation data; column 0 of `y_test` is the label.
        model: classifier exposing `predict_proba`, `fit` and `score`.
        ambigious_amount (int, optional): number of pool samples to move.
        method (str, optional): selection method forwarded to
            `find_most_ambigious`.

    Returns:
        X_train, y_train, X_pool, y_pool, model, acc -- the updated arrays,
        the fitted model and its score on the test data.
    """
    y_proba_pred = model.predict_proba(X_pool)

    # Forward the caller's method instead of a hard-coded value.
    most_ambigious_indexes = find_most_ambigious(y_proba_pred, ambigious_amount=ambigious_amount, method=method)

    # Move the selected samples from the pool into the training set.
    X_train = np.append(X_train,X_pool[most_ambigious_indexes],axis = 0)
    y_train = np.append(y_train, y_pool[most_ambigious_indexes],axis = 0)
    X_pool = np.delete(X_pool,most_ambigious_indexes,axis=0)
    y_pool = np.delete(y_pool,most_ambigious_indexes,axis=0)

    model.fit(X_train,y_train[:,0])
    acc = model.score(X_test,y_test[:,0])

    return X_train, y_train, X_pool, y_pool, model, acc



In [298]:
def update_logodds(y_train, inlier_idx, outlier_idx, likelihood_of_noise=0.7):
    """Update the noise log-odds stored in column 2 of `y_train`.

    Rows listed in `outlier_idx` have log(p / (1 - p)) added to column 2 and
    rows listed in `inlier_idx` have the same amount subtracted, where p is
    `likelihood_of_noise`.

    Args:
        y_train: 2-D array whose column 2 holds the accumulated log-odds.
            It is modified in place.
        inlier_idx: row indexes treated as inliers.
        outlier_idx: row indexes treated as outliers.
        likelihood_of_noise (float, optional): p in the update above; must be
            strictly between 0 and 1. Defaults to 0.7.

    Returns:
        The same `y_train` array, after the in-place update.

    Raises:
        ValueError: if `likelihood_of_noise` is not strictly between 0 and 1.
    """
    if not 0.0 < likelihood_of_noise < 1.0:
        raise ValueError("likelihood_of_noise must be strictly between 0 and 1")
    log_ratio_likelihood_of_noise = np.log(likelihood_of_noise/(1-likelihood_of_noise))
    y_train[outlier_idx,2] += log_ratio_likelihood_of_noise
    y_train[inlier_idx,2] -= log_ratio_likelihood_of_noise
    return y_train

def logodds_to_y(y_train):
    """Turn labels plus noise log-odds into soft labels.

    Column 2 of `y_train` is read as log-odds and mapped through the logistic
    function to a probability `p_noise`; the result is |label - p_noise| with
    the label taken from column 0.

    Args:
        y_train: 2-D array with the label in column 0 and log-odds in column 2.

    Returns:
        1-D array with one soft label per row.
    """
    log_odds = y_train[:,2]
    # Stable logistic function: exp(x) / (1 + exp(x)) overflows to inf/inf = nan
    # for large x, whereas exp(-log(1 + exp(-x))) stays finite everywhere.
    p_noise = np.exp(-np.logaddexp(0.0, -log_odds))
    new_y_train =  np.abs(y_train[:,0] - p_noise)
    return new_y_train

[0.00615824 0.03263497 0.00615824 ... 0.3        0.3        0.3       ]
(3734,)


In [299]:
## pure active learning
(X_train, X_pool, X_valid, X_test,
 y_train, y_pool, y_valid, y_test) = split_data(scaled_X, y, noise_probability=0.3)

# The validation split is not needed separately in this experiment, so it is
# merged into the training data.
X_train = np.concatenate((X_train, X_valid), axis=0)
y_train = np.concatenate((y_train, y_valid), axis=0)
print(f"train size: {y_train.shape}")

# Initial model fitted on the starting training set (column 0 = label).
clf1 = MLPClassifier(
    verbose=0,
    hidden_layer_sizes=(100,100),
    random_state=random_state,
    max_iter=1000,
)
clf1.fit(X_train, y_train[:,0])
acc = clf1.score(X_test, y_test[:,0])
print (f"iteration -1:   accuracy = {acc:0.4f}")
print ("--")

# K active-learning rounds, each moving the 50 most ambiguous pool samples
# into the training set and refitting.
K = 6
for k in range(K):
    X_train, y_train, X_pool, y_pool, clf1, acc = train_one_iter_active_learning(
        X_train, y_train, X_pool, y_pool, X_test, y_test,
        model=clf1, ambigious_amount=50, method='least_confidence',
    )
    print(f"train size: {y_train.shape}")
    print (f"iteration {k}:   accuracy = {acc:0.14f}")
    print ("--")

print(f"train size: {y_train.shape}")

---------
total: 14980
train: (3434, 3) -> 0.23x 
pool: (7550, 3) -> 0.50x
valid: 1000 -> 0.07x 
test: 2996 -> 0.20x
---------
train size: (4434, 3)
iteration -1:   accuracy = 0.7360
--
train size: (4484, 3)
iteration 0:   accuracy = 0.73397863818425
--
train size: (4534, 3)
iteration 1:   accuracy = 0.75200267022697
--
train size: (4584, 3)
iteration 2:   accuracy = 0.72329773030708
--
train size: (4634, 3)
iteration 3:   accuracy = 0.73731642189586
--
train size: (4684, 3)
iteration 4:   accuracy = 0.72797062750334
--
train size: (4734, 3)
iteration 5:   accuracy = 0.74899866488652
--
train size: (4734, 3)


In [300]:
## Active learning with Ransac

(X_train, X_pool, X_valid, X_test,
 y_train, y_pool, y_valid, y_test) = split_data(scaled_X, y, noise_probability=0.3)

# The validation split stays separate here: it is used below to rank the
# RANSAC candidate models.
clf1 = MLPClassifier(
    verbose=0,
    hidden_layer_sizes=(100,100),
    random_state=random_state,
    max_iter=1000,
)
clf1.fit(X_train, y_train[:,0])
acc = clf1.score(X_test, y_test[:,0])
print (f"iteration -1:   accuracy = {acc:0.4f}")
print ("--")

K = 1   # active-learning rounds per outer iteration
M = 10  # RANSAC refits per outer iteration
N = 6   # outer iterations

# In total N*K active-learning rounds and N*M RANSAC refits.
for n in range(N):

    print(f"################ Outer iteration {n} ################ ")

    # --- K active-learning rounds, each adding 50 pool samples ---
    for k in range(K):
        X_train, y_train, X_pool, y_pool, clf1, acc = train_one_iter_active_learning(
            X_train, y_train, X_pool, y_pool, X_test, y_test, clf1,
            ambigious_amount=50, method='least_confidence',
        )
        print (f"AL iteration {k}:   accuracy = {acc:0.4f}")
    print(f"train size: {y_train.shape}")
    print ("--------")

    # --- M RANSAC refits: each drops a different random 5% of the train set ---
    stats_history = []
    for m in range(M):
        ransac_random_state = random_state + m  # distinct but repeatable subset per refit
        indices = np.arange(y_train.shape[0])

        (r_X_train, r_X_outlier,
         r_y_train, r_y_outlier,
         r_X_train_idx, r_X_outlier_idx) = train_test_split(
            X_train, y_train, indices,
            test_size=0.05, random_state=ransac_random_state,
        )
        clf1 = MLPClassifier(
            verbose=0,
            hidden_layer_sizes=(100,100),
            random_state=random_state,
            max_iter=1000,
        )
        clf1.fit(r_X_train, r_y_train[:,0])
        acc = clf1.score(X_valid, y_valid[:,0])
        print (f"Ransac iteration {m}:   accuracy = {acc:0.14f}")

        stats_history.append({
            "model": clf1,
            "accuracy": acc,
            'X_train_inlier_idx': r_X_train_idx,
            'X_train_outlier_idx': r_X_outlier_idx,
        })

    # Keep the refit with the best validation accuracy (the latest one on
    # ties) together with the 95% / 5% index split it was trained on.
    best = max(reversed(stats_history), key=lambda x: x["accuracy"])
    clf1 = best["model"]
    inlier_idx = best['X_train_inlier_idx']
    outlier_idx = best['X_train_outlier_idx']
    y_train = update_logodds(y_train, inlier_idx, outlier_idx)

    print ("----------------")

    acc = clf1.score(X_test, y_test[:,0])
    print(f"train size: {y_train.shape}")
    print (f"Final accuracy = {acc:0.4f}")
    print ("----------------")




---------
total: 14980
train: (3434, 3) -> 0.23x 
pool: (7550, 3) -> 0.50x
valid: 1000 -> 0.07x 
test: 2996 -> 0.20x
---------
iteration -1:   accuracy = 0.6756
--
################ Outer iteration 0 ################ 
AL iteration 0:   accuracy = 0.6729
train size: (3484, 3)
--------
Ransac iteration 0:   accuracy = 0.66100000000000
Ransac iteration 1:   accuracy = 0.65900000000000
Ransac iteration 2:   accuracy = 0.66700000000000
Ransac iteration 3:   accuracy = 0.66000000000000
Ransac iteration 4:   accuracy = 0.64500000000000
Ransac iteration 5:   accuracy = 0.63700000000000
Ransac iteration 6:   accuracy = 0.67500000000000
Ransac iteration 7:   accuracy = 0.64500000000000
Ransac iteration 8:   accuracy = 0.66500000000000
Ransac iteration 9:   accuracy = 0.66200000000000
----------------
train size: (3484, 3)
Final accuracy = 0.6836
----------------
################ Outer iteration 1 ################ 
AL iteration 0:   accuracy = 0.6766
train size: (3534, 3)
--------
Ransac iteration

In [308]:
# Compare the injected noise flags (column 1) with the accumulated noise
# log-odds (column 2) of the training labels.
y_train_2 = np.zeros_like(y_train)
print(y_train.shape)

flipped_mask = y_train[:, 1] == 1.0   # labels that were flipped on purpose
flagged_mask = y_train[:, 2] > 0      # rows whose log-odds ended up positive

total = int(flipped_mask.sum())
found = int(flagged_mask.sum())
correct = int((flipped_mask & flagged_mask).sum())

print('total:',total)
print('found:',found)
print('correct:',correct)

(3734, 3)
total: 1080
found: 9
correct: 2


In [270]:
# Display the current training label matrix. When it comes from split_data the
# columns are [label, noise flag, noise log-odds].
y_train

array([[1., 1., 0.],
       [1., 1., 0.],
       [1., 1., 0.],
       ...,
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [266]:
# Reference run: undo the injected flips (|label - noise flag| restores the
# original 0/1 label) and train on the recovered labels.
recovered_labels = np.abs(y_train[:,0]-y_train[:,1])
clf1 = MLPClassifier(
    verbose=0,
    hidden_layer_sizes=(100,100),
    random_state=random_state,
    max_iter=1000,
)
clf1.fit(X_train, recovered_labels)
acc = clf1.score(X_test, y_test[:,0])
print (f"iteration -1:   accuracy = {acc:0.4f}")
print ("--")

iteration -1:   accuracy = 0.5557
--
