In [None]:
# *********************************************************************************
# 
# Project for the course "Neural Networks in Applications"
# 
# Author: Bistakova Lenka, Brnovik Diana
# PEF MENDELU 2020
# 
# Used dataset: Student Alcohol Consumption - Portuguese language course dataset
# URL of dataset: https://data.world/data-society/student-alcohol-consumption
# 
# *********************************************************************************

In [None]:
import numpy as np
import pylab as pl
import pandas as pd
import neurolab as nl
import seaborn as sns
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from scipy import stats

In [None]:
def convert(list):
    """Translate two-bit class codes into integer class labels.

    Every element of ``list`` is compared, in order, against the codes
    [0,0], [0,1], [1,0] and [1,1]; the first code that matches yields the
    label 1, 2, 3 or 4 respectively.  An element matching none of the
    codes yields 5.

    Returns a Python list holding one integer label per input element.
    """
    # (code, label) pairs, tested in the same order as the class numbering.
    code_table = (([0, 0], 1), ([0, 1], 2), ([1, 0], 3), ([1, 1], 4))
    labels = []
    for row in list:
        bits = np.array(row)
        # The generator is lazy, so comparison stops at the first match.
        labels.append(next(
            (label for code, label in code_table if (bits == code).all()),
            5
        ))
    return labels

In [None]:
### Load data

# Read the dataset once and view it as a single NumPy matrix.
data = pd.read_csv('../data/alcohol2.csv')
data_matrix = data.to_numpy()
X = data_matrix[:, :16]  # first 16 columns are used as the input features
Y = data_matrix[:, 16]   # column 16 is used as the target class

In [None]:
### Target transform

# Classes 1--4 describe alcohol consumption (1=low, 4=high).
# Each class is encoded as a two-element binary target vector.
val_map = {1: [0,0], 2: [0,1], 3: [1,0], 4: [1,1]}

# Encoded targets as a 2-D array, one row per sample.
T = np.array([val_map[class_label] for class_label in Y])

# Apply the same encoding to the DataFrame column.
data['alcohol_consumption'] = data['alcohol_consumption'].map(val_map)

In [None]:
### Visualization of input data in curves

# For every input feature, plot how often each of its values occurs within
# each alcohol-consumption class (one curve per class, one figure per feature).
labels = ['very low','low','medium','high and very high']
for column in data:
    if column == 'alcohol_consumption':
        continue  # the target column is not plotted against itself
    feature_values = np.array(data[column])
    for class_id, class_label in zip(range(1, 5), labels):
        # Distinct values of this feature among samples of one class, with counts.
        unique, counts = np.unique(feature_values[Y == class_id], return_counts=True)
        # Plot each class directly: the per-class results can have different
        # lengths, so they must not be stacked into one (ragged) NumPy array.
        plt.plot(unique, counts, label=class_label)
    plt.legend()
    plt.xlabel(column, fontsize=18)
    plt.ylabel('Count', fontsize=16)
    plt.show()

In [None]:
### Visualization of input data in histograms

# Separate CSV used only for plotting.
# NOTE(review): assumed to contain the same column names as `data`, because
# the loop below takes column names from `data` - confirm.
dataVisualize = pd.read_csv('../data/alcohol.csv')

sns.set(style="darkgrid")
labels = ['very low','low','medium','high and very high']
for i in data:
    if i != 'alcohol_consumption' and i != 'absences' and i != 'final_grade':
        plt.figure(i)
        # countplot returns the Axes it drew on.
        ax = sns.countplot(x="alcohol_consumption", hue=i, data=dataVisualize)
        ax.set(xlabel=i, ylabel='Count')
        plt.title("Dependency of level of alcohol on {}".format(i))
        ax.set_xticklabels(labels, rotation=45)
        # The call parentheses are required; a bare `plt.show` does nothing.
        plt.show()

# different view with multiple attributes
g = sns.catplot(x="study_time", hue="sex", col="alcohol_consumption",
                data=dataVisualize, kind="count",
                height=4, aspect=.7);

In [None]:
### Pre-process data using MinMaxScaler() [0,1]

# Fit the scaler on the feature matrix, then rescale the same matrix with it.
# NOTE(review): the scaler is fitted on all samples, including those later
# used for validation and testing - confirm this is intended.
min_max_scaler = preprocessing.MinMaxScaler().fit(X)
X_process = min_max_scaler.transform(X)

In [None]:
### Constants

# Share of the data used for training + validation; the rest is the test set.
train_and_valid = 0.90 # CAN BE CHANGED
# Share of the training + validation part that is held out for validation.
valid_fix = 0.20 # DO NOT CHANGE !!!
train_calc = train_and_valid * (1 - valid_fix)
valid_calc = train_and_valid - train_calc
test_calc = 1 - train_calc - valid_calc

train_fraction=train_calc
validation_fraction=valid_calc
test_fraction=test_calc

hidden_layer_neurons = 8
output_layer_neurons = 2

epochs = 10
learning_rate = 0.001
alpha=0.001
goal = 1e-5
tolerance=0.0001
max_iterations=10000
k_cross_validation = 5

# Seed for the random train/test split so that re-running the notebook
# produces the same X2/T2 partitions.
random_seed = 42

# Resulting split sizes with train_and_valid = 0.90 and valid_fix = 0.20:
#
# Sequential three-way split (rows are taken in file order, no shuffling):
#   X_train / T_train           = train_fraction      = 72% of samples
#   X_validation / T_validation = validation_fraction = 18% of samples
#   X_test / T_test             = test_fraction       = 10% of samples
#
# Random two-way split (train_test_split):
#   X2_train / T2_train = train_fraction + validation_fraction = 90% of samples
#   X2_test / T2_test   = test_fraction                        = 10% of samples

split_sizes = [int(len(X_process)*train_fraction), int(len(X_process)*(train_fraction+validation_fraction))]
X_train,X_validation,X_test = np.split(X_process, split_sizes)
T_train,T_validation,T_test = np.split(T, split_sizes)
X2_train, X2_test, T2_train, T2_test = train_test_split(
    X_process, T, 
    train_size=train_fraction+validation_fraction, 
    test_size=test_fraction,
    random_state=random_seed
)

In [None]:
### Train & Validate (Neurolab feed-forward backpropagation network with custom validation implementation)

# Create a neural network (Multilayer feed forward perceptron)
net = nl.net.newff(nl.tool.minmax(X_process),[hidden_layer_neurons,output_layer_neurons])

# Change the transfer function of both layers
net.layers[0].transf = nl.trans.LogSig() # hidden layer
net.layers[1].transf = nl.trans.LogSig() # output layer

# Select resilient backpropagation as the training function.  The trainer
# itself is stored on the network; the error history comes back from
# net.train(), so net.trainf is never overwritten with a list of errors.
net.trainf = nl.train.train_rprop

# Train the network & Simulate validation
# Training proceeds in rounds of `epochs` epochs.  After every round the
# validation accuracy is measured; training stops once the accuracy has failed
# to improve by more than `tolerance` in two consecutive rounds, or after
# `max_iterations` rounds.
i = 1
i_max = max_iterations
previous_accuracy = 0
can_try_again = True
while i <= i_max:
    print("\nRepetitions count: %s" % (i,))
    error = net.train(X_train, T_train, epochs=epochs, show=10, lr=learning_rate, goal=0)
    
    plt.plot(error)
    plt.xlabel('Epoch number')
    plt.ylabel('Train error (SSE)')
    plt.grid()
    plt.show()
    
    out_validation = net.sim(X_validation)
    out_validation = np.around(out_validation)
    
    # A sample counts as correct only when both output bits match the target;
    # averaging over individual bits would give partial credit to wrong classes.
    accuracy = np.mean(np.all(out_validation == T_validation, axis=1))
    error_accuracy = 1 - accuracy
    
    print("Validation success: %s %%" % (accuracy*100,))
    print("Validation error: %s %%" % (error_accuracy*100,))
    print("Number of layers: %s" % (len(net.layers),))
    print("Learning rate: %s" % (learning_rate,))
    
    for n in range(0,len(net.layers)):
        print("Weights of %s layer: %s" % (n, net.layers[n].np['w']))
        print("Biases of %s layer: %s" % (n, net.layers[n].np['b']))
    
    if (accuracy-previous_accuracy <= tolerance) or (i >= i_max):
        if (can_try_again):
            # First round without improvement: allow exactly one more round.
            previous_accuracy = accuracy
            i+=1 #training continues
            can_try_again = False
        else:
            # Second consecutive round without improvement.
            i=i_max+1 #training stops
    else:
        previous_accuracy = accuracy
        i+=1 #training continues
        can_try_again = True


In [None]:
### Predict (Neurolab feed-forward backpropagation network with custom validation implementation)

# Simulate the trained network on the test set and round each output to 0/1.
out_test = net.sim(X_test)
out_test = np.around(out_test)
# A sample is correct only if both output bits equal the target bits.
accuracy = np.mean(np.all(out_test == T_test, axis=1))
print("Success: %s %%" % (round(accuracy * 100, 2),))

In [None]:
### Compare the results with target data (Neurolab feed-forward backpropagation network with custom validation implementation)

# Decode the two-bit targets and predictions back into class labels.
T_val_converted = convert(T_validation)
T_test_converted = convert(T_test)
out_val_converted = convert(out_validation)
out_test_converted = convert(out_test)

# One figure per entry: (class labels, marker style, figure title).
comparison_plots = [
    (T_val_converted, 'p', 'Target output of validation data'),
    (out_val_converted, 'p', 'Simulation output on validation data'),
    (T_test_converted, 'ro', 'Target output on test data'),
    (out_test_converted, 'ro', 'Simulation output on test data'),
]
for class_series, marker_style, figure_title in comparison_plots:
    plt.figure(figsize=(10,3))
    plt.plot(class_series, marker_style)
    plt.ylabel('Class')
    plt.xlabel('Sample')
    plt.title(figure_title)
    plt.xticks(np.arange(0, 130, 20))
    plt.yticks(np.arange(1, 5, 1))
    plt.grid(True)
    plt.show()

In [None]:
### Compare 10 random patterns (Neurolab feed-forward backpropagation network with custom validation implementation)

T_test_converted = convert(T_test)
out_test_converted = convert(out_test)
length = 10

# Draw `length` random test positions (repeats possible), in ascending order.
rand_numbers = np.sort(np.random.randint(len(T_test_converted), size=(length)))
chosen_out_test = [out_test_converted[position] for position in rand_numbers]
chosen_T_test = [T_test_converted[position] for position in rand_numbers]

# Targets first, then the simulated outputs for the same samples.
for chosen_series, figure_title in (
        (chosen_T_test, 'Target output on test data'),
        (chosen_out_test, 'Simulation output on test data')):
    plt.figure(figsize=(10,3))
    plt.plot(chosen_series,'ro')
    plt.ylabel('Class')
    plt.xlabel('Sample')
    plt.title(figure_title)
    plt.xticks(np.arange(0, 10, 1))
    plt.yticks(np.arange(1, 5, 1))
    plt.grid(True)
    plt.show()

In [None]:
### Train & Validate (Scikit-learn MLPClassifier)

# NOTE(review): early_stopping is False, so validation_fraction is
# presumably not used by the classifier - confirm against the scikit-learn docs.
mlp = MLPClassifier(
    activation='logistic', 
    alpha=alpha, 
    batch_size='auto', 
    early_stopping=False, 
    hidden_layer_sizes=(hidden_layer_neurons,),  # one hidden layer; the trailing comma makes it a tuple
    learning_rate='constant',
    learning_rate_init=learning_rate, 
    max_iter=max_iterations,
    shuffle=True,
    solver='adam',
    tol=tolerance,
    validation_fraction=validation_fraction,
    verbose=False
)

# learning_curve cross-validates on X2_train / T2_train.  Each fold trains
# on (k-1)/k of that data and validates on the remaining 1/k, so the
# largest usable training-set size is 80% of len(X2_train) when k = 5.
# With 649 samples overall that is 649 * 0.90 * 0.80 = 467.

train_sizes = [1,100,200,400,467] #649*0.90*0.80=467
train_sizes, train_scores, validation_scores = learning_curve(
    estimator = mlp,
    X = X2_train,
    y = T2_train, 
    train_sizes = train_sizes, 
    cv = k_cross_validation,
    scoring = 'neg_mean_squared_error'
)

print('Training scores: %s' % (train_scores,))
print('Validation scores: %s' % (validation_scores,))

# Scores are negated MSE values; flip the sign to obtain the MSE itself.
train_scores_mean = -train_scores.mean(axis = 1)
validation_scores_mean = -validation_scores.mean(axis = 1)
print('Mean training scores %s' % (pd.Series(train_scores_mean, index = train_sizes),))
print('Mean validation scores %s' % (pd.Series(validation_scores_mean, index = train_sizes),))

plt.style.use('seaborn')
plt.plot(train_sizes, train_scores_mean, label = 'Training error')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
plt.ylabel('MSE', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves for an MLP classifier', fontsize = 18, y = 1.03)
plt.legend()
plt.ylim(0,0.5)

In [None]:
### Train & Predict (Scikit-learn MLPClassifier)

mlp.fit(X_train,T_train)
out_test = mlp.predict(X_test)
scores = mlp.score(X_test, T_test)

# Evaluate on decoded class labels so that the confusion matrix and the
# classification report describe the same classes, not the two code bits.
T_test_converted = convert(T_test)
out_test_converted = convert(out_test)

print(confusion_matrix(T_test_converted, out_test_converted))
print(classification_report(T_test_converted, out_test_converted))
print("Success: %s %%" % (round(scores * 100, 2),))

In [None]:
### Compare the results with target data (Scikit-learn MLPClassifier)

# Decode the two-bit targets and predictions back into class labels.
T_test_converted = convert(T_test)
out_test_converted = convert(out_test)

# Figure titles paired with the label sequences they show.
test_plots = (
    ('Target output on test data', T_test_converted),
    ('Simulation output on test data', out_test_converted),
)
for figure_title, class_series in test_plots:
    plt.figure(figsize=(10,3))
    plt.plot(class_series,'ro')
    plt.ylabel('Class')
    plt.xlabel('Sample')
    plt.title(figure_title)
    plt.xticks(np.arange(0, 130, 20))
    plt.yticks(np.arange(1, 5, 1))
    plt.grid(True)
    plt.show()

In [None]:
### Compare 10 random patterns (Scikit-learn MLPClassifier)

T_test_converted = convert(T_test)
out_test_converted = convert(out_test)
length = 10

# Draw `length` random test positions (repeats possible), in ascending order.
rand_numbers = np.sort(np.random.randint(len(T_test_converted), size=(length)))
chosen_out_test = [out_test_converted[position] for position in rand_numbers]
chosen_T_test = [T_test_converted[position] for position in rand_numbers]

# Targets first, then the simulated outputs for the same samples.
for chosen_series, figure_title in (
        (chosen_T_test, 'Target output on test data'),
        (chosen_out_test, 'Simulation output on test data')):
    plt.figure(figsize=(10,3))
    plt.plot(chosen_series,'ro')
    plt.ylabel('Class')
    plt.xlabel('Sample')
    plt.title(figure_title)
    plt.xticks(np.arange(0, 10, 1))
    plt.yticks(np.arange(1, 5, 1))
    plt.grid(True)
    plt.show()

In [None]:
### Train & Validate (Scikit-learn LinearRegression)

# The variable name `mlp` is kept from the MLPClassifier cells; from here on
# it refers to a LinearRegression model.
mlp = LinearRegression()

# learning_curve cross-validates on X2_train / T2_train.  Each fold trains
# on (k-1)/k of that data and validates on the remaining 1/k, so the
# largest usable training-set size is 80% of len(X2_train) when k = 5.
# With 649 samples overall that is 649 * 0.90 * 0.80 = 467.

train_sizes = [1,100,200,400,467] #649*0.90*0.80=467
train_sizes, train_scores, validation_scores = learning_curve(
    estimator = mlp,
    X = X2_train,
    y = T2_train, 
    train_sizes = train_sizes, 
    cv = k_cross_validation,
    scoring = 'neg_mean_squared_error'
)

print('Training scores: %s' % (train_scores,))
print('Validation scores: %s' % (validation_scores,))

# Scores are negated MSE values; flip the sign to obtain the MSE itself.
train_scores_mean = -train_scores.mean(axis = 1)
validation_scores_mean = -validation_scores.mean(axis = 1)
print('Mean training scores %s' % (pd.Series(train_scores_mean, index = train_sizes),))
print('Mean validation scores %s' % (pd.Series(validation_scores_mean, index = train_sizes),))

plt.style.use('seaborn')
plt.plot(train_sizes, train_scores_mean, label = 'Training error')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
plt.ylabel('MSE', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves for a linear regression model', fontsize = 18, y = 1.03)
plt.legend()
plt.ylim(0,0.5)

In [None]:
### Train & Predict (Scikit-learn LinearRegression)

mlp.fit(X_train,T_train)
out_test = mlp.predict(X_test)
scores = mlp.score(X_test, T_test)

# Round the continuous regression outputs to the nearest integer so they can
# be compared with the two-bit targets.
out_test = np.around(out_test)
# A sample is correct only if both rounded outputs equal the target bits.
accuracy = np.mean(np.all(out_test == T_test, axis=1))

# Rounded regression outputs are not guaranteed to be 0 or 1, so the metrics
# are computed on decoded class labels; convert() maps any code outside the
# four valid ones to class 5.
T_test_converted = convert(T_test)
out_test_converted = convert(out_test)

print(confusion_matrix(T_test_converted, out_test_converted))
print(classification_report(T_test_converted, out_test_converted))

In [None]:
### Compare the results with target data (Scikit-learn LinearRegression)

# Decode the two-bit targets and predictions back into class labels.
T_test_converted = convert(T_test)
out_test_converted = convert(out_test)

# Figure titles paired with the label sequences they show.
test_plots = (
    ('Target output on test data', T_test_converted),
    ('Simulation output on test data', out_test_converted),
)
for figure_title, class_series in test_plots:
    plt.figure(figsize=(10,3))
    plt.plot(class_series,'ro')
    plt.ylabel('Class')
    plt.xlabel('Sample')
    plt.title(figure_title)
    plt.xticks(np.arange(0, 130, 20))
    plt.yticks(np.arange(1, 5, 1))
    plt.grid(True)
    plt.show()

In [None]:
### Compare 10 random patterns (Scikit-learn LinearRegression)

T_test_converted = convert(T_test)
out_test_converted = convert(out_test)
length = 10

# Draw `length` random test positions (repeats possible), in ascending order.
rand_numbers = np.sort(np.random.randint(len(T_test_converted), size=(length)))
chosen_out_test = [out_test_converted[position] for position in rand_numbers]
chosen_T_test = [T_test_converted[position] for position in rand_numbers]

# Targets first, then the simulated outputs for the same samples.
for chosen_series, figure_title in (
        (chosen_T_test, 'Target output on test data'),
        (chosen_out_test, 'Simulation output on test data')):
    plt.figure(figsize=(10,3))
    plt.plot(chosen_series,'ro')
    plt.ylabel('Class')
    plt.xlabel('Sample')
    plt.title(figure_title)
    plt.xticks(np.arange(0, 10, 1))
    plt.yticks(np.arange(1, 5, 1))
    plt.grid(True)
    plt.show()

In [None]:
### Error analysis - decision on which attributes to remove depending on mode of features
# if mode > 70% remove from dataset, otherwise it can remain

# Positions of validation samples whose predicted code differs from the target.
indices = [
    index for index in range(len(T_validation))
    if not np.array_equal(T_validation[index], out_validation[index])
]

# Scaled feature rows of the misclassified samples.
notEqualdata = np.asarray(X_validation)[np.asarray(indices, dtype=int)]
# Number of misclassified samples (variable name kept for compatibility).
equaldataLength = len(notEqualdata)

table_columns = ['mode', 'count', 'percentage', 'remove']
if equaldataLength == 0:
    # Nothing was misclassified: there is nothing to analyse, and the
    # inverse transform / division below would fail on empty input.
    modeTable = pd.DataFrame(columns=table_columns)
else:
    # Undo the min-max scaling to get the original feature values back.
    invertedTable = min_max_scaler.inverse_transform(notEqualdata)

    mode = []
    modeCount = []
    for i in range(invertedTable.shape[1]):
        column_mode = stats.mode(invertedTable[:, i])
        # np.ravel(...)[0] works whether SciPy returns scalars or
        # length-1 arrays in the mode result.
        mode.append(float(np.ravel(column_mode.mode)[0]))
        modeCount.append(float(np.ravel(column_mode.count)[0]))

    # Share of misclassified samples that carry the most frequent value.
    relativeCount = [x / equaldataLength for x in modeCount]
    # Flag features whose most frequent value dominates (> 70 %).
    removebooleans = [x > 0.70 for x in relativeCount]

    modeTable = pd.DataFrame(
        {
            'mode': mode,
            'count': modeCount,
            'percentage': relativeCount,
            'remove': removebooleans,
        },
        columns=table_columns
    )
print(modeTable)

In [None]:
### Error analysis - decision on which attributes to remove depending on mode of features
# if mode > 70% remove from dataset, otherwise it can remain

# NOTE(review): `out_test` holds the predictions of whichever model cell was
# run last before this one - confirm which model is meant to be analysed.

# Positions of test samples whose predicted code differs from the target.
indices = [
    index for index in range(len(T_test))
    if not np.array_equal(T_test[index], out_test[index])
]

# Scaled feature rows of the misclassified samples.
notEqualdata = np.asarray(X_test)[np.asarray(indices, dtype=int)]
# Number of misclassified samples (variable name kept for compatibility).
equaldataLength = len(notEqualdata)

table_columns = ['mode', 'count', 'percentage', 'remove']
if equaldataLength == 0:
    # Nothing was misclassified: there is nothing to analyse, and the
    # inverse transform / division below would fail on empty input.
    modeTable = pd.DataFrame(columns=table_columns)
else:
    # Undo the min-max scaling to get the original feature values back.
    invertedTable = min_max_scaler.inverse_transform(notEqualdata)

    mode = []
    modeCount = []
    for i in range(invertedTable.shape[1]):
        column_mode = stats.mode(invertedTable[:, i])
        # np.ravel(...)[0] works whether SciPy returns scalars or
        # length-1 arrays in the mode result.
        mode.append(float(np.ravel(column_mode.mode)[0]))
        modeCount.append(float(np.ravel(column_mode.count)[0]))

    # Share of misclassified samples that carry the most frequent value.
    relativeCount = [x / equaldataLength for x in modeCount]
    # Flag features whose most frequent value dominates (> 70 %).
    removebooleans = [x > 0.70 for x in relativeCount]

    modeTable = pd.DataFrame(
        {
            'mode': mode,
            'count': modeCount,
            'percentage': relativeCount,
            'remove': removebooleans,
        },
        columns=table_columns
    )
print(modeTable)