# Import libraries

In [None]:
# Number of label classes: used below as the one-hot width, the softmax size,
# and as the id of the catch-all "other" bucket in the multi-step model.
total = 217
import time
import numpy as np
from numpy.random import seed
# Single seed reused for NumPy, train_test_split and the scikit-learn models.
sd = 30
seed(sd)

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU
from keras.layers import LeakyReLU, Input, Dropout
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Load Data

In [None]:
# Mount Google Drive inside the Colab runtime and change into the
# `partilhado` folder (path is relative to the current working directory).
from google.colab import drive
drive.mount('/content/drive')
%cd drive/My Drive/partilhado/

# Read the comma-separated dataset from the current directory.
dataset = pd.read_csv('dataset-final.csv', sep=',')

# Split Data

## Remove noise

In [None]:
# Number of rows per value of the LABEL column.
ycounts = pd.Series(dataset['LABEL']).value_counts()
# Labels that occur fewer than two times are treated as noise.
noise = ycounts[ycounts < 2].index
# Keep only the rows whose label is not in the noise set.
dataset = dataset[~dataset['LABEL'].isin(noise)]

In [None]:
# Convert the frame to a NumPy array so it can be sliced by position.
# NOTE(review): this assumes the label is the first column — confirm against the CSV.
df = dataset.to_numpy()
x = df[:, 1:] #all columns except the first one (features)
y = df[:, 0] #only the first column (labels)

# Hold out 20% of the rows for testing; random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = sd)

# Model

In [None]:
# Fit the baseline classifier and print how long the fit took (seconds).
# The commented-out lines are alternative models kept for quick switching.
st=time.time()
#model = LogisticRegression(multi_class = 'multinomial', n_jobs = -1, random_state = sd).fit(x_train,y_train)
#model = RandomForestClassifier(n_estimators = 75, min_samples_split = 8, n_jobs = -1, random_state = sd).fit(x_train,y_train)
# NOTE(review): `(75)` is the integer 75, not a one-element tuple, and
# `10e-10` equals 1e-9 — confirm both are what was intended.
model = MLPClassifier(hidden_layer_sizes = (75), alpha = 10e-10, early_stopping = True, random_state = sd).fit(x_train,y_train)
print(time.time()-st)

## Predict

In [None]:
def predict(alg, sample, return_prob=False, top_k=3):
  """Return the ``top_k`` most probable labels for one sample.

  Args:
    alg: Fitted classifier exposing ``predict_proba`` and ``classes_``.
    sample: 1-D feature vector of a single observation.
    return_prob: When true, also return the probability of each label.
    top_k: How many labels to return. Defaults to 3, the value that was
      previously hard-coded.

  Returns:
    An array of labels ordered by decreasing probability, or, when
    ``return_prob`` is true, ``[labels, probabilities]`` as two plain lists.
  """
  # predict_proba works on batches, so present the sample as a 1-row matrix.
  row = np.asarray(sample).reshape(1, -1)
  probs = alg.predict_proba(row)[0]

  # Negate so that argsort's ascending order yields the largest values first.
  idx = np.argsort(-probs)[:top_k]
  labels = np.asarray(alg.classes_)[idx]

  if return_prob:
    return [labels.tolist(), probs[idx].tolist()]

  return labels

In [None]:
# Spot-check the baseline model on the first held-out sample:
# print its top labels with probabilities, then the true label.
xt = x_test[0]
yt = y_test[0]

print(predict(model, xt, return_prob=True))
print(yt)

## Evaluate

In [None]:
def evaluate(alg, xset, yset, top_k=3):
  """Return the fraction of samples whose true label is among the top ``top_k``.

  All rows are scored with a single ``predict_proba`` call instead of one
  call per row, which gives the same ranking at a fraction of the cost.

  Args:
    alg: Fitted classifier exposing ``predict_proba`` and ``classes_``.
    xset: 2-D feature matrix, one row per sample.
    yset: True labels, aligned with the rows of ``xset``.
    top_k: Number of highest-probability labels counted as a hit
      (defaults to 3, the value that was previously hard-coded).

  Returns:
    Hit rate as a float in [0, 1]; 0.0 when ``xset`` has no rows.
  """
  nrow = xset.shape[0]
  if nrow == 0:
    # Nothing to score; avoid dividing by zero.
    return 0.0

  probs = np.asarray(alg.predict_proba(xset))
  # Column indices of the top_k probabilities in each row, best first.
  top_idx = np.argsort(-probs, axis=1)[:, :top_k]
  top_labels = np.asarray(alg.classes_)[top_idx]

  # Compare each row's candidate labels against its true label.
  truth = np.asarray(yset)[:nrow].reshape(-1, 1)
  hits = (top_labels == truth).any(axis=1)
  return int(hits.sum()) / nrow

In [None]:
# Top-3 hit rate of the baseline model on the held-out test split.
evaluate(model, x_test, y_test)

# Model NN

In [None]:
from keras import backend as K

def f1_weighted(true, pred):
    """Support-weighted F1 score, written with Keras backend ops.

    ``true`` is the target batch (one-hot encoded with ``to_categorical`` in
    this notebook) and ``pred`` holds the predicted class scores. Each
    prediction is hardened to its arg-max class, per-class F1 is computed
    over the batch, and the classes are averaged with weights proportional
    to how often they occur in ``true``.
    """
    # Replace the scores by a one-hot vector of the winning class.
    winners = K.argmax(pred, axis=-1)
    hard_pred = K.one_hot(winners, total)

    # Per-class counts, summed over the batch axis.
    support = K.sum(true, axis=0)               # TP + FN
    predicted = K.sum(hard_pred, axis=0)        # TP + FP
    hits = K.sum(true * hard_pred, axis=0)      # TP

    # Epsilon keeps the ratios finite; a ratio evaluates to 1 when its
    # denominator count is zero (the numerator is then zero as well).
    eps = K.epsilon()
    precision = (hits + eps) / (predicted + eps)
    recall = (hits + eps) / (support + eps)

    per_class_f1 = 2 * (precision * recall) / (precision + recall)

    # Weight every class by its share of the true labels and add up.
    return K.sum(per_class_f1 * support / K.sum(support))

In [None]:
# One-hot encode the training labels into `total` columns for the softmax output.
ytrain = to_categorical(y_train, num_classes = total)
#es = EarlyStopping(monitor='val_loss', mode='min', verbose=False)

# Build, compile and fit the network, printing the elapsed seconds at the end.
st=time.time()
model_NN = Sequential()
# Hidden layer; `input_dim=565` fixes the expected number of feature columns.
model_NN.add(Dense(300, input_dim=565, activation='relu'))
model_NN.add(Dropout(0.2))
# NOTE(review): this LeakyReLU receives the output of a relu-activated Dense
# (after dropout), which is already non-negative — confirm it is intended.
model_NN.add(LeakyReLU())
model_NN.add(Dropout(0.3))
# Output layer: one softmax unit per class id in [0, total).
model_NN.add(Dense(total, activation='softmax'))

# Track the custom weighted F1 metric during training.
model_NN.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = [f1_weighted])
model_NN.fit(x_train, ytrain, batch_size = 1000, epochs = 150, verbose = False)
print(time.time()-st)

## Predict

In [None]:
def predict_NN(alg, sample, return_prob=False, top_k=3):
  """Return the ``top_k`` most probable class ids for one sample.

  Args:
    alg: Model whose ``predict`` returns one row of class probabilities
      per input row; the column index is used as the class id.
    sample: 1-D feature vector of a single observation.
    return_prob: When true, also return the probability of each class.
    top_k: How many class ids to return. Defaults to 3, the value that
      was previously hard-coded.

  Returns:
    An array of class ids ordered by decreasing probability, or, when
    ``return_prob`` is true, ``[class_ids, probabilities]`` as plain lists.
  """
  # predict works on batches, so present the sample as a 1-row matrix.
  row = np.asarray(sample).reshape(1, -1)
  probs = np.asarray(alg.predict(row))[0]

  # Negate so that argsort's ascending order yields the largest values first.
  labels = np.argsort(-probs)[:top_k]

  if return_prob:
    return [labels.tolist(), probs[labels].tolist()]

  return labels

In [None]:
# Spot-check the neural network on the first held-out sample:
# print its top class ids with probabilities, then the true label.
xt = x_test[0]
yt = y_test[0]

print(predict_NN(model_NN, xt, return_prob=True))
print(yt)

## Evaluate

In [None]:
def evaluate_NN(alg, xset, yset, top_k=3):
  """Return the fraction of samples whose true class is among the top ``top_k``.

  The whole set is scored with one ``predict`` call rather than one call per
  row; the per-row ranking is unchanged but the per-call overhead is paid
  only once.

  Args:
    alg: Model whose ``predict`` returns one row of class probabilities
      per input row; the column index is used as the class id.
    xset: 2-D feature matrix, one row per sample.
    yset: True class ids, aligned with the rows of ``xset``.
    top_k: Number of highest-probability classes counted as a hit
      (defaults to 3, the value that was previously hard-coded).

  Returns:
    Hit rate as a float in [0, 1]; 0.0 when ``xset`` has no rows.
  """
  nrow = xset.shape[0]
  if nrow == 0:
    # Nothing to score; avoid dividing by zero.
    return 0.0

  probs = np.asarray(alg.predict(xset))
  # Class ids of the top_k probabilities in each row, best first.
  top_idx = np.argsort(-probs, axis=1)[:, :top_k]

  # Compare each row's candidate classes against its true class.
  truth = np.asarray(yset)[:nrow].reshape(-1, 1)
  hits = (top_idx == truth).any(axis=1)
  return int(hits.sum()) / nrow

In [None]:
# Top-3 hit rate of the neural network on the held-out test split.
evaluate_NN(model_NN, x_test, y_test)

# Multi-step Model

In [None]:
from keras import backend as K

def f1_weighted(true, pred):
    """Support-weighted F1 score, written with Keras backend ops.

    NOTE: this redefines the metric of the same name declared in the
    "Model NN" section of this notebook.

    ``true`` is the target batch (one-hot encoded with ``to_categorical`` in
    this notebook) and ``pred`` holds the predicted class scores. Each
    prediction is hardened to its arg-max class, per-class F1 is computed
    over the batch, and the classes are averaged with weights proportional
    to how often they occur in ``true``.
    """
    # Replace the scores by a one-hot vector of the winning class.
    winners = K.argmax(pred, axis=-1)
    hard_pred = K.one_hot(winners, total)

    # Per-class counts, summed over the batch axis.
    support = K.sum(true, axis=0)               # TP + FN
    predicted = K.sum(hard_pred, axis=0)        # TP + FP
    hits = K.sum(true * hard_pred, axis=0)      # TP

    # Epsilon keeps the ratios finite; a ratio evaluates to 1 when its
    # denominator count is zero (the numerator is then zero as well).
    eps = K.epsilon()
    precision = (hits + eps) / (predicted + eps)
    recall = (hits + eps) / (support + eps)

    per_class_f1 = 2 * (precision * recall) / (precision + recall)

    # Weight every class by its share of the true labels and add up.
    return K.sum(per_class_f1 * support / K.sum(support))

In [None]:
def _tail_labels(labels, threshold):
  """Return the labels whose cumulative sample share exceeds ``threshold``.

  Labels are ranked by decreasing frequency; a label is returned when the
  fraction of samples covered by it and all more frequent labels is strictly
  greater than ``threshold``.
  """
  keys, counts = np.unique(labels, return_counts=True)
  order = np.argsort(-counts)
  share = counts[order].cumsum() / counts.sum()
  return keys[order][share > threshold]


def models(x,y):
  """Train the four classifiers of the multi-step (cascade) model.

  Each step keeps a subset of labels as real classes and collapses the rest
  into a catch-all bucket whose id is ``total``; the next step is trained
  only on the samples that fell into that bucket.

  Args:
    x: 2-D NumPy feature matrix, one row per sample.
    y: 1-D NumPy array of integer labels aligned with ``x``.

  Returns:
    ``[clf1, clf2, clf3, clf4]`` in cascade order. The first three are
    scikit-learn estimators; the last is a Keras ``Sequential`` model with
    ``total`` softmax outputs indexed by label id.
  """
  model = []

  # STEP 1: label 153 against everything else (collapsed into `total`).
  y1 = np.where(y != 153, total, y)
  clf1 = MLPClassifier(hidden_layer_sizes = (50,), early_stopping = True, random_state = sd).fit(x,y1)
  model.append(clf1)

  # STEP 2: among the non-153 samples, labels past the first 50% of the
  # cumulative (frequency-ranked) mass are merged into the bucket.
  rows2 = y != 153
  x2 = x[rows2]
  y2 = y[rows2]
  other = _tail_labels(y2, 0.5)
  y2 = np.where(np.isin(y2, other), total, y2)
  clf2 = MLPClassifier(hidden_layer_sizes = (75,), alpha = 10e-10, early_stopping = True, random_state = sd).fit(x2,y2)
  model.append(clf2)

  # STEP 3: repeat on the samples bucketed in step 2, with a 70% cut-off.
  rows3 = np.isin(y, other)
  x3 = x[rows3]
  y3 = y[rows3]
  other = _tail_labels(y3, 0.7)
  y3 = np.where(np.isin(y3, other), total, y3)
  clf3 = LogisticRegression(multi_class = 'multinomial', n_jobs = -1, random_state = sd).fit(x3,y3)
  model.append(clf3)

  # STEP 4: a neural network over the samples bucketed in step 3. Targets
  # are one-hot over all `total` ids, so output unit j stands for label j.
  rows4 = np.isin(y, other)
  x4 = x[rows4]
  y4 = to_categorical(y[rows4], num_classes = total)

  clf4 = Sequential()
  # The input width follows the data instead of a hard-coded 565.
  clf4.add(Dense(300, input_dim = x.shape[1], activation = 'relu'))
  clf4.add(LeakyReLU())
  clf4.add(Dropout(0.5))
  clf4.add(Dense(total, activation = 'softmax'))

  clf4.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = [f1_weighted])
  clf4.fit(x4, y4, batch_size = 400, epochs = 100, verbose = False)
  model.append(clf4)

  return model

# Train the four-step cascade on the training split.
m_steps = models(x_train,y_train)

## Predict

In [None]:
def predict_steps(algs, sample, return_prob=False, top_k=3):
  """Return the ``top_k`` most probable labels according to the cascade.

  The probability of a label is the probability its own step assigns to it,
  multiplied by the "other" probabilities of all earlier steps.

  Args:
    algs: The four models in cascade order. The first three expose
      ``predict_proba`` and ``classes_`` with the catch-all bucket as the
      last class; the fourth exposes ``predict`` and returns one
      probability per label id.
    sample: 1-D feature vector of a single observation.
    return_prob: When true, also return the probability of each label.
    top_k: How many labels to return. Defaults to 3, the value that was
      previously hard-coded.

  Returns:
    A list of integer labels ordered by decreasing probability, or, when
    ``return_prob`` is true, ``[labels, probabilities]``.
  """
  m1, m2, m3, m4 = algs
  row = np.asarray(sample).reshape(1, -1)
  scores = {}

  # Step 1: label 153 versus the bucket holding everything else.
  p1 = m1.predict_proba(row)[0]
  scores[153] = p1[0]
  remaining = p1[1]  # probability mass passed on to the next step

  # Steps 2 and 3: every class except the last is a real label; the last
  # class is the bucket whose mass is handed to the following step.
  assigned = [np.array([153])]
  for clf in (m2, m3):
    step_probs = clf.predict_proba(row)[0]
    step_labels = clf.classes_[:-1]
    for label, prob in zip(step_labels, step_probs):
      scores[int(label)] = prob * remaining
    assigned.append(step_labels)
    remaining = step_probs[-1] * remaining

  # Step 4: the network output is indexed by label id, so the probability of
  # a leftover label is read at that id. The previous code read it at the
  # label's position in the leftover list, pairing labels with the wrong
  # outputs.
  p4 = m4.predict(row)[0]
  taken = np.concatenate(assigned, axis=None)
  leftover = np.setdiff1d(np.arange(len(p4)), taken)
  for label in leftover:
    scores[int(label)] = p4[label] * remaining

  top = sorted(scores, key=scores.get, reverse=True)[:top_k]

  if return_prob:
    return [top, [scores[label] for label in top]]

  return top

In [None]:
# Spot-check the cascade on the held-out sample at index 15:
# print its top labels with probabilities, then the true label.
xt = x_test[15]
yt = y_test[15]

print(predict_steps(m_steps, xt, return_prob=True))
print(yt)

## Evaluate

In [None]:
def evaluate_steps(algs, xset, yset):
  """Share of samples whose true label is among the cascade's returned labels.

  ``algs`` is the model list accepted by ``predict_steps``; ``xset`` holds
  one sample per row and ``yset`` the matching true labels.
  """
  n_samples = xset.shape[0]
  # Count one hit for every row whose label appears in its prediction list.
  n_hits = sum(
    1
    for row in range(n_samples)
    if yset[row] in predict_steps(algs, xset[row])
  )
  return n_hits / n_samples

In [None]:
# Hit rate of the cascade's returned label list on the held-out test split.
evaluate_steps(m_steps, x_test, y_test)

In [None]:
def acc(algs, xset, yset):
  """Accuracy of the cascade: share of samples whose first returned label is correct.

  ``algs`` is the model list accepted by ``predict_steps``; ``xset`` holds
  one sample per row and ``yset`` the matching true labels.
  """
  n_samples = xset.shape[0]
  # Only the best-ranked label (position 0) counts as the prediction.
  n_correct = sum(
    1
    for row in range(n_samples)
    if yset[row] == predict_steps(algs, xset[row])[0]
  )
  return n_correct / n_samples

# Accuracy (first returned label only) of the cascade on the held-out test split.
acc(m_steps, x_test, y_test)

In [None]:
def f1_score(algs, xset, yset, epsilon=10e-5):
  """Support-weighted F1 of the cascade's first returned label.

  NOTE: this name shadows ``sklearn.metrics.f1_score`` imported at the top
  of the notebook.

  ``algs`` is the model list accepted by ``predict_steps``; ``xset`` holds
  one sample per row and ``yset`` the matching integer labels, which are
  used directly as indices into the per-class counters. ``epsilon`` is added
  to numerator and denominator of precision and recall so that classes with
  a zero denominator do not divide by zero.
  """
  n_samples = xset.shape[0]

  # Per-class tallies, indexed by label id.
  hits = np.zeros(total)           # true positives
  misses = np.zeros(total)         # false negatives
  false_alarms = np.zeros(total)   # false positives

  for row in range(n_samples):
    actual = yset[row]
    guess = predict_steps(algs, xset[row])[0]

    if actual != guess:
      misses[actual] += 1
      false_alarms[guess] += 1
      continue
    hits[actual] += 1

  support = hits + misses            # samples that truly belong to each class
  predicted = hits + false_alarms    # samples assigned to each class
  precision = (hits + epsilon) / (predicted + epsilon)
  recall = (hits + epsilon) / (support + epsilon)

  per_class_f1 = 2 * (precision * recall) / (precision + recall)

  # Weight each class by its share of the true labels and add up.
  class_weight = support / np.sum(support)
  return np.sum(per_class_f1 * class_weight)

# Support-weighted F1 of the cascade on the held-out test split.
f1_score(m_steps, x_test, y_test)