# Import libraries

In [None]:
# Number of label ids. Used below as the exclusive upper bound of the label
# range, as the id given to the aggregated "other" class, and as the width of
# the one-hot targets / softmax output layer.
total = 217

import time
import numpy as np
from numpy.random import seed
# Fixed seed: applied to NumPy's global RNG here and passed as `random_state`
# to the scikit-learn splitters and estimators further down.
sd = 30
seed(sd)

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU
from keras.layers import LeakyReLU, Input, Dropout
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Load Data

In [None]:
# Mount Google Drive inside the Colab runtime and change into the Drive
# folder from which the CSV is read by relative path.
from google.colab import drive
drive.mount('/content/drive')
%cd drive/My Drive/partilhado/

# The cells below read the class id from the 'LABEL' column and, after
# `to_numpy()`, treat column 0 as the label and the remaining columns as
# features -- assumes 'LABEL' is the first column of the CSV (TODO confirm).
dataset = pd.read_csv('dataset-final.csv', sep=',')

# Remove noise

In [None]:
# Discard every class represented by fewer than two samples.
# NOTE(review): presumably required so the stratified splits below have at
# least two members per class -- confirm.
ycounts = dataset['LABEL'].value_counts()
noise = ycounts.index[ycounts < 2]
dataset = dataset.loc[~dataset['LABEL'].isin(noise)]

# Class Distribution

In [None]:
# Relative frequency of each label, most frequent first.
ycounts = dataset['LABEL'].value_counts()
ybar = ycounts / ycounts.sum()
ybar.plot.bar(figsize=(25, 10))

In [None]:
# Cumulative relative frequency of the labels, accumulated from the most
# frequent label to the least frequent one.
ycounts = dataset['LABEL'].value_counts()
ybar = ycounts.cumsum() / ycounts.sum()
ybar.plot.bar(figsize=(25, 10))

# Classify label 153 vs. the rest

In [None]:
# Binary task: label 153 becomes class 1, every other label id in
# [0, total) becomes class 0.
data = dataset.copy()
other_labels = [label for label in range(total) if label != 153]
# Assign the result back instead of calling `replace(..., inplace=True)` on
# the selected column: the chained in-place form operates on a temporary and
# leaves `data` untouched when pandas Copy-on-Write is active.
data['LABEL'] = data['LABEL'].replace(other_labels, 0).replace(153, 1)

ycounts = data['LABEL'].value_counts()
ybar = ycounts / ycounts.sum()
ax = ybar.plot(kind = 'bar', figsize = (15,10))
# `value_counts` orders the bars by frequency, so name each bar from the
# class it actually represents rather than assuming 'other' comes first.
class_names = {0: 'other', 1: 153}
ax.set_xticklabels([class_names.get(k, k) for k in ybar.index], rotation = 0, fontsize = 15)
ax.set_ylabel('Relative Frequency', fontsize = 15)
ax.set_xlabel('Labels', fontsize = 15)

# Column 0 holds the (now binary) label, the remaining columns the features.
df = data.to_numpy()
x = df[:, 1:]
y = df[:, 0]

# Stratified 80/20 hold-out split so both parts keep the class ratio.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = sd)

In [None]:
# Time a 3-fold stratified cross-validation of the selected classifier on
# the training split and report the weighted F1 (mean and std over folds).
st = time.time()
#model = LogisticRegression(n_jobs = -1, random_state = sd)
#model = SGDClassifier(n_jobs = -1, random_state = sd)
#model = RandomForestClassifier(n_estimators = 75, min_samples_split = 8, n_jobs = -1, random_state = sd)
# NOTE(review): `(30)` is the plain int 30, not a one-element tuple.
model = MLPClassifier(hidden_layer_sizes=(30), early_stopping=True, random_state=sd)

kfold = StratifiedKFold(n_splits=3)
results = cross_val_score(model, x_train, y_train, scoring='f1_weighted', cv=kfold)
elapsed = time.time() - st
print(elapsed)

mean_pct = results.mean() * 100
std_pct = results.std() * 100
print('F1-score: {:4.2f}% (+-{:3.2f}%)'.format(mean_pct, std_pct))

In [None]:
# Refit the chosen classifier on the whole training split, then report
# accuracy / weighted F1 on the training data and on the held-out test data.
model = MLPClassifier(hidden_layer_sizes=(30), early_stopping=True, random_state=sd)
model.fit(x_train, y_train)

# Training-set scores (optimistic; shown for comparison with the test scores).
yt = model.predict(x_train)
train_accuracy = accuracy_score(y_train, yt)
train_f1 = f1_score(y_train, yt, average='weighted')
print("Accuracy:", train_accuracy)
print("F1-score:", train_f1)

# Held-out test scores plus the per-class report.
yv = model.predict(x_test)
test_accuracy = accuracy_score(y_test, yv)
test_f1 = f1_score(y_test, yv, average='weighted')
print("Accuracy:", test_accuracy)
print("F1-score:", test_f1)
print(classification_report(y_test, yv, digits=3))

## Confusion Matrix

In [None]:
# Row-normalised confusion matrix (each true-label row sums to 1) of the
# test predictions, drawn as a heatmap with the predicted axis on top.
m = confusion_matrix(y_test, yv, normalize='true')
# Classes are encoded 0 -> 'other', 1 -> 153, which is also the matrix order.
tick = ['other', 153]
fig, ax = plt.subplots(figsize=(30, 15))
conf = sns.heatmap(m, cmap=plt.cm.Blues, ax=ax)
conf.set_ylabel('True Label', fontsize=25)
conf.set_xlabel('Predicted Label', fontsize=25)
conf.set_xticklabels(tick, fontsize=25)
conf.set_yticklabels(tick, fontsize=25)
conf.xaxis.tick_top()
conf.xaxis.set_label_position('top')

# Group the labels beyond the 50% cumulative-frequency mark into 'other' (label 153 excluded)

In [None]:
# Second stage: label 153 is dropped. The most frequent remaining labels
# (those covering the first 50% of the samples) stay as individual classes;
# every label past that cumulative mark is merged into one 'other' class
# whose id is `total`.
data = dataset.copy()
data = data[data['LABEL'] != 153]
ycounts = data['LABEL'].value_counts()
ybar = ycounts.cumsum() / ycounts.sum()

# Column 0 holds the label, the remaining columns the features.
df = data.to_numpy()
x = df[:, 1:]
y = df[:, 0]

ykeys = ycounts.keys()

# Labels whose cumulative relative frequency exceeds 50%.
yother = [key for key in ykeys if ybar.get(key) > 0.5]

# Use a set for the membership test: one hash lookup per sample instead of a
# scan over the whole `yother` list.
merged = set(yother)
ylabels = [total if label in merged else label for label in y]

ycounts = pd.Series(ylabels).value_counts()
ybar = ycounts / ycounts.sum()
ax = ybar.plot(kind = 'bar', figsize = (15,10))
# Name the bars from the plotted index so the labels always match the data
# (count and order) instead of relying on a hand-copied list.
ax.set_xticklabels(['other' if k == total else int(k) for k in ybar.index], rotation = 0, fontsize = 15)
ax.set_ylabel('Relative Frequency', fontsize = 15)
ax.set_xlabel('Labels', fontsize = 15)

# Stratified 80/20 hold-out split on the regrouped labels.
x_train, x_test, y_train, y_test = train_test_split(x, ylabels, test_size = 0.2, stratify = ylabels, random_state = sd)

# Individually kept labels of this stage, sorted, without the 'other' id;
# reused later for the confusion-matrix ticks and to exclude these labels
# from the following stages.
lst1 = sorted(set(y_test) - {total})

In [None]:
# Time a 3-fold stratified cross-validation of the selected classifier on
# the training split and report the weighted F1 (mean and std over folds).
st = time.time()
#model = LogisticRegression(multi_class = 'multinomial', n_jobs = -1, random_state = sd)
#model = SGDClassifier(loss = 'log', penalty = 'l1', alpha = 0.0001, n_jobs = -1, random_state = sd)
#model = RandomForestClassifier(n_estimators = 75, min_samples_split = 8, n_jobs = -1, random_state = sd)
# NOTE(review): `(50)` is the plain int 50, not a tuple; `10e-10` equals 1e-9.
model = MLPClassifier(hidden_layer_sizes=(50), alpha=10e-10, early_stopping=True, random_state=sd)

kfold = StratifiedKFold(n_splits=3)
results = cross_val_score(model, x_train, y_train, scoring='f1_weighted', cv=kfold)
elapsed = time.time() - st
print(elapsed)

mean_pct = results.mean() * 100
std_pct = results.std() * 100
print('F1-score: {:4.2f}% (+-{:3.2f}%)'.format(mean_pct, std_pct))

In [None]:
# Refit the chosen classifier on the whole training split, then report
# accuracy / weighted F1 on the training data and on the held-out test data.
model = MLPClassifier(hidden_layer_sizes=(50), alpha=10e-10, early_stopping=True, random_state=sd)
model.fit(x_train, y_train)

# Training-set scores.
yt = model.predict(x_train)
train_accuracy = accuracy_score(y_train, yt)
train_f1 = f1_score(y_train, yt, average='weighted')
print("Accuracy:", train_accuracy)
print("F1-score:", train_f1)

# Held-out test scores plus the per-class report.
yv = model.predict(x_test)
test_accuracy = accuracy_score(y_test, yv)
test_f1 = f1_score(y_test, yv, average='weighted')
print("Accuracy:", test_accuracy)
print("F1-score:", test_f1)
print(classification_report(y_test, yv, digits=3))

## Confusion Matrix

In [None]:
# Row-normalised confusion matrix of the test predictions as a heatmap,
# with the predicted axis on top.
m = confusion_matrix(y_test, yv, normalize='true')
# 'other' carries the id `total`; it is placed last here, after the sorted
# individual labels.
tick = lst1 + ['other']
fig, ax = plt.subplots(figsize=(30, 15))
conf = sns.heatmap(m, cmap=plt.cm.Blues, ax=ax)
conf.set_ylabel('True Label', fontsize=25)
conf.set_xlabel('Predicted Label', fontsize=25)
conf.set_xticklabels(tick, fontsize=25)
conf.set_yticklabels(tick, fontsize=25)
conf.xaxis.tick_top()
conf.xaxis.set_label_position('top')

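# Group the labels beyond the 70% cumulative-frequency mark into 'other' (label 153 and the previous stage's labels excluded)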

In [None]:
# Third stage: label 153 and the labels kept individually by the previous
# stage (`lst1`) are dropped. The most frequent remaining labels (those
# covering the first 70% of the samples) stay as individual classes; every
# label past that cumulative mark is merged into one 'other' class whose id
# is `total`.
data = dataset.copy()
data = data[~data.LABEL.isin([153] + lst1)]
ycounts = data['LABEL'].value_counts()
ybar = ycounts.cumsum() / ycounts.sum()

# Column 0 holds the label, the remaining columns the features.
df = data.to_numpy()
x = df[:, 1:]
y = df[:, 0]

ykeys = ycounts.keys()

# Labels whose cumulative relative frequency exceeds 70%.
yother = [key for key in ykeys if ybar.get(key) > 0.7]

# Use a set for the membership test: one hash lookup per sample instead of a
# scan over the whole `yother` list.
merged = set(yother)
ylabels = [total if label in merged else label for label in y]

ycounts = pd.Series(ylabels).value_counts()
ybar = ycounts / ycounts.sum()
ax = ybar.plot(kind = 'bar', figsize = (15,10))
# Name the bars from the plotted index so the labels always match the data
# (count and order) instead of relying on a hand-copied list.
ax.set_xticklabels(['other' if k == total else int(k) for k in ybar.index], rotation = 0, fontsize = 13)
ax.set_ylabel('Relative Frequency', fontsize = 15)
ax.set_xlabel('Labels', fontsize = 15)

# Stratified 80/20 hold-out split on the regrouped labels.
x_train, x_test, y_train, y_test = train_test_split(x, ylabels, test_size = 0.2, stratify = ylabels, random_state = sd)

# Individually kept labels of this stage, sorted, without the 'other' id;
# reused later for the confusion-matrix ticks and to exclude these labels
# from the final stage.
lst2 = sorted(set(y_test) - {total})

In [None]:
# Time a 3-fold stratified cross-validation of the selected classifier on
# the training split and report the weighted F1 (mean and std over folds).
st = time.time()
model = LogisticRegression(multi_class='multinomial', n_jobs=-1, random_state=sd)
#model = SGDClassifier(loss = 'log', penalty = 'l1', alpha = 0.0001, n_jobs = -1, random_state = sd)
#model = RandomForestClassifier(n_estimators = 75, min_samples_split = 8, n_jobs = -1, random_state = sd)
#model = MLPClassifier(hidden_layer_sizes = (75), alpha = 10e-10, early_stopping = True, random_state = sd)

kfold = StratifiedKFold(n_splits=3)
results = cross_val_score(model, x_train, y_train, scoring='f1_weighted', cv=kfold)
elapsed = time.time() - st
print(elapsed)

mean_pct = results.mean() * 100
std_pct = results.std() * 100
print('F1-score: {:4.2f}% (+-{:3.2f}%)'.format(mean_pct, std_pct))

In [None]:
# Refit the chosen classifier on the whole training split, then report
# accuracy / weighted F1 on the training data and on the held-out test data.
model = LogisticRegression(multi_class='multinomial', n_jobs=-1, random_state=sd)
model.fit(x_train, y_train)

# Training-set scores.
yt = model.predict(x_train)
train_accuracy = accuracy_score(y_train, yt)
train_f1 = f1_score(y_train, yt, average='weighted')
print("Accuracy:", train_accuracy)
print("F1-score:", train_f1)

# Held-out test scores.
yv = model.predict(x_test)
test_accuracy = accuracy_score(y_test, yv)
test_f1 = f1_score(y_test, yv, average='weighted')
print("Accuracy:", test_accuracy)
print("F1-score:", test_f1)
#print(classification_report(y_test, yv, digits=3))

## Confusion Matrix

In [None]:
# Row-normalised confusion matrix of the test predictions as a heatmap,
# with the predicted axis on top.
m = confusion_matrix(y_test, yv, normalize='true')
# 'other' carries the id `total`; it is placed last here, after the sorted
# individual labels.
tick = lst2 + ['other']
fig, ax = plt.subplots(figsize=(30, 15))
conf = sns.heatmap(m, cmap=plt.cm.Blues, ax=ax)
conf.set_ylabel('True Label', fontsize=25)
conf.set_xlabel('Predicted Label', fontsize=25)
conf.set_xticklabels(tick, fontsize=20)
conf.set_yticklabels(tick, fontsize=20)
conf.xaxis.tick_top()
conf.xaxis.set_label_position('top')

# Classify the rest

In [None]:
# Final stage: keep only the samples whose label was not handled individually
# by an earlier stage (153, `lst1`, `lst2`).
data = dataset.copy()
cumlst = lst1 + lst2
data = data.loc[~data['LABEL'].isin([153] + cumlst)]
ycounts = data['LABEL'].value_counts()
ybar = ycounts / ycounts.sum()

# Relative frequency of the remaining labels, most frequent first.
ax = ybar.plot.bar(figsize=(20, 10))
ax.set_ylabel('Relative Frequency', fontsize=15)
ax.set_xlabel('Labels', fontsize=15)
ax.set_xticklabels(ycounts.keys(), fontsize=12)

df = data.to_numpy()
x = df[:, 1:]  # every column but the first: features
y = df[:, 0]   # first column: labels

# NOTE(review): unlike the earlier stages, this 80/20 split is not stratified.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=sd, test_size=0.2)

In [None]:
from keras import backend as K

def f1_weighted(true, pred):
    """Support-weighted F1 score, usable as a Keras metric.

    The predicted scores are hardened to one-hot vectors (arg-max over the
    last axis, `total` classes), per-class precision/recall/F1 are computed
    over the first axis, and the per-class F1 values are averaged with
    weights proportional to each class's number of true samples.

    Args:
        true: target tensor; assumed one-hot with `total` columns (it is
            multiplied element-wise with the one-hot predictions).
        pred: predicted class scores; the last axis indexes the classes.

    Returns:
        Scalar tensor holding the weighted F1 of the given tensors.
    """
    hard_pred = K.one_hot(K.argmax(pred, axis=-1), total)

    support = K.sum(true, axis=0)            # TP + FN per class
    predicted = K.sum(hard_pred, axis=0)     # TP + FP per class
    hits = K.sum(true * hard_pred, axis=0)   # TP per class

    # Epsilon in numerator and denominator keeps the ratios finite; a class
    # with a zero denominator gets a ratio of exactly 1.
    eps = K.epsilon()
    precision = (hits + eps) / (predicted + eps)
    recall = (hits + eps) / (support + eps)

    per_class_f1 = 2 * (precision * recall) / (precision + recall)

    # Classes with no true samples carry zero weight.
    return K.sum(per_class_f1 * support / K.sum(support))

# One-hot encode the integer labels of both splits over all `total` ids, so
# the targets match the width of the softmax output layer.
ytrain, ytest = (
    to_categorical(labels, num_classes=total) for labels in (y_train, y_test)
)

In [None]:
# 10-fold stratified cross-validation of the dense network on the training
# split; each fold trains a fresh model and records its weighted F1 (in %).
kfold = StratifiedKFold(n_splits = 10)
cvscores = []

# Take the input width from the data instead of hard-coding the column count,
# so the network keeps matching the feature matrix if the dataset changes.
n_features = x_train.shape[1]

st = time.time()
for train, val in kfold.split(x_train, y_train):
    # create model
    model = Sequential()
    model.add(Dense(300, input_dim = n_features, activation = 'relu'))
    # NOTE(review): this layer follows a ReLU, whose outputs are already
    # non-negative, so LeakyReLU passes them through unchanged.
    model.add(LeakyReLU())
    model.add(Dropout(0.5))
    model.add(Dense(total, activation = 'softmax'))

    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = [f1_weighted])
    model.fit(x_train[train], ytrain[train], batch_size = 400, epochs = 100, verbose = False)

    # evaluate the model: scores[0] is the loss, scores[1] the compiled metric
    scores = model.evaluate(x_train[val], ytrain[val], verbose = False)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1]*100)
print(time.time()-st)

print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

In [None]:
# Train the final dense network on the whole training split and report
# accuracy / weighted F1 on the training data and on the held-out test data.
model = Sequential()
# Take the input width from the data instead of hard-coding the column count,
# so the network keeps matching the feature matrix if the dataset changes.
model.add(Dense(300, input_dim = x_train.shape[1], activation = 'relu'))
# NOTE(review): this layer follows a ReLU, whose outputs are already
# non-negative, so LeakyReLU passes them through unchanged.
model.add(LeakyReLU())
model.add(Dropout(0.5))
model.add(Dense(total, activation = 'softmax'))

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = [f1_weighted])
model.fit(x_train, ytrain, batch_size = 400, epochs = 100, verbose = False)

# The network outputs one score per label id; arg-max turns them back into
# label ids comparable with y_train / y_test.
yt = np.argmax(model.predict(x_train), axis=-1)
print("Accuracy:", accuracy_score(y_train, yt))
print("F1-score:", f1_score(y_train, yt, average = 'weighted'))

yv = np.argmax(model.predict(x_test), axis=-1)
print("Accuracy:", accuracy_score(y_test, yv))
print("F1-score:", f1_score(y_test, yv, average = 'weighted'))
#print(classification_report(y_test, yv, digits=3))

## Confusion Matrix

In [None]:
# Row-normalised confusion matrix of the test predictions as a heatmap,
# with the predicted axis on top.
#
# The matrix rows/columns are the labels that occur in y_test or yv, which
# need not be the labels occurring in y_train (the split is not stratified).
# Fix the label order explicitly and use that same list for the ticks so the
# axis labels always line up with the matrix.
labels = np.unique(np.concatenate((np.asarray(y_test), np.asarray(yv))))
m = confusion_matrix(y_test, yv, labels = labels, normalize = 'true')
tick = labels.tolist()
fig, ax = plt.subplots(figsize = (40,20))
# Passing the labels to seaborn forces one tick per row/column; its default
# may thin the ticks on a matrix this large, breaking the relabelling below.
conf = sns.heatmap(m, cmap=plt.cm.Blues, xticklabels = tick, yticklabels = tick)
plt.ylabel('True Label', fontsize = 25)
plt.xlabel('Predicted Label', fontsize = 25)
conf.set_xticklabels(tick, fontsize = 12)
conf.set_yticklabels(tick, fontsize = 12, rotation = 0)
conf.xaxis.tick_top()
conf.xaxis.set_label_position('top')