<a href="https://colab.research.google.com/github/jvsamonek/TCC_FMA_IMBALANCE/blob/main/TCC_FMA_ORIGINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Iniciador de ambiente

In [None]:
#!ls "drive/My Drive/TCC/fma90k/ARFFs"

# Load the Drive helper and mount
from google.colab import drive
# This will prompt for authorization.
drive.mount('/content/drive')

from scipy.io import arff
import pandas as pd
from scipy.sparse import csr_matrix
import re 
import numpy as np
import os
import itertools
import collections
import random

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
!pip install scikit-multilearn
from skmultilearn.ensemble import RakelO
import sklearn.metrics as metrics
from skmultilearn.adapt import MLkNN
from sklearn.tree import DecisionTreeClassifier

from IPython.display import clear_output
import matplotlib.pyplot as plt
from tempfile import TemporaryFile

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Declaração de funções de apoio

In [None]:
#classe para validação
"""Classe de validador da base, que possui funções para retornar a média de balanceamento da base.
"""
class validatorB:
  """Dataset validator that reports how imbalanced a multi-label base is.

  The main entry point is ``meanIR``, which returns the mean imbalance ratio
  (MeanIR) of a label matrix: the average, over all labels, of
  ``count(most frequent label) / count(label)``.
  """

  def __init__(self):
    self.y = np.array([])       # dense label matrix, set by meanIR
    self.full_label_set = []    # label column indices, set by meanIR
    self.labels = []            # label names read from the dataframe, set by meanIR

  def sum_h(self,l):
    """Return how many instances of ``self.y`` have label column ``l`` equal to 1."""
    dense = np.asarray(self.y)
    if dense.ndim < 2:
      # Nothing loaded yet: there are no instances to count.
      return 0
    return int(np.count_nonzero(dense[:, l] == 1))

  def get_imbalance_ratio_per_label(self,l):
    """Return the imbalance ratio of label ``l``: max label count / count of ``l``.

    A label without instances yields ``inf`` (numpy division by zero).
    """
    dense = np.asarray(self.y)
    label_ids = np.asarray(self.full_label_set, dtype=int)
    # One vectorised pass instead of recounting every label with Python loops.
    counts = np.count_nonzero(dense[:, label_ids] == 1, axis=0)
    return counts.max()/self.sum_h(l)

  def meanIR(self,y,D):
    """Return the mean imbalance ratio of the label matrix ``y``.

    Args:
      y: sparse label matrix exposing ``toarray()``, one column per label.
      D: dataframe of the whole base; only used to read the label names
        through ``getCaracteristicas``.
    """
    self.y = y.toarray()
    _, self.labels = getCaracteristicas(D)
    # Use the labels just read from D, not whatever global `labels` happens to exist.
    self.full_label_set = np.arange(len(self.labels))
    counts = np.count_nonzero(self.y[:, self.full_label_set] == 1, axis=0)
    ratios = counts.max()/counts
    return np.sum(ratios)/self.full_label_set.shape[0]

#Funçoes de apoio

"""Retorna features da base.

Entradas:
D = Dataframe com a base completa
"""
def getCaracteristicas(D):
  """Split the columns of ``D`` into feature names and label names.

  A column is a feature when its name matches ``feature.*`` from the start of
  the name; every other column is a label.

  Args:
    D: dataframe with the whole base.

  Returns:
    (features, labels): two lists of column names, both in the order in which
    the columns appear in ``D``. Labels are de-duplicated.
  """
  all_columns = D.columns.to_list()
  feature_pattern = re.compile("feature.*")
  features = [column for column in all_columns if feature_pattern.match(column)]
  feature_names = set(features)
  # Keep the dataframe order: a set difference returns the labels in an
  # arbitrary order that can change between runs.
  labels = [column for column in dict.fromkeys(all_columns) if column not in feature_names]
  return features, labels

"""Retorna cardinalidade de rótulo da base.

Entradas:
D = Dataframe com a base completa
"""
def cardinalidadeDeRotulo(D):
  """Return the label cardinality of the base.

  This is the number of label cells equal to 1 divided by the number of
  instances, i.e. the average number of active labels per instance.

  Args:
    D: dataframe with the whole base.
  """
  _, label_columns = getCaracteristicas(D)
  instance_count = D.shape[0]
  label_rows = D[:][label_columns].values
  # Integer sum of the active labels found in each row.
  active_labels = sum(np.count_nonzero(row == 1) for row in label_rows)
  return (1/instance_count) * active_labels

"""Retorna densidade de rótulo da base.

Entradas:
D = Dataframe com a base completa
"""
def densidadeDeRotulo(D):
  """Return the label density of the base.

  For every instance the fraction of labels equal to 1 is computed; the
  result is the mean of those fractions over all instances.

  Args:
    D: dataframe with the whole base.
  """
  _, label_columns = getCaracteristicas(D)
  label_count = len(label_columns)
  instance_count = D.shape[0]
  accumulated_density = 0
  for label_row in D[:][label_columns].values:
    accumulated_density += np.count_nonzero(label_row == 1)/label_count
  return (1/instance_count) * accumulated_density


"""Retorna hamming loss da predição, partindo das predições retornadas por um classificador.

Entradas:
D = Dataframe com a base completa
predictions = saída do classíficador
y_test = resultados reais
"""
def hammingLoss(D, predictions, y_test):
  """Return the Hamming loss of ``predictions`` against ``y_test``.

  For each instance the number of label positions where prediction and truth
  differ is divided by the number of labels of the base; the result is the
  mean of those values over all instances.

  Args:
    D: dataframe with the whole base (provides the number of labels).
    predictions: classifier output, indexable by row and supporting ``astype``.
    y_test: true labels as a sparse matrix whose rows expose ``toarray()``.
  """
  _, label_columns = getCaracteristicas(D)
  label_count = len(label_columns)
  instance_count = predictions.shape[0]
  accumulated_loss = 0
  for row in range(instance_count):
    predicted = predictions[row].astype(int)
    expected = y_test[row].toarray().astype(int)
    # XOR leaves a 1 exactly where the two label sets disagree.
    disagreements = np.bitwise_xor(predicted, expected)
    accumulated_loss += np.count_nonzero(disagreements == 1)/label_count
  return (1/instance_count) * accumulated_loss

"""Retorna acuracia da predição, partindo das predições retornadas por um classificador.

Entradas:
D = Dataframe com a base completa
predictions = saída do classíficador
y_test = resultados reais
"""
def acuracia(D, predictions, y_test):
  """Return the multi-label accuracy of ``predictions`` against ``y_test``.

  For each instance the score is ``|predicted AND true| / |predicted OR true|``;
  the result is the mean score over all instances. An instance where both the
  prediction and the truth are empty contributes 0 instead of raising
  ZeroDivisionError, mirroring how ``precisao`` treats an empty prediction.

  Args:
    D: dataframe with the whole base. Not used by the computation; kept so
      existing callers keep working.
    predictions: classifier output, indexable by row and supporting ``astype``.
    y_test: true labels as a sparse matrix whose rows expose ``toarray()``.
  """
  instance_count = predictions.shape[0]
  accumulated_score = 0
  for row in range(instance_count):
    predicted = predictions[row].astype(int)
    expected = y_test[row].toarray().astype(int)
    union = np.count_nonzero(np.bitwise_or(predicted, expected) == 1)
    if union == 0:
      # No label predicted and none expected: the ratio is undefined.
      continue
    inter = np.count_nonzero(np.bitwise_and(predicted, expected) == 1)
    accumulated_score += inter/union
  return (1/instance_count) * accumulated_score

"""Retorna precisao da predição, partindo das predições retornadas por um classificador.

Entradas:
D = Dataframe com a base completa
predictions = saída do classíficador
y_test = resultados reais
"""
def precisao(D, predictions, y_test):
  """Return the example-based precision of ``predictions`` against ``y_test``.

  For each instance the score is ``|predicted AND true| / |predicted|``; an
  instance with no predicted label contributes 0. The result is the mean score
  over all instances.

  Args:
    D: dataframe with the whole base. Not used by the computation; kept so
      existing callers keep working.
    predictions: classifier output, indexable by row and supporting ``astype``.
    y_test: true labels as a sparse matrix whose rows expose ``toarray()``.
  """
  instance_count = predictions.shape[0]
  accumulated_score = 0
  for row in range(instance_count):
    predicted = predictions[row].astype(int)
    expected = y_test[row].toarray().astype(int)
    predicted_count = np.count_nonzero(predicted == 1)
    if predicted_count > 0:
      inter = np.count_nonzero(np.bitwise_and(predicted, expected) == 1)
      accumulated_score += inter/predicted_count
  return (1/instance_count) * accumulated_score

#Funçoes para manter base
"""Retira instâncias com generos de pouca relevãncia, retornando um dataframe reamostrado e rótulos restantes.

Entradas:
N = Quantidade minima que cada classe deve ter de instâncias
DF = Dataframe com a base completa 
"""
def dropGenresWithLessThan(N, DF, labels=None):
  """Remove rare genres from the base.

  A genre is rare when fewer than ``N`` rows have it set to 1. Counts are taken
  once, on the dataframe as received. Every row whose value for a rare genre
  is different from 0 is dropped, and then the rare genre columns themselves.

  Args:
    N: minimum number of instances a genre must have to be kept.
    DF: dataframe with the whole base.
    labels: optional list of genre column names. When omitted, every column
      whose name does not match ``feature.*`` is treated as a genre, so the
      function no longer depends on (or mutates) a global ``labels`` list.

  Returns:
    (DF, labels): the filtered dataframe and the list of remaining genres, in
    their original order.
  """
  if labels is None:
    feature_pattern = re.compile("feature.*")
    labels = [column for column in DF.columns if not feature_pattern.match(str(column))]
  else:
    # Work on a copy so the caller's list is never modified in place.
    labels = list(labels)

  genre_counts = (DF[labels] == 1).sum()
  rare_genres = [genre for genre in labels if genre_counts[genre] < N]
  if rare_genres:
    has_rare_genre = (DF[rare_genres] != 0).any(axis=1)
    DF = DF.loc[~has_rare_genre].drop(columns=rare_genres)
    rare_names = set(rare_genres)
    labels = [genre for genre in labels if genre not in rare_names]
  print(labels)
  return DF, labels

  #metricas

"""Mostra na tela métricas obtidas com base no sciKit.

Entradas:
predictions = saída do classíficador
y_test = resultados reais
"""
def sciMetricas(y_test, predictions):
  """Print the micro-averaged precision computed by scikit-learn.

  Args:
    y_test: true labels.
    predictions: classifier output.
  """
  print("Metrica do pacote do sklearn:")
  micro_precision = metrics.precision_score(y_test, predictions, average = 'micro')
  print('Classifier precision score:',round(micro_precision,3))

"""Mostra na tela métricas obtidas com base no desenvolvido para o projeto.

Entradas:
predictions = saída do classíficador
y_test = resultados reais
"""
def minhasMetricas(df, y_test, predictions):
  """Print the precision computed by this project's own ``precisao`` metric.

  Args:
    df: dataframe with the whole base, forwarded to ``precisao``.
    y_test: true labels.
    predictions: classifier output.
  """
  print("Metrica exposta na proposta:")
  project_precision = precisao(df, predictions, y_test)
  print('Classifier precision score:',round(project_precision,3))

# Declaração de algoritmos de Balanceamento

MLROS

In [None]:

class MLROS:
  """Multi-label random oversampling (ML-ROS).

  Labels whose imbalance ratio is above the mean imbalance ratio are treated as
  minority labels. Randomly chosen instances of those labels are cloned until
  the clone budget is spent or no minority label is left.
  """

  def __init__(self):
    self.labelsInDataset = []  # label column indices, set by fit_resample
    self.instances=[]          # dense label matrix, one row per instance
    self.features=[]           # feature matrix, one row per instance

  def fit_resample(self,D,X,y,P):
    """Return the resampled X and y.

    Args:
      D: dataframe with the whole base. Not used: the number of labels is read
        from ``y`` itself. Kept so existing callers keep working.
      X: features of each instance (numpy array or object exposing ``.values``).
      y: multi-label classes of each instance, a sparse matrix exposing
        ``toarray()``.
      P: percentage of ``len(X)`` that may be cloned.

    Returns:
      (X_resampled, y_resampled): a numpy array and a ``csr_matrix``; clones
      are appended after the original rows.
    """
    self.instances = y.toarray()
    self.features = X if isinstance(X,np.ndarray) else X.values
    self.labelsInDataset = np.arange(self.instances.shape[1])
    samplesToClone = int((len(X)/100)*P)

    mean_ir = self.get_mean_imbalance_ratio()
    print("MeanIR Original: ",mean_ir)

    # Label counts are updated incrementally while cloning, so no full recount
    # of the dataset is needed after every clone.
    label_counts = self._label_counts().astype(float)

    # One bag of instance ids per minority label.
    min_bags = {}
    for label in self.labelsInDataset:
      if self._imbalance_ratio(label_counts, label) > mean_ir:
        min_bags[label] = self.get_all_instances_of_label(label)

    cloned_ids = []
    # Stop when the budget is spent OR no minority bag is left; the second
    # condition prevents spinning forever once every bag has been balanced.
    while samplesToClone > 0 and min_bags:
      for label in list(min_bags.keys()):
        if samplesToClone <= 0:
          break
        bag = min_bags[label]
        if len(bag) == 0:
          min_bags.pop(label)
          continue
        # Pick an instance id FROM the bag; a random number in
        # [1, len(bag)] is not a valid id of a minority instance.
        chosen = int(bag[random.randrange(len(bag))])
        cloned_ids.append(chosen)
        label_counts += self.instances[chosen]
        samplesToClone -= 1
        if self._imbalance_ratio(label_counts, label) <= mean_ir:
          min_bags.pop(label)

    if cloned_ids:
      # Append every clone in one go instead of one np.insert per clone.
      self.features = np.concatenate([self.features, self.features[cloned_ids]], axis=0)
      self.instances = np.concatenate([self.instances, self.instances[cloned_ids]], axis=0)
    return np.array(self.features), csr_matrix(self.instances)

  def _label_counts(self):
    """Return the per-column sum of the label matrix (one value per label)."""
    dense = np.asarray(self.instances)
    if dense.ndim < 2:
      return np.zeros(len(self.labelsInDataset))
    return dense.sum(axis=0)

  @staticmethod
  def _imbalance_ratio(label_counts, l):
    """Return ``max(label_counts) / label_counts[l]`` (``inf`` for an absent label)."""
    with np.errstate(divide='ignore', invalid='ignore'):
      return label_counts.max()/label_counts[l]

  def get_all_instances_of_label(self,label):
    """Return the row indices of the instances whose ``label`` column equals 1."""
    dense = np.asarray(self.instances)
    if dense.ndim < 2:
      return np.array([], dtype=int)
    return np.flatnonzero(dense[:, label] == 1)

  def get_mean_imbalance_ratio(self):
    """Return the mean of the imbalance ratios of the labels in ``labelsInDataset``."""
    label_ids = np.asarray(self.labelsInDataset, dtype=int)
    if label_ids.size == 0:
      return float('nan')
    counts = self._label_counts()[label_ids]
    with np.errstate(divide='ignore', invalid='ignore'):
      ratios = counts.max()/counts
    return np.sum(ratios)/label_ids.shape[0]

  def get_imbalance_ratio_per_label(self,l):
    """Return the imbalance ratio of label ``l``: max label count / count of ``l``."""
    counts = self._label_counts()
    label_ids = np.asarray(self.labelsInDataset, dtype=int)
    with np.errstate(divide='ignore', invalid='ignore'):
      return counts[label_ids].max()/counts[l]

  def sum_h(self,l):
    """Return the sum of column ``l`` of the label matrix.

    Each cell is 1 when the instance has the label and 0 otherwise, so the sum
    is the number of instances of label ``l``.
    """
    dense = np.asarray(self.instances)
    if dense.ndim < 2:
      return 0
    return dense[:, l].sum()


MLRUS

In [None]:
class MLRUS:
  """Multi-label random undersampling (ML-RUS).

  Labels whose imbalance ratio is below the mean imbalance ratio are treated as
  majority labels. Randomly chosen instances of those labels are removed until
  the removal budget is spent or no majority label is left.
  """

  def __init__(self):
    self.labelsInDataset = []  # label column indices, set by fit_resample
    self.instances=[]          # dense label matrix, one row per instance
    self.features=[]           # feature matrix, one row per instance

  def fit_resample(self,D,X,y,P):
    """Return the resampled X and y.

    Args:
      D: dataframe with the whole base. Not used: the number of labels is read
        from ``y`` itself. Kept so existing callers keep working.
      X: features of each instance (numpy array or object exposing ``.values``).
      y: multi-label classes of each instance, a sparse matrix exposing
        ``toarray()``.
      P: percentage of ``len(X)`` that may be removed.

    Returns:
      (X_resampled, y_resampled): a numpy array and a ``csr_matrix`` with the
      surviving rows in their original order.
    """
    self.instances = y.toarray()
    self.features = X if isinstance(X,np.ndarray) else X.values
    self.labelsInDataset = np.arange(self.instances.shape[1])
    samplesToRemove = int((len(X)/100)*P)

    mean_ir = self.get_mean_imbalance_ratio()
    print("MeanIR Original: ",mean_ir)

    # Label counts are updated incrementally while removing, so no full recount
    # of the dataset is needed after every removal.
    label_counts = self._label_counts().astype(float)

    # One shuffled bag of instance ids per majority label; popping from a
    # shuffled list is a random draw without replacement.
    max_bags = {}
    for label in self.labelsInDataset:
      if self._imbalance_ratio(label_counts, label) < mean_ir:
        candidates = self.get_all_instances_of_label(label).tolist()
        random.shuffle(candidates)
        max_bags[label] = candidates

    # Rows are only marked here and removed once at the end, so the ids stored
    # in the bags stay valid during the whole loop.
    keep = np.ones(len(self.instances), dtype=bool)
    # Stop when the budget is spent OR no majority bag is left; the second
    # condition prevents spinning forever once every bag has been emptied.
    while samplesToRemove > 0 and max_bags:
      for label in list(max_bags.keys()):
        if samplesToRemove <= 0:
          break
        candidates = max_bags[label]
        # Skip instances already removed through another label's bag.
        while candidates and not keep[candidates[-1]]:
          candidates.pop()
        if not candidates:
          max_bags.pop(label)
          continue
        chosen = candidates.pop()
        keep[chosen] = False
        label_counts -= self.instances[chosen]
        samplesToRemove -= 1
        if self._imbalance_ratio(label_counts, label) >= mean_ir:
          max_bags.pop(label)

    self.features = self.features[keep]
    self.instances = self.instances[keep]
    return np.array(self.features), csr_matrix(self.instances)

  def _label_counts(self):
    """Return the per-column sum of the label matrix (one value per label)."""
    dense = np.asarray(self.instances)
    if dense.ndim < 2:
      return np.zeros(len(self.labelsInDataset))
    return dense.sum(axis=0)

  @staticmethod
  def _imbalance_ratio(label_counts, l):
    """Return ``max(label_counts) / label_counts[l]`` (``inf`` for an absent label)."""
    with np.errstate(divide='ignore', invalid='ignore'):
      return label_counts.max()/label_counts[l]

  def get_all_instances_of_label(self,label):
    """Return the row indices of the instances whose ``label`` column equals 1."""
    dense = np.asarray(self.instances)
    if dense.ndim < 2:
      return np.array([], dtype=int)
    return np.flatnonzero(dense[:, label] == 1)

  def get_mean_imbalance_ratio(self):
    """Return the mean of the imbalance ratios of the labels in ``labelsInDataset``."""
    label_ids = np.asarray(self.labelsInDataset, dtype=int)
    if label_ids.size == 0:
      return float('nan')
    counts = self._label_counts()[label_ids]
    with np.errstate(divide='ignore', invalid='ignore'):
      ratios = counts.max()/counts
    return np.sum(ratios)/label_ids.shape[0]

  def get_imbalance_ratio_per_label(self,l):
    """Return the imbalance ratio of label ``l``: max label count / count of ``l``."""
    counts = self._label_counts()
    label_ids = np.asarray(self.labelsInDataset, dtype=int)
    with np.errstate(divide='ignore', invalid='ignore'):
      return counts[label_ids].max()/counts[l]

  def sum_h(self,l):
    """Return the sum of column ``l`` of the label matrix.

    Each cell is 1 when the instance has the label and 0 otherwise, so the sum
    is the number of instances of label ``l``.
    """
    dense = np.asarray(self.instances)
    if dense.ndim < 2:
      return 0
    return dense[:, l].sum()


MLSMOTE

In [None]:
class MLSMOTE:
  """Multi-label SMOTE (MLSMOTE).

  For every instance of every minority label (imbalance ratio above the mean
  imbalance ratio) a synthetic instance is created by interpolating its
  features towards one of its nearest neighbours inside the same label bag.
  The synthetic label set holds the labels present in more than half of the
  reference instances (the sample plus its neighbours).
  """

  def __init__(self,k):
    self.k=k                   # neighbour count given at construction
    self.labelsInDataset = []  # label column indices, set by fit_resample
    self.instances=[]          # dense label matrix, one row per instance
    self.features=[]           # feature matrix, one row per instance

  def fit_resample(self,D,X,y,k):
    """Return the resampled X and y.

    Args:
      D: dataframe with the whole base. Not used: the number of labels is read
        from ``y`` itself. Kept so existing callers keep working.
      X: features of each instance (numpy array or object exposing ``.values``).
      y: multi-label classes of each instance, a sparse matrix exposing
        ``toarray()``.
      k: number of nearest neighbours considered for each minority sample.

    Returns:
      (X_resampled, y_resampled): a numpy array and a ``csr_matrix``; synthetic
      rows are appended after the original rows.
    """
    self.instances = y.toarray()
    self.features = X if isinstance(X,np.ndarray) else X.values
    self.labelsInDataset = np.arange(self.instances.shape[1])

    mean_ir = self.get_mean_imbalance_ratio()
    print("MeanIR Original: ",mean_ir)

    # Counted once: no synthetic sample is added to self.instances in the loop.
    label_counts = self._label_counts()
    X_synth=[]
    y_synth=[]
    for label in self.labelsInDataset:
      with np.errstate(divide='ignore', invalid='ignore'):
        irlbl = label_counts.max()/label_counts[label]
      if not irlbl > mean_ir:
        continue
      # Bag with every instance of this minority label.
      min_bag = self.get_all_instances_of_label(label)
      for sample in min_bag:
        distances = np.sort(self.calc_distances(sample,min_bag),order='distance')
        # Position 0 is the closest entry, normally the sample itself.
        neighbour_ids = distances['index'][1:k+1]
        if neighbour_ids.size == 0:
          # A bag with a single sample has nothing to interpolate towards.
          continue
        ref_neigh_id = np.random.choice(neighbour_ids)
        X_new,y_new = self.create_new_sample(sample,ref_neigh_id,neighbour_ids.tolist())
        X_synth.append(X_new)
        y_synth.append(y_new)

    if not X_synth:
      return np.array(self.features), csr_matrix(self.instances)
    X_resampled = np.vstack([self.features, np.array(X_synth)])
    y_resampled = np.vstack([self.instances, np.array(y_synth)])
    return X_resampled, csr_matrix(y_resampled)

  def create_new_sample(self,sample_id,ref_neigh_id,neighbour_ids):
    """Create one synthetic instance from ``sample_id``.

    Args:
      sample_id: row index of the seed instance.
      ref_neigh_id: row index of the neighbour used for feature interpolation.
      neighbour_ids: row indices of every neighbour used for the label vote.

    Returns:
      (X, y): the synthetic feature vector and its label vector (floats, 0/1).
    """
    sample = np.asarray(self.features[sample_id], dtype=float)
    ref_neigh = np.asarray(self.features[ref_neigh_id], dtype=float)
    # Numeric features only (nominal support is not implemented): each feature
    # moves a random fraction of the way towards the reference neighbour.
    gaps = np.array([random.uniform(0,1) for _ in range(sample.shape[0])])
    synth_sample = sample + (ref_neigh - sample)*gaps

    # Label vote over the neighbours plus the seed sample itself.
    reference_ids = list(neighbour_ids) + [sample_id]
    reference_labels = np.asarray(self.instances)[reference_ids]
    occurences = np.count_nonzero(reference_labels == 1, axis=0)
    # Majority of the references actually available; using self.k here drops
    # labels whenever fewer than k neighbours exist.
    labels = (occurences > len(reference_ids)/2).astype(float)

    return synth_sample, labels

  def calc_distances(self,sample,min_bag):
    """Return the Euclidean distance from ``sample`` to every instance in ``min_bag``.

    The result is a structured array with fields ``distance`` (float) and
    ``index`` (int, the instance id), in the same order as ``min_bag``.
    """
    dtype =  np.dtype([('distance', float), ('index', int)])
    bag_ids = np.asarray(min_bag, dtype=int)
    distances = np.empty(bag_ids.shape[0], dtype=dtype)
    distances['index'] = bag_ids
    if bag_ids.shape[0]:
      # Numeric features only (nominal support is not implemented).
      offsets = self.features[bag_ids]-self.features[sample]
      distances['distance'] = np.linalg.norm(offsets, axis=1)
    return distances

  def _label_counts(self):
    """Return the per-column sum of the label matrix (one value per label)."""
    dense = np.asarray(self.instances)
    if dense.ndim < 2:
      return np.zeros(len(self.labelsInDataset))
    return dense.sum(axis=0)

  def get_all_instances_of_label(self,label):
    """Return the row indices of the instances whose ``label`` column equals 1."""
    dense = np.asarray(self.instances)
    if dense.ndim < 2:
      return np.array([], dtype=int)
    return np.flatnonzero(dense[:, label] == 1)

  def get_mean_imbalance_ratio(self):
    """Return the mean of the imbalance ratios of the labels in ``labelsInDataset``."""
    label_ids = np.asarray(self.labelsInDataset, dtype=int)
    if label_ids.size == 0:
      return float('nan')
    counts = self._label_counts()[label_ids]
    with np.errstate(divide='ignore', invalid='ignore'):
      ratios = counts.max()/counts
    return np.sum(ratios)/label_ids.shape[0]

  def get_imbalance_ratio_per_label(self,l):
    """Return the imbalance ratio of label ``l``: max label count / count of ``l``."""
    counts = self._label_counts()
    label_ids = np.asarray(self.labelsInDataset, dtype=int)
    with np.errstate(divide='ignore', invalid='ignore'):
      return counts[label_ids].max()/counts[l]

  def sum_h(self,l):
    """Return the sum of column ``l`` of the label matrix.

    Each cell is 1 when the instance has the label and 0 otherwise, so the sum
    is the number of instances of label ``l``.
    """
    dense = np.asarray(self.instances)
    if dense.ndim < 2:
      return 0
    return dense[:, l].sum()

  def get_value_counts(self,labels):
    """Return how many times each distinct value occurs in ``labels``.

    Counts follow the sorted order of the distinct values (``np.unique``).
    """
    count_map=np.array(np.unique(labels, return_counts=True)).T
    counts=np.array([x[1] for x in count_map])
    return counts

# Consultar e formatar FMA Original
Código sequencial para carregar a base FMA guardada no Drive do ambiente e tratá-la para que seja utilizável nas próximas etapas.

Carregar e guardar variáveis com informações sobre a base

In [None]:
# Load the table with every audio feature. Rows 0-2 are dropped
# (presumably leftover header rows of the CSV; confirm against the file).
features_df = pd.read_csv('drive/My Drive/TCC/fma_metadata/features.csv').drop([0, 1, 2])
# Load the table with every track.
tracks = pd.read_csv('drive/My Drive/TCC/fma_metadata/tracks.csv')
# Keep only the track id and genre-list columns, drop rows 0-1 and rename the rest.
track_genres = tracks[['Unnamed: 0','track.8']].drop([0,1]).rename(columns = {"Unnamed: 0": "track_id", "track.8" : "genres"})
# Genre ids and names available in the base.
genres = pd.read_csv('drive/My Drive/TCC/fma_metadata/genres.csv')[['genre_id','title']]
# Turn the genre list of each track into one binary column per genre id, the
# layout expected by the scikit tools. regex=False makes the three replacements
# literal on every pandas version ('[' is not a valid regular expression).
genre_lists = track_genres.iloc[:,1]
for unwanted in (' ', '[', ']'):
  genre_lists = genre_lists.str.replace(unwanted, '', regex=False)
labels_df = genre_lists.str.get_dummies(sep=',')
# Map every genre-id column to the genre title and rename the columns with it.
dict_genres = {}
for column in labels_df.columns:
  dict_genres[column] = genres.loc[genres['genre_id'] == int(column)]['title'].values[0]
labels_df = labels_df.rename(columns = dict_genres)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


Quebrar a base em features e labels

In [None]:
dict_ft_names = {}
ft_number = 0
# Feature families are the text before the first '.' of each column name, in
# order of first appearance. The FMA paper recommends families 4 and 8.
feature_types = list(dict.fromkeys(column.split('.')[0] for column in features_df.columns))
print("Quais tipos de features gostaria de usar?(em caso de multiplos, separar por espaço)")
print("1 - 'chroma_cens'\n2 -'chroma_cqt'\n3 -'chroma_stft\n4 - 'mfcc'\n5 - 'rmse'\n6 - 'spectral_bandwidth'\n7 - 'spectral_centroid'\n8 - 'spectral_contrast'\n9 - 'spectral_rolloff'\n10 - 'tonnetz'\n11 - 'zcr'")
features_input = input()
# Validate the answer before using it as an index: position 0 is the id
# column 'feature', which is always added below and is not a selectable family.
selected_types = []
for token in features_input.split():
  if not token.isdecimal() or not 1 <= int(token) < len(feature_types):
    raise ValueError(f"Tipo de feature inválido: '{token}' (use números de 1 a {len(feature_types) - 1})")
  feature_type = feature_types[int(token)]
  if feature_type not in selected_types:
    selected_types.append(feature_type)
if not selected_types:
  raise ValueError("Nenhum tipo de feature foi selecionado")
# Keep the id column first, then the columns of each selected family. The
# family is compared exactly rather than by substring.
list_of_features = ['feature']
for feature_type in selected_types:
  list_of_features.extend(column for column in features_df.columns if column.split('.')[0] == feature_type)
features_df = features_df[list_of_features]
# Rename the columns to feature0 (the id column), feature1, ..., featureX.
for column in features_df.columns:
  dict_ft_names[column] = 'feature' + str(ft_number)
  ft_number = ft_number + 1
features_df = features_df.rename(columns = dict_ft_names)

# Give both tables the same 0..n-1 index so they can be concatenated side by side.
features_df = features_df.reset_index(drop=True)
labels_df = labels_df.reset_index(drop=True)

Quais tipos de features gostaria de usar?(em caso de multiplos, separar por espaço)
1 - 'chroma_cens'
2 -'chroma_cqt'
3 -'chroma_stft
4 - 'mfcc'
5 - 'rmse'
6 - 'spectral_bandwidth'
7 - 'spectral_centroid'
8 - 'spectral_contrast'
9 - 'spectral_rolloff'
10 - 'tonnetz'
11 - 'zcr'
4 8


Finalizar as normalizações na base para ser usada juntamente ao SciKit

In [None]:
# Build the full dataframe and force every column to be numeric
# (non-numeric cells become NaN), avoiding errors in the scikit tools.
df = pd.concat([features_df, labels_df], axis=1)
for column in df:
  df[column] =  pd.to_numeric(df[column], errors='coerce')

# Split the column names into features and labels. 'feature0' is the track id:
# it matches the feature pattern but is not used as a feature.
list_of_columns = df.columns.to_list()
r = re.compile("feature.*")
features = [column for column in list_of_columns if r.match(column) and column != 'feature0']
# Dataframe order is kept so the label columns of y are the same on every run.
labels = [column for column in list_of_columns if not r.match(column)]
# Drop the genres with fewer than 5 instances (the original note says the
# smallest genre in FMA has 9 instances).
df, labels = dropGenresWithLessThan(5, df)
# Drop the tracks left without any genre.
has_genre = (df[labels].to_numpy() != 0).any(axis=1)
genderless_rows_to_drop = df.index[~has_genre].tolist()
df = df.drop(genderless_rows_to_drop)
# Remove the column that is irrelevant for classification (the track id).
track_ids_df = df['feature0']
df = df.drop(columns=['feature0'])
# Store the base as X and y, ready for the scikit tools. astype(int) raises on
# NaN, so a misaligned label table cannot slip through silently.
y = csr_matrix(df[labels].astype(int).to_numpy())
X = df[features]
# Every later cell uses this train/test split; create it here so the notebook
# runs from a fresh kernel. random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
v = validatorB()

['Minimal Electronic', 'Holiday', 'Progressive', 'Salsa', 'Bigbeat', 'Techno', 'Polka', 'Modern Jazz', 'Reggae - Dancehall', 'Jazz', 'Chiptune', 'Unclassifiable', 'Latin America', 'Abstract Hip-Hop', 'Chip Music', 'Loud-Rock', 'Trip-Hop', 'Thrash', 'New Age', 'Nerdcore', 'Chamber Music', 'Sound Collage', 'Sound Poetry', 'Psych-Folk', 'Skweee', 'Novelty', 'Compilation', 'Nu-Jazz', 'Instrumental', 'Hip-Hop', 'Improv', 'Surf', 'Kid-Friendly', 'Noise', 'Reggae - Dub', 'Free-Folk', 'Minimalism', 'Afrobeat', 'Singer-Songwriter', 'Symphony', 'Downtempo', 'Chill-out', 'Comedy', 'Sludge', 'Dance', 'Experimental Pop', 'Lo-Fi', 'Space-Rock', 'Lounge', 'Goth', 'Post-Punk', 'Glitch', 'Spanish', 'Free-Jazz', 'Noise-Rock', 'Alternative Hip-Hop', 'Easy Listening: Vocal', 'British Folk', 'Folk', 'Psych-Rock', 'Audio Collage', 'Garage', 'Freak-Folk', 'Celtic', 'Ambient', 'Drone', 'Gospel', 'Europe', 'Blues', 'Jazz: Out', 'Breakcore - Hard', 'Funk', 'Balkan', 'Drum & Bass', 'No Wave', 'Fado', 'Bluegrass'

Caracteristicas gerais da base

In [None]:
# Summary of the prepared base: label count, label cardinality, label density.
total_de_rotulos = len(labels)
print("O numero de rótulos na base é:", total_de_rotulos)
cardinalidade_da_base = cardinalidadeDeRotulo(df)
print("A cardinalidade de rótulo é:", cardinalidade_da_base)
densidade_da_base = densidadeDeRotulo(df)
print("A densidade de rótulo é:", densidade_da_base)
#print("A média da razão de desbalanceamento (MeanIR) é:", v.meanIR(y,df))

O numero de rótulos na base é: 158
A cardinalidade de rótulo é: 2.4329844537734586
A densidade de rótulo é: 0.015398635783359307


# Balanceamento

In [None]:
# Oversample the training split with ML-ROS. The last argument (4.76) is the
# percentage of the training-set size that fit_resample turns into its clone budget.
mlros = MLROS()
X_train_MLROS, y_train_MLROS = mlros.fit_resample(df,X_train, y_train, 4.76)
# Mean imbalance ratio of the resampled labels, for comparison with the original.
meanIR_MLROS = v.meanIR(y_train_MLROS, df)
print("MeanIR resampled:", meanIR_MLROS)
print(f"Tamanho Original:{len(X_train)}")
print(f"Tamanho resampled:{len(X_train_MLROS)}")

Uso do MLRUS

In [None]:
# Undersample the training split with ML-RUS. The last argument (5.26) is the
# percentage of the training-set size that fit_resample turns into its removal budget.
mlrus = MLRUS()
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train_MLRUS, y_train_MLRUS = mlrus.fit_resample(df,X_train, y_train, 5.26)
# Mean imbalance ratio of the resampled labels, for comparison with the original.
meanIR_MLRUS = v.meanIR(y_train_MLRUS, df)
print("MeanIR resampled:", meanIR_MLRUS)
print(f"Tamanho Original:{len(X_train)}")
print(f"Tamanho resampled:{len(X_train_MLRUS)}")

Uso do MLSMOTE

In [None]:
# Add synthetic minority samples to the training split with MLSMOTE, using
# 3 nearest neighbours (passed both to the constructor and to fit_resample).
mlsmote = MLSMOTE(3)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train_MLSMOTE, y_train_MLSMOTE = mlsmote.fit_resample(df,X_train, y_train, 3)
# Mean imbalance ratio of the resampled labels, for comparison with the original.
meanIR_MLSMOTE = v.meanIR(y_train_MLSMOTE, df)
print("MeanIR resampled:", meanIR_MLSMOTE)
print(f"Tamanho Original:{len(X_train)}")
print(f"Tamanho resampled:{len(X_train_MLSMOTE)}")

In [None]:
# Combined strategy: apply MLSMOTE (3 nearest neighbours) on top of the split
# already oversampled by ML-ROS.
mlsmote = MLSMOTE(3)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train_MLROS_MLSMOTE, y_train_MLROS_MLSMOTE = mlsmote.fit_resample(df,X_train_MLROS, y_train_MLROS, 3)
# Mean imbalance ratio after both resampling steps.
meanIR_MLROS_MLSMOTE = v.meanIR(y_train_MLROS_MLSMOTE, df)
print("MeanIR resampled:", meanIR_MLROS_MLSMOTE)
# "Original" here is the size of the ML-ROS output, i.e. the input of this step.
print(f"Tamanho Original:{len(X_train_MLROS)}")
print(f"Tamanho resampled:{len(X_train_MLROS_MLSMOTE)}")

In [None]:
# Combined strategy: apply MLSMOTE (3 nearest neighbours) on top of the split
# already undersampled by ML-RUS.
mlsmote = MLSMOTE(3)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train_MLRUS_MLSMOTE, y_train_MLRUS_MLSMOTE = mlsmote.fit_resample(df,X_train_MLRUS, y_train_MLRUS, 3)
# Mean imbalance ratio after both resampling steps.
meanIR_MLRUS_MLSMOTE = v.meanIR(y_train_MLRUS_MLSMOTE, df)
print("MeanIR resampled:", meanIR_MLRUS_MLSMOTE)
# "Original" here is the size of the ML-RUS output, i.e. the input of this step.
print(f"Tamanho Original:{len(X_train_MLRUS)}")
print(f"Tamanho resampled:{len(X_train_MLRUS_MLSMOTE)}")

# Classificação
Classificação da base original com MLkNN do SciKit multilearn

In [None]:
# Classify the original (non-resampled) training split and predict the test split.
# NOTE(review): OneVsRestClassifier and SVC are only referenced by the
# commented-out alternatives below.
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# train
# NOTE(review): the positional arguments are presumably k=3 neighbours and
# smoothing s=1; confirm against the installed scikit-multilearn version.
classifier = MLkNN(3, 1)
# Alternative classifiers tried before, kept disabled:
#classifier = DecisionTreeClassifier(max_depth=3,random_state=42).fit(X_train,y_train.toarray())
#classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True,random_state=42))
#y_score = classifier.fit(X_train, y_train)
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)


In [None]:
# Metrics of the classifier trained on the original split.
#sciMetricas(y_test, predictions)
print("")
#minhasMetricas(df, y_test, predictions.toarray())

# Micro-averaged F1 of the predictions against the true test labels.
print(metrics.f1_score(y_true=y_test, y_pred=predictions, average='micro'))

# Disabled: split a resampled base into train and test, then classify.
#X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_resampled, y_resampled, test_size = 0.2)


0.3024940100196036


In [None]:
# The classifier output lives in `predictions`; `y_pred` only exists inside
# roc_auc and is undefined at notebook scope on a fresh kernel.
minhasMetricas(df, y_test, predictions.toarray())

Metrica exposta na proposta:
Classifier precision score: 0.32


# ROC-AUC

Definição da ROC e a AUC

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

"""Mostra na tela a ROC e a AUC, utilizando MLkNN(5, 1) como método de classíficação.

Entradas:
D = Dataframe com a base completa
X = Features de cada instância
y = Classes multirrótulo de cada instância
"""
def roc_auc(D, X, y, X_eval=None, y_eval=None):
  """Train MLkNN(5, 1) on (X, y), then plot one ROC curve per label and print the mean AUC.

  Args:
    D: dataframe with the whole base, forwarded to ``minhasMetricas``.
    X: training features of each instance.
    y: training multi-label classes of each instance (sparse matrix).
    X_eval: evaluation features; defaults to the notebook-level ``X_test``.
    y_eval: evaluation labels (sparse matrix); defaults to the notebook-level
      ``y_test``.

  Labels that do not have both classes in ``y_eval`` have no defined ROC
  curve; they are skipped and reported instead of turning the mean into NaN.
  """
  if X_eval is None:
    X_eval = X_test
  if y_eval is None:
    y_eval = y_test

  classifier = MLkNN(5, 1)
  # Fit on the data passed in by the caller, not on the global training split.
  classifier.fit(X, y)
  y_pred = classifier.predict(X_eval)
  sciMetricas(y_eval, y_pred)
  minhasMetricas(D, y_eval, y_pred.toarray())

  # Convert to dense once instead of once per label.
  y_true_dense = y_eval.toarray()
  y_score_dense = classifier.predict_proba(X_eval).toarray()

  auc_per_label = {}
  skipped_labels = 0
  lw = 2
  for i in range(y_true_dense.shape[1]):
    if np.unique(y_true_dense[:, i]).size < 2:
      skipped_labels += 1
      continue
    fpr, tpr, _ = metrics.roc_curve(y_true_dense[:, i], y_score_dense[:, i])
    auc_per_label[i] = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, color='darkorange', alpha = 0.1,
            lw=lw)
  # Chance diagonal and axes setup are drawn once for the whole figure.
  plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.05])
  plt.xlabel('Falso Positivos')
  plt.ylabel('Verdadeiros Positivos')
  plt.show()
  if skipped_labels:
    print(f"Rótulos sem as duas classes no teste (ignorados): {skipped_labels}")
  if auc_per_label:
    print(f"AUC média: {sum(auc_per_label.values())/len(auc_per_label)}")
  else:
    print("AUC média: indefinida (nenhum rótulo com as duas classes no teste)")

In [None]:
# ROC/AUC report; passes the ML-ROS + MLSMOTE resampled training data to roc_auc.
roc_auc(df, X_train_MLROS_MLSMOTE, y_train_MLROS_MLSMOTE)