In [16]:
import pandas as pd 
import numpy as np 

In [17]:
perguntas = pd.read_csv(
    "https://raw.githubusercontent.com/alura-cursos/alura_classificacao_multilabel/master/dataset/stackoverflow_perguntas.csv")
perguntas.head(5)

Unnamed: 0,Perguntas,Tags
0,Possuo um projeto Node.js porém preciso criar ...,node.js
1,"Gostaria de fazer testes unitários no Node.js,...",node.js
2,Como inverter a ordem com que o jQuery itera u...,jquery
3,Eu tenho uma página onde pretendo utilizar um ...,html
4,Como exibir os dados retornados do FireStore e...,html angular


## Data Labelling
--- 
exploding the tags to a tuple and one hot enconding

In [18]:
labels = []
for tags in perguntas.Tags.str.strip().unique():
    for tag in tags.split():
        if tag not in labels:
            labels.append(tag)

# one hot encoding
for label in labels:
    perguntas[label] = perguntas.Tags.apply(lambda x: int(label in str(x)))
    
# tuple
perguntas['todas_tags'] = perguntas.apply(lambda x: tuple([x[label] for label in labels]),axis=1)
perguntas.head(2)

Unnamed: 0,Perguntas,Tags,node.js,jquery,html,angular,todas_tags
0,Possuo um projeto Node.js porém preciso criar ...,node.js,1,0,0,0,"(1, 0, 0, 0)"
1,"Gostaria de fazer testes unitários no Node.js,...",node.js,1,0,0,0,"(1, 0, 0, 0)"


## Data split

In [19]:
from sklearn.model_selection import train_test_split

perguntas_treino, perguntas_test, tags_treino, tags_test = train_test_split(
    perguntas.Perguntas, perguntas.todas_tags, test_size=0.2, random_state=123
)

## TFidf
---
uses how many times this words appears as a inverse weight, but if is too much we create a limit in our example if a word appears in 85% of the dataset we drop this word in the bag of words vocabulary

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import hamming_loss

vetorizar = TfidfVectorizer(max_features=5000,max_df=0.85)
vetorizar.fit(perguntas.Perguntas)

perguntas_treino,perguntas_test = vetorizar.transform(perguntas_treino),vetorizar.transform(perguntas_test)

tags_treino,tags_test = np.asarray(list(tags_treino)),np.asarray(list(tags_test))

## Binary Relevance
--- 
transform the multilabel problem into lots of binary classifications and them merge them

In [25]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

clf = OneVsRestClassifier(LogisticRegression(solver='lbfgs'))
clf.fit(perguntas_treino,tags_treino)
print(clf.score(perguntas_test,tags_test))
print(hamming_loss(clf.predict(perguntas_test),tags_test))

0.4168207024029575
0.1883086876155268


In [22]:
from skmultilearn.problem_transform import BinaryRelevance

clf = BinaryRelevance(LogisticRegression(solver='lbfgs'))
clf.fit(perguntas_treino,tags_treino)
print(clf.score(perguntas_test,tags_test))
print(hamming_loss(clf.predict(perguntas_test),tags_test))

0.4168207024029575
0.1883086876155268


## ClassifierChain 
uses the prediction of the last feature as a new feature, ex: uses the classification of node.js as feature to predict HTML

In [23]:
#!pip install scikit-multilearn
from skmultilearn.problem_transform import ClassifierChain

clf = ClassifierChain(LogisticRegression(solver='lbfgs')) 
clf.fit(perguntas_treino,tags_treino)
print(clf.score(perguntas_test,tags_test))
print(hamming_loss(clf.predict(perguntas_test),tags_test))

0.49815157116451014
0.21095194085027727


## Multilabel KNN
--- 
a adaption of the KNN to multilabel problem, uses the probabily as the k Neighboors to predict the labels

In [24]:
from skmultilearn.adapt import MLkNN

clf = MLkNN()
clf.fit(perguntas_treino,tags_treino)
print(clf.score(perguntas_test,tags_test))
print(hamming_loss(clf.predict(perguntas_test),tags_test))

0.32532347504621073
0.25231053604436227
