## Install Libraries , Mount Drive & Import Libararies

In [0]:
%tensorflow_version 2.x
!pip install ipython-autotime
!pip install scikit-multilearn
!pip install tqdm - -upgrade

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns
import nltk
%matplotlib inline

In [0]:
from tqdm.auto import tqdm
tqdm.pandas()

## Load Data

In [0]:
data_set = pd.read_pickle(
    '/content/drive/My Drive/rdata/lemma_stop_word_cleaned.pkl')

In [11]:
data_set.head()

Unnamed: 0,title,body,tags
0,good branching merge tutorial tortoisesvn,good tutorial explain branching merge apache s...,[svn]
1,asp.net site map,experience create sqlbase asp.net sitemap prov...,"[sql, asp.net]"
2,function create color wheel,pseudosolve time find solution . stuck -PRON- ...,[algorithm]
3,add script functionality .net application,little game write c # . -PRON- use database ba...,"[c#, .net]"
4,use nested class case,work collection class use video playback recor...,"[c++, oop, class]"


## Filter Top 5

In [0]:
tags_list = [tag for row in data_set.tags.values for tag in row]
unique_tags = list(set(tags_list))
tags_frequency = nltk.FreqDist(tags_list)
top_tags_dict = tags_frequency.most_common(5)
top_tags = [tag_tuple[0] for tag_tuple in top_tags_dict]

In [0]:
def remove_uncommon_tag(tags):
    filtered = []
    for i in range(len(tags)):
        if tags[i] in top_tags:
            filtered.append(tags[i])
    if len(filtered):
        return filtered
    else:
        return None

In [6]:
data_set.tags = data_set.tags.apply(lambda x: remove_uncommon_tag(x))
data_set.head()

Unnamed: 0,title,body,tags
0,good branching merge tutorial tortoisesvn,good tutorial explain branching merge apache s...,
1,asp.net site map,experience create sqlbase asp.net sitemap prov...,
2,function create color wheel,pseudosolve time find solution . stuck -PRON- ...,
3,add script functionality .net application,little game write c # . -PRON- use database ba...,[c#]
4,use nested class case,work collection class use video playback recor...,


In [7]:
data_set.dropna(subset=['tags'], inplace=True)
data_set.shape

(27715, 3)

In [0]:
data_set.reset_index(inplace=True, drop=True)

In [9]:
data_set.head()

Unnamed: 0,title,body,tags
0,add script functionality .net application,little game write c # . -PRON- use database ba...,[c#]
1,automatically update version number,like version property -PRON- application incre...,[c#]
2,connect database loop recordset c #,simple way connect query database set record c #,[c#]
3,value build encode viewstate,need grab base64encode representation viewstat...,[c#]
4,delete file lock process c #,look way delete file lock process use c # . su...,[c#]


## Prepare Y & X for Training

In [0]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data_set.tags)

In [0]:
X = data_set.drop(columns=['tags'])

In [0]:
X['data'] = X.title+X.body

In [0]:
vectorizer = TfidfVectorizer()
X_data = vectorizer.fit_transform(X.data)

In [14]:
X_data.shape

(27715, 76470)

## Train Test Split & Train

In [0]:
from skmultilearn.model_selection import iterative_train_test_split

X_train, y_train, X_test, y_test = iterative_train_test_split(
    X_data, y, test_size=0.33)

In [16]:
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())
# train
classifier.fit(X_train, y_train)
# predict
predictions = classifier.predict(X_test)
# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions))

Accuracy =  0.3712692686126599


## Eval Metrics

In [0]:
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss

In [18]:
roc_auc_score(y_test, predictions.toarray())

0.7181838619281786

In [20]:
hamming_loss(y_test, predictions.toarray())

0.26830654859516784

In [21]:
f1_score(y_test, predictions.toarray(), average='macro')

0.5172281727114033