In [1]:
import pandas as pd
import ast
from tqdm import tqdm
tqdm.pandas()

from sklearn.preprocessing import MultiLabelBinarizer  
from sklearn.model_selection import train_test_split

In [2]:
# Load the processed dataset; the path is relative to the notebook's working directory.
DATA_PATH = './../data/processed/data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# 'Text' and 'Tags' are read from the CSV as strings holding Python literals;
# parse each cell back into the object it encodes. literal_eval only accepts
# literals, so no arbitrary code is executed.
for column in ('Text', 'Tags'):
    df[column] = df[column].progress_apply(ast.literal_eval)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 11310/11310 [00:00<00:00, 13005.75it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 11310/11310 [00:00<00:00, 72887.37it/s]


In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df['Text'],
    df['Tags'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Fit ONE binarizer on the training tags and reuse it for the test tags so that
# both indicator matrices have the same columns in the same order. Fitting a
# second binarizer on y_test (as before) derives the columns from the test
# labels alone, so they need not match the training columns.
label_binarizer = MultiLabelBinarizer()
y_train = label_binarizer.fit_transform(y_train)
# Tags that occur only in the test split are unknown to the fitted binarizer;
# transform() drops them (scikit-learn emits a warning when that happens).
y_test = label_binarizer.transform(y_test)

### Way 1: bag-of-words counts (CountVectorizer)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# NOTE(review): assumes every 'Text' entry is a list of string tokens — confirm
# against the preprocessing step. Tokens are joined with a space so that they
# remain separate words; joining with "" would fuse each document into one
# long token before vectorizing.
x_train = x_train.apply(" ".join)
x_test = x_test.apply(" ".join)

# Learn the vocabulary from the training split only, then project the test
# split onto that same vocabulary. Calling fit_transform on x_test would build
# a second, unrelated vocabulary, so the test columns would not line up with
# the features the model was trained on.
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

### Way 2: padded integer sequences (Keras Tokenizer)

In [None]:
import numpy as np  # needed for np.mean below; it was not imported anywhere before
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# NOTE(review): this cell reads x_train / x_test as text. The "Way 1" cell
# overwrites both names with count matrices, so this cell presumably has to be
# run instead of "Way 1" on a fresh split, not after it — confirm.

MAX_NB_WORDS = 20000         # vocabulary cap passed to the Tokenizer
MAX_SEQUENCE_LENGTH = 1000   # every sequence is padded/truncated to this length

# Vectorize the text samples into a 2D integer tensor.
# `num_words` is the current name of the legacy `nb_words` argument.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(x_train)  # the word index is built from the training split only
sequences = tokenizer.texts_to_sequences(x_train)
sequences_test = tokenizer.texts_to_sequences(x_test)
word_index = tokenizer.word_index
# Reverse lookup: integer id -> word
index_to_word = {i: w for w, i in word_index.items()}

# Report training sequence lengths next to the MAX_SEQUENCE_LENGTH cut-off.
seq_lens = [len(s) for s in sequences]
print("average length: %0.1f" % np.mean(seq_lens))
print("max length: %d" % max(seq_lens))

# Pad sequences with 0s
x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)

### Method 1: grid search over Binary Relevance base classifiers

In [None]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# One grid per candidate base classifier for Binary Relevance, each listing
# the hyper-parameter values to try for that classifier.
naive_bayes_grid = {
    'classifier': [MultinomialNB()],
    'classifier__alpha': [0.7, 1.0],
}
svm_grid = {
    'classifier': [SVC()],
    'classifier__kernel': ['rbf', 'linear'],
}
parameters = [naive_bayes_grid, svm_grid]

# Search both grids, scoring each candidate by accuracy.
classifier = GridSearchCV(BinaryRelevance(), param_grid=parameters, scoring='accuracy')
classifier.fit(x_train, y_train)

# Best combination found and its score
print(classifier.best_params_, classifier.best_score_)

### Method 2: Binary Relevance with an SVM base classifier

In [None]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC

# Binary Relevance multi-label classifier built on a support-vector machine.
# NOTE(review): require_dense=[False, True] presumably means "hand X to the
# base classifier in sparse form, y in dense form" — confirm against the
# scikit-multilearn documentation.
svm_base = SVC()
classifier = BinaryRelevance(classifier=svm_base, require_dense=[False, True])

# Fit on the training split, then label the held-out split.
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)

### Method 3: Binary Relevance with Gaussian Naive Bayes

In [None]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

# Binary Relevance multi-label classifier with Gaussian Naive Bayes as the
# base classifier.
gaussian_nb_base = GaussianNB()
classifier = BinaryRelevance(gaussian_nb_base)

# Fit on the training split, then label the held-out split.
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)

### Method 4: MLkNN (k=20)

In [None]:
# accuracy_score is imported here because this cell calls it; previously the
# only import lived in a later cell, so a fresh "Run All" raised NameError.
from sklearn.metrics import accuracy_score
from skmultilearn.adapt import MLkNN

# MLkNN with k=20
classifier = MLkNN(k=20)

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# Score the predictions against the binarized test labels (displayed as the cell output).
accuracy_score(y_test, predictions)

## Predictions: accuracy on the test split

In [None]:
from sklearn.metrics import accuracy_score

# Score whichever `predictions` the most recently run method cell produced
# against the binarized test labels; the value is shown as the cell output.
test_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
test_accuracy