In [17]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB

from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import classification_report

import ast
from tqdm import tqdm
tqdm.pandas()


# Load the pre-processed dataset. The Text and Tags columns are stored in the
# CSV as string representations of Python lists.
df = pd.read_csv("./../data/processed/data_10.csv")


def _literal_to_sentence(cell):
    """Parse a stringified Python list and join its items with single spaces.

    `ast.literal_eval` only evaluates literals, so it is safe on file content.
    """
    return ' '.join(map(str, ast.literal_eval(cell)))


# One pass per column: parse the list literal and flatten it to a
# whitespace-separated string that the vectorizers can consume.
for column in ('Text', 'Tags'):
    df[column] = df[column].progress_apply(_literal_to_sentence)

# Multi-label targets: one binary indicator column per distinct tag token.
# - binary must be a real boolean; the string 'true' is rejected by
#   scikit-learn releases that validate constructor parameters.
# - token_pattern=None avoids the "token_pattern will not be used" warning
#   that is raised when a custom tokenizer is supplied.
# - str.split tokenizes on whitespace exactly like `lambda x: x.split()`.
vectorizer = CountVectorizer(tokenizer=str.split, token_pattern=None, binary=True, min_df=1)
y = vectorizer.fit_transform(df['Tags']).toarray()

# Maps each tag token to its column index in y.
tags_dict = vectorizer.vocabulary_


# TF-IDF features for the text. The matrix is densified because GaussianNB
# (used in the training cell) does not accept sparse input; this can be
# memory hungry for a large vocabulary.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['Text']).toarray()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 44296/44296 [00:08<00:00, 5023.48it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 44296/44296 [00:00<00:00, 53491.88it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 44296/44296 [00:00<00:00, 181011.29it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 44296/44296 [00:00<00:00, 1188209.99it/s]


In [18]:
# One-vs-rest wrapper around Gaussian naive Bayes for the multi-label target.
# NOTE(review): the evaluation cells below are labelled MultinomialNB and
# GaussianNB, so presumably the estimator here was swapped between runs
# (e.g. MultinomialNB()) -- confirm before comparing the reports.
classifier = OneVsRestClassifier(estimator=GaussianNB())

# Fit on the training split. Left as the last expression of the cell so the
# fitted estimator is still displayed as the cell output.
classifier.fit(X_train, y_train)

In [15]:
# Peek at the held-out label matrix (NumPy elides the middle of large arrays).
y_test

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [14]:
### MultinomialNB
# NOTE(review): this report reflects whichever estimator `classifier` was last
# fitted with; the heading presumably records a run with MultinomialNB -- confirm.

# Predict the label indicator matrix for the held-out split.
y_pred = classifier.predict(X_test)

# zero_division=0 reports 0.0 for labels with no predicted (or no true)
# samples, which is the value the default already falls back to, but without
# emitting an UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.88      0.01      0.03      1102
           1       0.00      0.00      0.00       592
           2       0.71      0.00      0.01      1501
           3       0.00      0.00      0.00       644
           4       0.80      0.01      0.01       640
           5       0.94      0.01      0.02      1319
           6       0.33      0.00      0.00      1209
           7       0.00      0.00      0.00       995
           8       0.73      0.01      0.01      1314
           9       0.00      0.00      0.00       636

   micro avg       0.72      0.00      0.01      9952
   macro avg       0.44      0.00      0.01      9952
weighted avg       0.52      0.00      0.01      9952
 samples avg       0.01      0.01      0.01      9952



  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
### GaussianNB
# NOTE(review): this report reflects whichever estimator `classifier` was last
# fitted with; the heading presumably records a run with GaussianNB -- confirm.

# Predict the label indicator matrix for the held-out split.
y_pred = classifier.predict(X_test)

# zero_division=0 reports 0.0 for labels with no predicted (or no true)
# samples, which is the value the default already falls back to, but without
# emitting an UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.25      0.59      0.35      1102
           1       0.12      0.42      0.19       592
           2       0.25      0.60      0.35      1501
           3       0.15      0.45      0.23       644
           4       0.19      0.54      0.28       640
           5       0.21      0.55      0.31      1319
           6       0.21      0.54      0.30      1209
           7       0.23      0.53      0.32       995
           8       0.24      0.56      0.33      1314
           9       0.12      0.40      0.19       636

   micro avg       0.21      0.54      0.30      9952
   macro avg       0.20      0.52      0.29      9952
weighted avg       0.21      0.54      0.30      9952
 samples avg       0.27      0.54      0.31      9952



  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# Predict the label indicator matrix for the held-out split using the
# currently fitted `classifier`.
y_pred = classifier.predict(X_test)

# zero_division=0 reports 0.0 for labels with no predicted (or no true)
# samples, which is the value the default already falls back to, but without
# emitting an UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.06      0.06      0.06        17
           2       0.00      0.00      0.00         3
           3       0.14      0.22      0.17         9
           4       0.33      0.43      0.38         7
           5       0.67      0.74      0.70        42
           6       0.00      0.00      0.00         2
           7       0.50      0.40      0.44         5
           8       0.00      0.00      0.00         1
           9       1.00      0.29      0.44         7
          10       0.50      0.33      0.40        24
          11       0.73      0.73      0.73        11
          12       0.20      0.20      0.20         5
          13       0.50      0.50      0.50         4
          14       0.00      0.00      0.00        10
          15       0.28      0.20      0.23        66
          16       0.19      0.39      0.25        23
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
