
https://pianalytix.com/multi-label-text-classification/

In [None]:
import pandas as pd
import numpy as np
import pickle
import ast
from matplotlib import pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Sampling and labelling 

In [None]:
def remove_restaurants(lista):
  """Return the category list without the generic "Restaurants" label.

  A new list is built and the input is left untouched, so calling the
  function never alters a list that is still referenced elsewhere (for
  example the original DataFrame cell when a notebook cell is re-run).
  Every occurrence of "Restaurants" is dropped, not only the first one.

  Args:
    lista: list of category names attached to one review.

  Returns:
    A new list with the remaining categories in their original order.
  """
  return [category for category in lista if category != "Restaurants"]

In [None]:
def list_2_string(lista):
  """Join the items of `lista` into one string, separated by single spaces."""
  separator = ' '
  return separator.join(lista)

In [None]:
# Load the preprocessed reviews, keep only the rows that have a lemma_text value,
# and parse the string-encoded category literals back into Python objects.
reviews = pd.read_csv("/content/drive/MyDrive/Project Text Mining/yelp_dataset/data_preprocessed.csv")
reviews = reviews.dropna(subset=["lemma_text"])
reviews['category'] = reviews['category'].apply(ast.literal_eval)

In [None]:
# Drop the generic "Restaurants" label from every category list.
reviews["category"] = reviews["category"].apply(remove_restaurants) 
# Space-join the items of each lemma_text value into a single string.
# NOTE(review): if lemma_text is already a plain string (it is read from CSV),
# ' '.join would insert a space between every character — confirm its type.
reviews['SpacyLemma'] = reviews['lemma_text'].apply(list_2_string)

In [None]:
type(reviews.loc[0]['category'])

list

In [None]:
# Distinct business ids present in the reviews.
list_businesses = list(reviews.business_id.unique())

In [None]:
len(list_businesses)

315

In [None]:
# Category lists for the full dataset; `tags` is reassigned further down to the sampled subset.
tags = reviews["category"]

In [None]:
# The classification models took too long to train on the full dataset, so only a subset is used.
# NOTE: this keeps the reviews of the first 15 ids in list_businesses; it is not a random draw.

sample = reviews[reviews["business_id"].isin(list_businesses[:15])]
tags = sample["category"]

In [None]:
# Binarize the per-review category lists: one 0/1 column per distinct category.
multilabel = MultiLabelBinarizer()
one_hot_labels = multilabel.fit_transform(tags)

In [None]:
print("we have a total of", tags.shape[0], "reviews and", len(multilabel.classes_) ,"classes")

we have a total of 570438 reviews and 199 classes


In [None]:
print("we have a total of", tags.shape[0], "reviews and", len(multilabel.classes_) ,"classes")

we have a total of 27512 reviews and 44 classes


In [None]:
# Number of reviews carrying each label: column sums of the one-hot matrix,
# reshaped into a two-column frame (class, n_reviews).
count = (
    pd.DataFrame(one_hot_labels, columns=multilabel.classes_)
    .sum()
    .reset_index()
    .rename(columns={"index": "class", 0: "n_reviews"})
)

In [None]:
# this is the distribution of the labels 
count.sort_values(by="n_reviews", ascending=False)

Unnamed: 0,class,n_reviews
4,Bars,14383
32,Nightlife,14383
9,Breakfast & Brunch,12641
12,Cajun/Creole,11468
0,American (New),10690
21,Food,9299
14,Cocktail Bars,9160
35,Seafood,8712
42,Wine & Spirits,5041
6,Beer,5041


### Train and test split

In [None]:
# 70/30 split of the sampled reviews (lemma_text column), stratified on the label rows.
X_train, X_test, y_train, y_test = train_test_split(
    sample["lemma_text"],
    one_hot_labels,
    test_size=0.3,
    random_state=1,
    stratify=one_hot_labels,
)

In [None]:
# Fit the TF-IDF vocabulary on the training split only (terms in >= 5 documents, at most 500 features).
tfidf_vect = TfidfVectorizer(min_df=5, max_features=500)
X_train_tfidf = tfidf_vect.fit_transform(X_train)

In [None]:
# Project the test split with the vectorizer fitted on the training split (no refit).
X_test_tfidf = tfidf_vect.transform(X_test)

In [None]:
%%time
clf = OneVsRestClassifier(SVC(kernel='linear',probability=True)).fit(X_train_tfidf, y_train)    

CPU times: user 1h 19min 29s, sys: 8.25 s, total: 1h 19min 37s
Wall time: 1h 19min 49s


In [None]:
def map(x, threshold=0.65):
  """Turn a predicted probability into a hard 0/1 label decision.

  Args:
    x: predicted probability for one label.
    threshold: cut-off value; a probability at or below it yields 0.
      Defaults to 0.65.

  Returns:
    0 when x <= threshold (label not chosen), otherwise 1 (label chosen).
  """
  # NOTE: this name shadows Python's built-in `map`; it is kept unchanged
  # because it is wrapped by np.vectorize(map) elsewhere in the notebook.
  return 0 if x <= threshold else 1

In [None]:
# Element-wise wrapper around `map`, so the thresholding can be applied to whole probability arrays.
vfunc = np.vectorize(map)

In [None]:
n = 3

In [None]:
multilabel.inverse_transform(np.expand_dims(y_test[n], axis = 0))

[('Cajun/Creole',)]

In [None]:
X_test.iloc[n]

'hard establishment especially tourist trap area little upscale food life price theyre really expensive either consider fine dining coat tie required still elegant atmosphere many option recommend muriels anyone looking something besides po boy trip new orleans it also one best date ever began extra star that'

In [None]:
# Threshold the per-label probabilities of test document n, then decode the 0/1 row into category names.
proba = vfunc(clf.predict_proba(X_test_tfidf[n]))
multilabel.inverse_transform(proba)

[('Cajun/Creole',)]

In [None]:
filename = "/content/drive/MyDrive/Project Text Mining/yelp_dataset/model_wordnet.pickle"

In [None]:
# save model
#pickle.dump(clf, open(filename, "wb"))

In [None]:
# load model
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
# The context manager closes the file handle once the model is read.
with open(filename, "rb") as model_file:
    clf = pickle.load(model_file)

In [None]:
%%time
# Predict per-label probabilities for the whole test split and threshold them into 0/1 labels.
y_pred = vfunc(clf.predict_proba(X_test_tfidf))

CPU times: user 4min 10s, sys: 511 µs, total: 4min 10s
Wall time: 4min 9s


In [None]:
y_pred[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# Per-class precision / recall / F1 on the test split, with rows named after the binarizer's classes.
print(classification_report(
    y_test,
    y_pred,
    output_dict=False,
    target_names = multilabel.classes_
))

                           precision    recall  f1-score   support

           American (New)       0.91      0.72      0.80      3207
   American (Traditional)       0.89      0.50      0.64       528
     Arts & Entertainment       0.97      0.86      0.91      1168
                 Barbeque       0.94      0.56      0.70       976
                     Bars       0.91      0.78      0.84      4315
            Beauty & Spas       0.99      0.91      0.95       815
                     Beer       0.94      0.72      0.82      1512
                 Beer Bar       0.98      0.82      0.89       733
               Brasseries       0.89      0.60      0.72      1398
       Breakfast & Brunch       0.90      0.80      0.85      3792
                  Burgers       0.94      0.76      0.84       816
                    Cafes       0.93      0.53      0.68      1383
             Cajun/Creole       0.95      0.85      0.90      3441
                  Casinos       0.99      0.91      0.95     

  _warn_prf(average, modifier, msg_start, len(result))


#### spaCy lemmatizer

In [None]:
# Same 70/30 stratified split as before, this time on the SpacyLemma column.
X_train, X_test, y_train, y_test = train_test_split(
    sample["SpacyLemma"],
    one_hot_labels,
    test_size=0.3,
    random_state=1,
    stratify=one_hot_labels,
)

In [None]:
# Refit the TF-IDF vocabulary on the SpacyLemma training split (terms in >= 5 documents, at most 500 features).
tfidf_vect = TfidfVectorizer(min_df=5, max_features=500)
X_train_tfidf = tfidf_vect.fit_transform(X_train)

In [None]:
# Project the test split with the vectorizer fitted on the training split (no refit).
X_test_tfidf = tfidf_vect.transform(X_test)

In [None]:
%%time
clf = OneVsRestClassifier(SVC(kernel='linear',probability=True)).fit(X_train_tfidf, y_train)    

CPU times: user 1h 15min 57s, sys: 6.73 s, total: 1h 16min 3s
Wall time: 1h 15min 52s


In [None]:
filename = "/content/drive/MyDrive/Project Text Mining/yelp_dataset/model_spacy.pickle"

# save model
# The context manager guarantees the file is flushed and closed, so the
# pickle on disk is complete even if the kernel is stopped right afterwards.
with open(filename, "wb") as model_file:
    pickle.dump(clf, model_file)

# load model
# with open(filename, "rb") as model_file:
#     loaded_model = pickle.load(model_file)

In [None]:
%%time
# Predict per-label probabilities for the whole test split and threshold them into 0/1 labels.
y_pred = vfunc(clf.predict_proba(X_test_tfidf))

CPU times: user 4min 5s, sys: 229 ms, total: 4min 5s
Wall time: 4min 4s


In [None]:
# Per-class precision / recall / F1 on the test split, with rows named after the binarizer's classes.
print(classification_report(
    y_test,
    y_pred,
    output_dict=False,
    target_names = multilabel.classes_
))

                           precision    recall  f1-score   support

           American (New)       0.91      0.73      0.81      3208
   American (Traditional)       0.87      0.51      0.64       528
     Arts & Entertainment       0.97      0.86      0.91      1169
                 Barbeque       0.94      0.57      0.71       976
                     Bars       0.91      0.78      0.84      4317
            Beauty & Spas       0.99      0.92      0.95       815
                     Beer       0.94      0.72      0.82      1513
                 Beer Bar       0.96      0.82      0.89       733
               Brasseries       0.88      0.59      0.71      1399
       Breakfast & Brunch       0.91      0.79      0.84      3793
                  Burgers       0.94      0.78      0.86       816
                    Cafes       0.93      0.55      0.69      1383
             Cajun/Creole       0.94      0.85      0.89      3442
                  Casinos       0.99      0.92      0.95     

  _warn_prf(average, modifier, msg_start, len(result))


### Doc2Vec

In [None]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec
import nltk

In [None]:
# Download the Punkt tokenizer data that nltk's word_tokenize relies on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Fresh 70/30 stratified split of the SpacyLemma texts for the Doc2Vec experiment.
X_train, X_test, y_train, y_test = train_test_split(
    sample["SpacyLemma"],
    one_hot_labels,
    test_size=0.3,
    random_state=1,
    stratify=one_hot_labels,
)

In [None]:
def get_tagged_doc(df, train = None):
    """Wrap every document in a TaggedDocument carrying a positional tag.

    Each text is lower-cased and tokenized with word_tokenize. Its single tag
    is "train_<i>" when `train` equals True and "test_<i>" otherwise, where
    <i> is the document's position in `df`.

    Args:
        df: iterable of document strings.
        train: selects the tag prefix; any value not equal to True gives "test_".

    Returns:
        A list of TaggedDocument objects, in the order of `df`.
    """
    # Equality (not truthiness) is used deliberately to keep the prefix choice unchanged.
    prefix = "train_" if train == True else "test_"
    tagged = []
    for position, text in enumerate(list(df)):
        tokens = word_tokenize(text.lower())
        tagged.append(TaggedDocument(words=tokens, tags=[prefix + str(position)]))
    return tagged

In [None]:
# Tag both splits. The test split has to be tagged as well: a later cell reads
# the token list of a test document through X_test[n][0], which only works on
# TaggedDocument items, not on the raw strings returned by train_test_split.
X_train = get_tagged_doc(X_train, train = True)
X_test = get_tagged_doc(X_test, train = False)

In [None]:
len(X_train)

19258

In [None]:
len(X_test)

8254

In [None]:
%%time
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=5, epochs = 100)
model.build_vocab(X_train)
model.train(X_train, total_examples=model.corpus_count, epochs=100)
#model.save("doc2vec.model")

CPU times: user 9min 6s, sys: 43.5 s, total: 9min 50s
Wall time: 6min 27s


In [None]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """Collect trained document vectors into one 2-D array.

    Looks up the vectors stored under the tags "<vectors_type>_0" ...
    "<vectors_type>_<corpus_size - 1>". The lookup table is read from
    `model.dv` when the model has it (gensim >= 4) and from the older
    `model.docvecs` attribute otherwise.

    Args:
        model: trained model exposing its document vectors as `dv` or `docvecs`.
        corpus_size: number of documents to fetch.
        vectors_size: dimensionality of each document vector.
        vectors_type: tag prefix without the underscore, e.g. "train".

    Returns:
        A numpy array of shape (corpus_size, vectors_size); row i holds the
        vector stored under tag "<vectors_type>_<i>".
    """
    doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(corpus_size):
        vectors[i] = doc_vectors["{}_{}".format(vectors_type, i)]
    return vectors

In [None]:
# Gather the 50-dimensional vectors stored under tags train_0 ... train_{len(X_train) - 1}.
train_vectors = get_vectors(model, len(X_train), 50, "train")

In [None]:
%%time
clf = OneVsRestClassifier(SVC(kernel='linear',probability=True)).fit(train_vectors, y_train)

In [None]:
n = 8

In [None]:
multilabel.inverse_transform(np.expand_dims(y_test[n], axis = 0))

[('Breakfast & Brunch',
  'Cafes',
  'Cajun/Creole',
  'Donuts',
  'Food',
  'Sandwiches')]

In [None]:
# Infer a vector for test document n, score every label, threshold, and decode to category names.
# NOTE(review): X_test[n][0] is used as the token list, which assumes X_test holds
# TaggedDocument-like items (words first) — verify it has been tagged before this cell runs.
proba = vfunc(clf.predict_proba(np.expand_dims(model.infer_vector(X_test[n][0]), axis = 0)))
multilabel.inverse_transform(proba)

[('Breakfast & Brunch', 'Cajun/Creole', 'Donuts', 'Sandwiches')]

In [None]:
# infer_vector embeds ONE document given as a list of tokens, so every test
# document is inferred separately and the results are stacked into an
# (n_documents, vector_size) array. Items may be TaggedDocument objects
# (their .words are used) or raw strings (tokenized here).
test_vectors = np.vstack([
    model.infer_vector(doc.words if hasattr(doc, "words") else word_tokenize(doc.lower()))
    for doc in X_test
])

In [None]:
test_vectors.shape

(8254, 50)

In [None]:
# Predict per-label probabilities from the inferred test vectors and threshold them into 0/1 labels.
y_pred = vfunc(clf.predict_proba(test_vectors))

In [None]:
# Per-class precision / recall / F1 on the test split, with rows named after the binarizer's classes.
print(classification_report(
    y_test,
    y_pred,
    output_dict=False,
    target_names = multilabel.classes_
))

                           precision    recall  f1-score   support

           American (New)       0.84      0.54      0.66      3207
   American (Traditional)       0.78      0.13      0.22       528
     Arts & Entertainment       0.87      0.72      0.79      1168
                 Barbeque       0.82      0.27      0.40       976
                     Bars       0.84      0.61      0.71      4315
            Beauty & Spas       0.89      0.86      0.87       815
                     Beer       0.80      0.43      0.56      1512
                 Beer Bar       0.81      0.64      0.71       733
               Brasseries       0.77      0.42      0.55      1398
       Breakfast & Brunch       0.82      0.59      0.69      3792
                  Burgers       0.78      0.61      0.69       816
                    Cafes       0.86      0.04      0.09      1383
             Cajun/Creole       0.88      0.70      0.78      3441
                  Casinos       0.89      0.86      0.87     

  _warn_prf(average, modifier, msg_start, len(result))
