# MULTI-LABEL ML CLASSIFICATION NLP

Source code and approach: 
https://www.analyticsvidhya.com/blog/2019/04/predicting-movie-genres-nlp-multi-label-classification/

Source datasets: 
https://www.kaggle.com/datasets/meetnaren/goodreads-best-books/data
https://www.kaggle.com/datasets/ishikajohari/best-books-10k-multi-genre-data/data

Source reference: https://towardsdatascience.com/keras-tell-me-the-genre-of-my-book-a417d213e3a1



## Data exploration 

In [None]:
import pandas as pd
# Load both book datasets.
df1 = pd.read_csv('../assets/multilabel_book_dataset.csv')
df2 = pd.read_csv('../assets/multilabel_book_dataset2.csv')

# Drop the columns that are not used and rename the remaining columns of the
# second dataset to the lowercase names the rest of the notebook works with.
df2 = (
    df2.drop(columns=['Unnamed: 0', 'URL'])
    .rename(columns={'Book': 'title', 'Author': 'author', 'Genres': 'genres', 'Description': 'description'})
)

# combine the two dfs
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# frame keeps its own row labels, so labels repeat and any later operation
# that aligns rows by index label can pair a row with the wrong book.
df = pd.concat([df1, df2], ignore_index=True)
df.info()
df.shape

: 

In [None]:
# Number of rows whose title already appeared in an earlier row.
df.duplicated(subset=['title']).sum()

: 

In [None]:
# Filter the combined data one step at a time:
# 1. drop rows whose genres cell is the text of an empty list,
df = df.query('genres != "[]"')
# 2. drop rows that have no description,
df = df.dropna(subset=['description'])
# 3. keep only the first row for each (description, title) pair.
df = df.drop_duplicates(subset=['description', 'title'])
df.shape

: 

In [None]:
# remove non english descriptions
from langdetect import detect

def detect_language(description):
    """Return the language code detected for *description*.

    Any exception raised during detection is printed and the placeholder
    'unknown' is returned instead, so one bad row cannot abort a whole
    ``Series.apply`` run.
    """
    try:
        language = detect(description)
    except Exception as err:
        print(f"Error: {err}")
        language = 'unknown'
    return language
    
# Tag every description with its detected language code, then keep only the
# rows tagged "en" (rows that fell back to 'unknown' are dropped as well).
df['Language_detect'] = df['description'].apply(detect_language)
df = df.query('Language_detect=="en"')

: 

In [None]:
# How often each distinct value of the genres column occurs.
df.genres.value_counts()

: 

In [None]:
# Preview the first rows of the filtered data.
df.head()

: 

## Data cleaning 

In [None]:
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

%matplotlib inline
# Let displayed dataframes show up to 300 characters per column before truncating.
pd.set_option('display.max_colwidth', 300)

: 

In [None]:
# Work on an independent copy that holds only the three columns used below.
books = df[['title','description','genres']].copy()

: 

In [None]:
# (rows, columns) of the working copy.
books.shape

: 

In [None]:
from ast import literal_eval
# The genres cells are parsed with literal_eval, turning the text of a list
# literal into a real Python list per book.
books['genres'] = books['genres'].apply(literal_eval)
# Flatten the per-book lists in a single pass. sum(lists, []) builds a brand
# new list for every book it adds, which is quadratic in the number of genres.
all_categories = [genre for genre_list in books['genres'] for genre in genre_list]
print('numer of categories before cleaning', len(set(all_categories)))

: 

In [None]:
# Tally how many times each genre occurs across all books.
all_genres = nltk.FreqDist(all_categories)

# Tabulate the tallies: one row per genre, with its name and its count.
all_genres_df = pd.DataFrame(list(all_genres.items()), columns=['Genre', 'Count'])

: 

In [None]:
# remove categories with less than 100 counts
remove_books = all_genres_df.query('Count < 100')['Genre'].tolist()
# Test membership against a set: the list version was rescanned for every
# genre of every book.
rare_genres = set(remove_books)
books['new_genres'] = books['genres'].apply(
    lambda genre_list: [genre for genre in genre_list if genre not in rare_genres]
)
# NOTE(review): an empty list is not a missing value, so this dropna does not
# remove books whose genres were all filtered out above - confirm whether
# those books are meant to stay in the training data.
books = books.dropna(subset=['new_genres','description'])
# Single-pass flatten instead of the quadratic sum(lists, []).
all_new_genres = [genre for genre_list in books['new_genres'] for genre in genre_list]
print("number of genres after cleaning:", len(set(all_new_genres)))

: 

In [None]:
# Plot the 50 most frequent genres as horizontal bars.
g = all_genres_df.nlargest(columns="Count", n=50)
plt.figure(figsize=(12, 15))
ax = sns.barplot(data=g, x="Count", y="Genre")
# The y axis lists the genre names (the counts run along x), so it must be
# labelled 'Genre'; it was previously mislabelled 'Count'.
ax.set(ylabel='Genre')
plt.show()


: 

## Data pre-processing 

In [None]:
# function for text cleaning 
def clean_text(text):
    """Reduce *text* to lowercase ASCII words separated by single spaces.

    Apostrophes are deleted outright (so "don't" becomes "dont"), every other
    character outside a-z / A-Z is replaced by a space, whitespace runs are
    collapsed and the result is lowercased.
    """
    # "\'" is simply an apostrophe: delete it so contractions stay one word.
    without_apostrophes = re.sub("\'", "", text)
    # Anything that is not an ASCII letter becomes a space ...
    letters_only = re.sub("[^a-zA-Z]", " ", without_apostrophes)
    # ... and split()/join collapses the resulting whitespace runs.
    return ' '.join(letters_only.split()).lower()
# Overwrite each description with its cleaned form.
books['description'] = books['description'].apply(clean_text)


: 

In [None]:
# Eyeball three random rows after cleaning.
books.sample(3)


: 

In [None]:
# remove stopwords

from nltk.corpus import stopwords
# English stopwords from the NLTK corpus, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    """Return *text* without the whitespace-separated words found in ``stop_words``.

    The surviving words keep their order and are joined by single spaces.
    """
    return ' '.join(word for word in text.split() if word not in stop_words)

# Store the stopword-free text in its own column. The column name (including
# the trailing "t") is the one the train/validation split reads.
books['clean_descriptiont'] = books['description'].apply(remove_stopwords)


: 

In [None]:
# genres to features - VECTORIZATION
import pickle 
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the set of genres from the filtered genre lists.
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(books['new_genres'])

# transform target variable
y = multilabel_binarizer.transform(books['new_genres'])

# Store the fitted binarizer for later so predictions can be mapped back to
# genre names. The with-block closes the file, which the previous bare
# open(...) never did.
with open("../assets/multilabel_binarizer.pickle", "wb") as binarizer_file:
    pickle.dump(multilabel_binarizer, binarizer_file)

: 

In [None]:
# Shape of the binarized genre targets.
y.shape

: 

In [None]:
# Hold out 20% of the books for validation; the fixed random_state makes the
# split reproducible.
xtrain, xval, ytrain, yval = train_test_split(books['clean_descriptiont'], y, test_size=0.2, random_state=9)

: 

In [None]:
# Ignore terms that occur in more than 80% of the descriptions and cap the
# vocabulary at 10,000 features.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)

# Fit on the training descriptions only; the validation descriptions are
# merely transformed with the vocabulary learned from training.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

# Persist the fitted vectorizer. The with-block closes the file, which the
# previous bare open(...) never did.
with open("../assets/tfidf_vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

: 

In [None]:
# Train ML model 

: 

In [None]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

# Wrap a logistic regression in a one-vs-rest classifier, which trains one
# binary classifier per genre column.
lr = LogisticRegression(max_iter=1000)
clf = OneVsRestClassifier(lr)

# fit model on train data
clf.fit(xtrain_tfidf, ytrain)

# make predictions for validation set
y_pred = clf.predict(xval_tfidf)

# Show the predicted row for the fourth validation sample.
y_pred[3]

: 

In [None]:
# save model
# The with-block closes the file, which the previous bare open(...) never did.
with open("../assets/ML2", "wb") as model_file:
    pickle.dump(clf, model_file)

: 

In [None]:
# Genre names predicted for the fourth validation sample.
multilabel_binarizer.inverse_transform(y_pred)[3]

: 

In [None]:
# evaluate performance: micro-averaged F1 over the validation predictions
f1_score(yval, y_pred, average="micro")

: 

In [None]:
# recompute the label predictions for the validation set
# (this calls predict, so y_pred holds predicted labels, not probabilities)
y_pred = clf.predict(xval_tfidf)

: 

In [None]:
# see predictions on the validation data
# xval (the held-out cleaned descriptions) becomes the first column.
results = pd.DataFrame(xval)
results['genre_predicted'] = multilabel_binarizer.inverse_transform(y_pred)
# Attach title and original genres by matching index labels.
# NOTE(review): this assumes the index labels of `books` are unique - confirm,
# otherwise a validation row is paired with every book sharing its label.
results.merge(books[['title', 'genres']], left_index=True, right_index=True)

: 

# Program 

In [None]:
import pandas as pd

: 

In [None]:
# function for text cleaning 
import re
def clean_text(text):
    """Reduce *text* to lowercase ASCII words separated by single spaces.

    Apostrophes are deleted outright (so "don't" becomes "dont"), every other
    character outside a-z / A-Z is replaced by a space, whitespace runs are
    collapsed and the result is lowercased.
    """
    # "\'" is simply an apostrophe: delete it so contractions stay one word.
    without_apostrophes = re.sub("\'", "", text)
    # Anything that is not an ASCII letter becomes a space ...
    letters_only = re.sub("[^a-zA-Z]", " ", without_apostrophes)
    # ... and split()/join collapses the resulting whitespace runs.
    return ' '.join(letters_only.split()).lower()

# remove stopwords
from nltk.corpus import stopwords
# English stopwords from the NLTK corpus, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    """Return *text* without the whitespace-separated words found in ``stop_words``.

    The surviving words keep their order and are joined by single spaces.
    """
    return ' '.join(word for word in text.split() if word not in stop_words)



: 

In [None]:
# load vectoriser, multilabel Binarizer and the model 
import pickle
# NOTE: unpickling executes code stored in the file, so only load artifacts
# written by this notebook. Each with-block closes its file, which the
# previous bare open(...) calls never did.
with open("../assets/tfidf_vectorizer.pickle", "rb") as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)
with open("../assets/ML2", "rb") as model_file:
    loaded_model = pickle.load(model_file)
with open("../assets/multilabel_binarizer.pickle", "rb") as binarizer_file:
    multilabel_binarizer = pickle.load(binarizer_file)

: 

In [None]:
from ast import literal_eval
def infer_tags(q):
    """Predict the genres for one raw description *q*.

    The text is cleaned and stripped of stopwords, vectorised with the loaded
    TF-IDF vectorizer and classified by the loaded model.

    Returns the result of ``multilabel_binarizer.inverse_transform`` for the
    single-row prediction.
    """
    prepared = remove_stopwords(clean_text(q))
    features = tfidf_vectorizer.transform([prepared])
    prediction = loaded_model.predict(features)
    return multilabel_binarizer.inverse_transform(prediction)

: 

In [None]:
mybooks = pd.read_pickle('../assets/my_books.pkl')
# Keep only rows that have a description. .copy() makes the filtered frame
# independent, so the column assignment below does not write into a slice of
# the loaded frame (avoids pandas' SettingWithCopyWarning).
mybooks = mybooks.query('Description.notna()').copy()

# remove non english
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException


def _detect_language_or_unknown(description):
    """Return the detected language code, or 'unknown' when detection fails."""
    try:
        return detect(description)
    except LangDetectException:
        # One undetectable description must not abort the whole apply();
        # the 'unknown' rows are dropped by the "en" filter below.
        return 'unknown'


mybooks['Language_detect'] = mybooks['Description'].apply(_detect_language_or_unknown)
mybooks = (
    mybooks.query('Language_detect=="en"')
    .drop(columns=['Language_detect'])
    )

: 

In [None]:
# Spot-check the model on 50 random draws (each draw is independent, so the
# same book can come up more than once).
for _ in range(50):
    # Take the sampled row by position: looking it up again by index label
    # returns several rows whenever that label is not unique.
    row = mybooks.sample(1).iloc[0]
    # These are books, so the output is labelled "Book" rather than "Movie".
    print("Book: ", row['Title'], "\nPredicted genre: ", infer_tags(row['Description']))

: 

In [None]:
# predict genre and add it to a new column "genres"
# (each cell holds the raw infer_tags result until makelist converts it to text)
mybooks['genres'] = mybooks['Description'].apply(infer_tags)

def makelist(list1):
    """Render predicted genres as the text of one flat list.

    ``list1`` is a list whose items are tuples (or lists) of genre names, as
    produced by ``infer_tags``; for example ``[('Fantasy', 'Fiction')]``
    becomes ``"['Fantasy', 'Fiction']"``. Items that are not tuples or lists
    are kept as single entries.

    The names are flattened structurally rather than by deleting "(" and ")"
    from the repr, so genre names that themselves contain parentheses are
    preserved and input with several tuples still yields a well-formed list.
    """
    flat = []
    for item in list1:
        if isinstance(item, (tuple, list)):
            flat.extend(item)
        else:
            flat.append(item)
    return str(flat)

# Replace each prediction with its text form as produced by makelist.
mybooks['genres'] = mybooks['genres'].apply(makelist)


: 

In [None]:
# Eyeball 50 random titles next to their predicted genres.
mybooks[['Title','genres']].sample(50)

: 

In [None]:
# Persist the books together with their predicted genres column.
pd.to_pickle(mybooks, '../assets/my_books_genres.pickle')

: 