In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import numpy as np
from wordcloud import WordCloud, STOPWORDS
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from itertools import chain
import re
from tensorflow.keras.preprocessing.text import one_hot
from yellowbrick.classifier import ROCAUC
import unicodedata
from gensim.models.fasttext import FastText
from sklearn.metrics import roc_curve,auc, roc_auc_score, cohen_kappa_score, brier_score_loss
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
import tensorflow as tf
from itertools import cycle
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
%matplotlib inline

In [None]:
# Load the mask image used to shape the word cloud and convert it to a NumPy array.
w_mask=np.array(Image.open("/kaggle/input/word-cloud-5/stormtrooper_mask.png"))

In [None]:
# Read the dataset, naming its two columns "label" and "text", then preview the first rows.
df=pd.read_csv("/kaggle/input/ecommerce-text-classification/ecommerceDataset.csv",names=["label","text"])
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Number of missing values per column before any cleaning.
df.isnull().sum()

In [None]:
# Drop every row that has a missing value in any column (modifies df in place).
# The index is not reset, so the surviving rows keep their original labels.
df.dropna(inplace=True)

In [None]:
# Re-check missing values after the drop.
df.isnull().sum()

In [None]:
# Number of rows per category label.
df.label.value_counts()

In [None]:
# Bar chart of the number of rows per label, with the count written above each bar.
plt.figure(figsize=(7, 5))
ax = sns.countplot(x="label", data=df, palette="tab10")

for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center_x = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        '{}'.format(int(bar_height)),
        (bar_center_x, bar_height),
        ha='center',
        va='center',
        xytext=(0, 5),  # nudge the text 5 points above the bar top
        textcoords='offset points',
    )

ax.set_xlabel("Label")
ax.set_ylabel("Count")
ax.set_title("Distribution of Labels with Count Annotations")
plt.xticks(rotation=45)
plt.show()



In [None]:
# Word cloud over the whole corpus, drawn inside the mask image loaded earlier.
all_text = df['text'].str.cat(sep=' ')
plt.figure(figsize=(12, 8))

# Keep the word-cloud stop words under their own name: binding this set to
# `stopwords` would shadow the `nltk.corpus.stopwords` object imported at the
# top of the notebook and break later `stopwords.words(...)` calls.
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    width=800,
    height=800,
    max_words=1000,
    mask=w_mask,
    background_color='black',
    colormap="nipy_spectral",
    stopwords=wc_stopwords,
).generate(all_text)

plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()


In [None]:
# Encode the string labels as integers in a new column "t_label" and preview the result.
label_encoder = LabelEncoder()
df['t_label'] = label_encoder.fit_transform(df['label'])
df.head()

In [None]:
# Round-trip check: decode the integer labels back to strings and print them
# next to the original label column.
df['decoded_category'] = label_encoder.inverse_transform(df['t_label'])
print(df[['label', 't_label', 'decoded_category']])

In [None]:
# Count each distinct (label, t_label, decoded_category) combination to show the label -> integer mapping.
df[['label', 't_label', 'decoded_category']].value_counts()

In [None]:
# Keep only the text and the encoded label. Re-binding `df` instead of dropping
# in place, together with errors="ignore", makes this cell safe to re-run: the
# in-place version raised KeyError on a second execution because the columns
# were already gone.
df = df.drop(columns=["label", "decoded_category"], errors="ignore")


##### Books======                    0  
##### Clothing & Accessories======   1
##### Electronics======              2       
##### Household =======              3  

In [None]:
# Preview the frame after the label columns were dropped.
df.head()

In [None]:
# Snapshot of the frame taken before the text column is cleaned and re-encoded below;
# df2 is read again in the custom-prediction section.
df2=df.copy()

In [None]:
# Character lengths of the texts of each encoded class (0..3).
b_len, c_len, e_len, h_len = (
    df.loc[df["t_label"] == class_id, "text"].str.len() for class_id in range(4)
)

# One histogram (with KDE overlay) per class, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.flatten()
default_palette = sns.color_palette()
for label, ax in enumerate(axes):
    text_lengths = df.loc[df["t_label"] == label, "text"].str.len()
    sns.histplot(text_lengths, bins=20, color=default_palette[label], ax=ax, kde=True)

    ax.set_title(f'Distribution of Text Lengths for Label {label}')
    ax.set_xlabel('Text Length')
    ax.set_ylabel('Frequency')
    ax.grid(True)

plt.tight_layout()
plt.show()


In [None]:
# Count whitespace-separated tokens across the whole corpus and chart the 30 most frequent.
data_set = df["text"].str.split()
all_words = [word for tokens in data_set for word in tokens]
counter = Counter(all_words)
common_words = counter.most_common(30)
df_common_words = pd.DataFrame(common_words, columns=['Word', 'Count'])

# One colour per bar (30 entries, matching the 30 words plotted).
colors = [
    "cyan", "lime", "magenta", "gold", "purple", "tomato", "teal", "sandybrown",
    "mediumseagreen", "royalblue", "darkorchid", "darkturquoise", "darkgoldenrod",
    "mediumvioletred", "mediumaquamarine", "lightcoral", "darkslategray", "olivedrab",
    "dodgerblue", "indianred", "limegreen", "steelblue", "darkviolet", "chocolate",
    "mediumslateblue", "darkgreen", "orangered", "mediumblue", "peru", "mediumspringgreen",
]

plt.figure(figsize=(12, 6))
words_ax = sns.barplot(x='Count', y='Word', data=df_common_words, palette=colors)
words_ax.set_title('30 Most Common Words')
words_ax.set_xlabel('Count')
words_ax.set_ylabel('Word')
plt.show()

# Cleaning Data

In [None]:
# Show one raw text (index label 100) as a "before cleaning" example.
df["text"][100]

In [None]:
from nltk.corpus import stopwords

# Build the English stop-word set once. Creating it inside clean_text repeated
# the same corpus lookup for every row processed through DataFrame.apply.
ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalize one product description before vectorization.

    Steps, in order:
      1. strip HTML markup (only attempted when the text contains "<");
      2. remove URLs starting with http://, https:// or www.;
      3. fold accented characters to their ASCII base letters;
      4. replace every character that is not an ASCII letter with a space;
      5. collapse whitespace and drop English stop words (compared in lower case).

    The case of the remaining words is left unchanged.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The remaining words joined by single spaces ("" when nothing remains).
    """
    # Remove HTML tags if present
    if "<" in text:
        text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URL addresses
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove accented characters: NFKD splits a letter from its accent, and the
    # ASCII round trip then discards everything that is not plain ASCII.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Replace punctuation, digits and any other non-letter with a space. A single
    # pass suffices: the earlier `[^a-zA-Z0-9]` substitution was fully covered by this one.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    # str.split() with no argument already collapses runs of whitespace and
    # ignores leading/trailing blanks, so no separate whitespace clean-up is needed.
    words = text.split()

    # Remove stopwords
    return ' '.join(word for word in words if word.lower() not in ENGLISH_STOP_WORDS)

In [None]:

# Replace every text with its cleaned version (see clean_text above).
df["text"] = df["text"].apply(clean_text)

In [None]:
# Same example row (index label 100), now after cleaning.
df["text"][100]

In [None]:
# Mean length, in characters, of the cleaned texts.
avg_len=df["text"].str.len().mean()
print("Average text length :",avg_len)

In [None]:
# Preview the frame after cleaning.
df.head()

#  TfidfVectorizer

In [None]:

# Fit a TF-IDF vectorizer (default settings) on all cleaned texts and build the
# sparse document-term matrix.
text_data=df["text"]
tfidf = TfidfVectorizer()

tfidf_matrix = tfidf.fit_transform(text_data)

print("TF-IDF Matrix (sparse representation):\n", tfidf_matrix)

In [None]:
# Feature names (terms) learned by the fitted vectorizer.
vocabulary =tfidf.get_feature_names_out()
print("Vocabulary:", vocabulary)


In [None]:
# Integer-encoded target column.
label_data=df["t_label"]

In [None]:
# Split the cleaned texts first, then fit TF-IDF on the training portion only,
# so that the vocabulary and IDF weights are not learned from the held-out
# documents (fitting on the full corpus before splitting leaks test data).
# `tfidf` is re-bound to the train-fitted vectorizer so that any later
# `tfidf.transform(...)` matches the feature space the classifier was trained on.
X_train_text, X_test_text, Y_train, Y_test = train_test_split(
    text_data, label_data, test_size=0.2, random_state=42
)
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

In [None]:
# Multinomial Naive Bayes baseline trained on the TF-IDF features.
model = MultinomialNB()
model.fit(X_train, Y_train)

In [None]:
# Predict the held-out rows and report plain accuracy.
pred=model.predict(X_test)
accuracy = accuracy_score(Y_test, pred)
print("Accuracy:", accuracy)

In [None]:
# Confusion matrix of the Naive Bayes predictions; tick labels follow the
# encoded class order 0..3.
label_name = ["Books", "Clothing & Accessories", "Electronics", "Household"]
cf = confusion_matrix(Y_test, pred)

plt.figure(figsize=(6, 5))
heat_ax = sns.heatmap(
    cf,
    annot=True,
    fmt="d",
    cmap="hsv",
    xticklabels=label_name,
    yticklabels=label_name,
)
heat_ax.set_title("Confusion Matrix")
heat_ax.set_xlabel("Predicted")
heat_ax.set_ylabel("Actual")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
print(classification_report(Y_test,pred,target_names=label_name))

In [None]:
# Number of held-out labels.
Y_test.shape

In [None]:
# Yellowbrick ROC/AUC visualizer wrapped around the Naive Bayes model, with the
# four encoded class ids; it is fitted on the training split, scored on the
# held-out split and then rendered.
plt.figure(figsize=(8,5))
visualizer = ROCAUC(model, classes=[0, 1, 2, 3])
visualizer.fit(X_train, Y_train)
visualizer.score(X_test, Y_test)
visualizer.show()

In [None]:
# Cohen's kappa for the Naive Bayes predictions, shown as centred text on an
# otherwise blank figure.
kappa = cohen_kappa_score(Y_test, pred)
kappa_caption = f'Cohen Kappa Score: {kappa:.4f}'

plt.figure(figsize=(6, 4))
canvas = plt.gca()
canvas.plot([])
canvas.text(0, 0, kappa_caption, fontsize=16, ha='center', va='center', color="red")
canvas.axis('off')

# Symmetric limits keep the point (0, 0) in the middle of the canvas.
canvas.set_xlim(-1, 1)
canvas.set_ylim(-1, 1)

plt.show()

In [None]:
# One-vs-rest ROC AUC for the Naive Bayes class probabilities, shown as centred
# text on an otherwise blank figure.
y_prob = model.predict_proba(X_test)
roc_auc = roc_auc_score(Y_test, y_prob, multi_class='ovr')
roc_auc_caption = f'ROC AUC Score: {roc_auc:.4f}'

plt.figure(figsize=(6, 4))
canvas = plt.gca()
canvas.plot([])
canvas.text(0, 0, roc_auc_caption, fontsize=16, ha='center', va='center', color="green")
canvas.axis('off')

# Symmetric limits keep the point (0, 0) in the middle of the canvas.
canvas.set_xlim(-1, 1)
canvas.set_ylim(-1, 1)

plt.show()


In [None]:
# Brier score computed one-vs-rest for every class column of y_prob and then
# averaged over the classes; shown as centred text on a blank figure.
brier_loss_per_class = []
for class_id in range(y_prob.shape[1]):
    is_class = (Y_test == class_id).astype(int)
    brier_loss_per_class.append(brier_score_loss(is_class, y_prob[:, class_id]))
mean_brier_loss = np.mean(brier_loss_per_class)
brier_caption = f'Mean Brier Score Loss: {mean_brier_loss:.4f}'

plt.figure(figsize=(6, 4))
canvas = plt.gca()
canvas.plot([])
canvas.text(0, 0, brier_caption, fontsize=16, ha='center', va='center', color="blue")
canvas.axis('off')
canvas.set_xlim(-1, 1)
canvas.set_ylim(-1, 1)
plt.show()

# Fasttext==>(skipgram)+Onehot+GRU

In [None]:
# max_len: length every sequence is padded/truncated to further below.
max_len=500
# Each cleaned text as a list of whitespace-separated tokens.
text_list =df["text"].str.split().tolist()
# Encoded labels as a NumPy array.
my_label=df["t_label"].values

# Fasttext With Skipgram

In [None]:
# Train FastText on the tokenized corpus: 100-dimensional vectors, context
# window 5, every word kept (min_count=1), skip-gram (sg=1), 4 worker threads.
fasttext_model = FastText(text_list, vector_size=100, window=5, min_count=1, workers=4, sg=1)

# Check the word embedding for a particular word

In [None]:
# Embedding vector for the word 'king'.
fasttext_model.wv['king']

# The dimension must be 100

In [None]:
# Shape of a single word vector (vector_size was set to 100 when training).
fasttext_model.wv['king'].shape

# Check the top 10 most similar words for a given word with gensim FastText

In [None]:
# Ten nearest neighbours of 'wife' in the embedding space.
fasttext_model.wv.most_similar('wife', topn=10)

# Check the similarity score between two words

In [None]:
# Similarity score between the vectors of 'beer' and 'drink'.
fasttext_model.wv.similarity('beer', 'drink')


# Words most dissimilar to a given word

In [None]:
# Passing 'cat' as a negative example returns the ten words ranked as least similar to it.
fasttext_model.wv.most_similar(negative=["cat"], topn=10)


In [None]:
# Fit a Keras tokenizer on the token lists, convert them to integer sequences
# and pad them to max_len.
# NOTE(review): pad_seq is not referenced again in the visible notebook; the
# model below is trained on the one_hot-encoded sequences instead.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_list)
text_seq=tokenizer.texts_to_sequences(text_list)
pad_seq=pad_sequences(text_seq,maxlen=max_len)

In [None]:
# vocab_size: number of distinct words known to the FastText model.
vocab_size=len(fasttext_model.wv.key_to_index)
# vocab: word -> index mapping of the Keras tokenizer (built separately from FastText).
vocab=tokenizer.word_index
print("Vocab Size :",vocab_size)
# print(vocab)

In [None]:
# Raw FastText word-vector array, in the row order used by the FastText model itself.
embedding_matrix = fasttext_model.wv.vectors
embedding_matrix.shape

# Onehot

In [None]:
# Hash every word of every cleaned text to an integer below vocab_size.
# Keras one_hot uses a hashing function, so different words may share an index.
# NOTE(review): this overwrites df['text'] with the integer lists, so the cell
# cannot be re-run without first re-creating the cleaned text column.
texts = df['text']
one_hot_texts = [one_hot(text,vocab_size) for text in texts]
df['text'] = one_hot_texts

print(df.head())

In [None]:
# Pad the hashed sequences at the front (padding='pre') to max_len.
my_onehot_data=df["text"]
embedded_doc=pad_sequences(my_onehot_data,padding='pre',maxlen=max_len)
embedded_doc

In [None]:
# Model inputs (padded sequences) and targets (encoded labels) as NumPy arrays.
my_text=np.array(embedded_doc)
y=df["t_label"]
my_label=np.array(y)

In [None]:
# 80/20 split for the GRU model, using the same test_size and random_state as
# the TF-IDF split above.
X_train0, X_test0, Y_train0, Y_test0 = train_test_split(my_text,my_label, test_size=0.2, random_state=42)

In [None]:
# Report the shapes of the four arrays produced by the split. The last caption
# previously read "Train label Data shappendape" although it prints the TEST labels.
print("Train text Data shape :",X_train0.shape)
print("====================================================")
print("Test text Data shape :",X_test0.shape)
print("====================================================")
print("Train label Data shape :",Y_train0.shape)
print("====================================================")
print("Test label Data shape :",Y_test0.shape)

# GRU Model

In [None]:
# GRU classifier: embedding layer initialised from the FastText vectors ->
# GRU(128) -> dropout 0.3 -> 4-way softmax, trained with Adam on
# sparse categorical cross-entropy (targets are integer class ids).
# NOTE(review): the inputs fed to this model come from `one_hot` hashing, whose
# indices are unrelated to the row order of `fasttext_model.wv.vectors`, so row
# i of the initial weights is not the vector of the word hashed to i. Confirm
# whether indices aligned with the FastText vocabulary were intended.
embedding_dim=100
model2 = Sequential()
model2.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,weights=[embedding_matrix],input_length=max_len))
model2.add(GRU(units=128, activation="tanh"))
model2.add(Dropout(0.3))
model2.add(Dense(4, activation='softmax'))
model2.compile(optimizer=tf.keras.optimizers.Adam(), loss="sparse_categorical_crossentropy", metrics=['accuracy'])
model2.summary()

In [None]:
# Train for 10 epochs with batch size 32. The held-out split is passed as
# validation_data, so the val_* curves are computed on the same rows that the
# evaluation cells below use.
history = model2.fit(X_train0, Y_train0, validation_data=(X_test0, Y_test0), epochs=10, batch_size=32)

In [None]:
# Training curves per epoch: accuracy on the left panel, loss on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
accuracy_ax, loss_ax = ax
epoch_metrics = history.history

# Left panel: training vs. validation accuracy
accuracy_ax.plot(epoch_metrics['accuracy'], label="Accuracy", color="blue")
accuracy_ax.plot(epoch_metrics['val_accuracy'], label="Validation Accuracy", color="brown")
accuracy_ax.set_title('Model Accuracy')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.legend(loc='best')

# Right panel: training vs. validation loss
loss_ax.plot(epoch_metrics['loss'], label="Loss", color="orange")
loss_ax.plot(epoch_metrics['val_loss'], label="Validation Loss", color="purple")
loss_ax.set_title('Model Loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(loc='best')

plt.show()


In [None]:
# pred2: per-class softmax outputs for the held-out rows;
# final_predict: index of the highest-scoring class for each row.
pred2=model2.predict(X_test0)
final_predict=np.argmax(pred2,axis=1)

# Confusion Matrix

In [None]:
# Confusion matrix of the GRU predictions; tick labels follow the encoded
# class order 0..3.
label_name = ["Books", "Clothing & Accessories", "Electronics", "Household"]
cf = confusion_matrix(Y_test0, final_predict)

plt.figure(figsize=(6, 5))
heat_ax = sns.heatmap(
    cf,
    annot=True,
    fmt="d",
    cmap="cool",
    xticklabels=label_name,
    yticklabels=label_name,
)
heat_ax.set_title("Confusion Matrix")
heat_ax.set_xlabel("Predicted")
heat_ax.set_ylabel("Actual")
plt.show()

# Classification Report

In [None]:
# Per-class precision / recall / F1 for the GRU predictions.
print(classification_report(Y_test0,final_predict,target_names=label_name))

# ROC Curve

In [None]:
# One-vs-rest ROC curve per class: class i is the positive class and column i
# of the softmax output is its score.
n_classes = 4
fpr, tpr, roc_auc = {}, {}, {}

for i in range(n_classes):
    is_class_i = Y_test0 == i
    fpr[i], tpr[i], _ = roc_curve(is_class_i, pred2[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curves
plt.figure()
colors = cycle(['blue', 'red', 'green', 'purple'])  # Add more colors as needed
for i, color in zip(range(n_classes), colors):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=curve_label)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) for Multi-class')
plt.legend(loc="lower right")
plt.show()

# cohen_kappa_score

In [None]:
# Cohen's kappa for the GRU predictions, shown as centred text on an otherwise
# blank figure.
kappa = cohen_kappa_score(Y_test0, final_predict)
kappa_caption = f'Cohen Kappa Score: {kappa:.4f}'

plt.figure(figsize=(6, 4))
canvas = plt.gca()
canvas.plot([])
canvas.text(0, 0, kappa_caption, fontsize=16, ha='center', va='center', color="red")
canvas.axis('off')

# Symmetric limits keep the point (0, 0) in the middle of the canvas.
canvas.set_xlim(-1, 1)
canvas.set_ylim(-1, 1)

plt.show()

# ROC AUC Score

In [None]:
# One-vs-rest ROC AUC for the GRU class probabilities, shown as centred text on
# an otherwise blank figure. Note that `roc_auc` is re-bound here to a scalar.
roc_auc = roc_auc_score(Y_test0, pred2, multi_class='ovr')
roc_auc_caption = f'ROC AUC Score: {roc_auc:.4f}'

plt.figure(figsize=(6, 4))
canvas = plt.gca()
canvas.plot([])
canvas.text(0, 0, roc_auc_caption, fontsize=16, ha='center', va='center', color="green")
canvas.axis('off')

# Symmetric limits keep the point (0, 0) in the middle of the canvas.
canvas.set_xlim(-1, 1)
canvas.set_ylim(-1, 1)

plt.show()

# brier_score_loss

In [None]:
# Multi-class Brier score of the GRU model: squared error between the predicted
# probability vector and the one-hot truth, summed over classes and averaged
# over rows. (This sums over classes, whereas the Naive Bayes cell averages
# per-class scores, so the two numbers are not on the same scale.)
pred_probabilities = model2.predict(X_test0)
one_hot_true = np.zeros_like(pred_probabilities)
# Index with Y_test0, the labels that belong to X_test0. The previous code used
# Y_test, the pandas Series from the separate TF-IDF split, which only lined up
# because both splits share the same size and random_state.
one_hot_true[np.arange(len(Y_test0)), Y_test0] = 1
brier_loss = np.mean(np.sum((pred_probabilities - one_hot_true)**2, axis=1))
plt.figure(figsize=(6,4))
plt.plot([])

plt.text(0, 0, f'Brier Loss: {brier_loss:.4f}', fontsize=16, ha='center', va='center', color="green")
plt.axis('off')

plt.xlim(-1, 1)
plt.ylim(-1, 1)


plt.show()


```
Books====== 0

Clothing & Accessories====== 1

Electronics====== 2

Household ======= 3
```

# Custom Data Prediction

In [None]:
# Take the 20th Household text (class 3) from the pre-cleaning snapshot.
# `.iloc` selects by position; the former `.head(20)[19]` looked up the index
# LABEL 19, which is only the 20th row when the index is an unbroken 0..n range.
df2.loc[df2["t_label"] == 3, "text"].iloc[19]

In [None]:
# Classify a hand-picked description with the GRU model.
custom_texts = ["Path of Meditation: A Step-by-step Guide to Meditation The culture that will arise in the future, if it is truly to be for the evolution of mankind, will be a balance of science and religion. This culture will be a synthesis of religion and science. It will not be only religious or only scientific: it will be either scientifically religious or religiously scientific Osho The talks in this book are transcriptions of a meditation program led by Osho in the beautiful hills of Mahabaleshwar. It is a step by step account of how to prepare the body, mind and emotions to enhance your meditation. Osho guides the participants and the reader alike in the use of powerful meditation techniques and suggests many useful applications for supporting meditation in your daily life. He also describes the different stages that happen on the path of meditation. This book is a must-read for both the new and the experienced meditator. Even more, it is an invitation to experiment with these life-transformi"]

# Apply the same cleaning as the training texts before hashing: the model was
# trained on clean_text output, so raw text would be encoded differently.
custom_one_hot_texts = [one_hot(clean_text(text), vocab_size) for text in custom_texts]
custom_embedded_doc = pad_sequences(custom_one_hot_texts, padding='pre', maxlen=max_len)

custom_predictions = model2.predict(custom_embedded_doc)

# Most likely class and its probability for every input text.
predicted_classes = np.argmax(custom_predictions, axis=1)
class_probabilities = np.max(custom_predictions, axis=1)

for predicted_class, probability in zip(predicted_classes, class_probabilities):
    print(f"Predicted Class: {predicted_class}, Probability: {probability}")


In [None]:
# Classify a hand-picked Household description with the GRU model.
custom_texts = ["ART DIOR | Dancing Village Girls | Canvas Wall Art | Unframed Canvas Art Print | 18 inch x 46 inch | Enclosure Material:Canvas Art Print\xa0|\xa0 Size:18 x 46 Art Print Only   Giclée art from Creative team of Art Dior. Printed by dedicated art printer on Epson 9900 on very specialized coated canvas to bring out brush stroke details, brilliance of colors, dimensional stability and lasting colors for archival use. Add soul to your walls, add volumes of euphoric feel once properly framed and displayed on Living, Drawing & Dining Room, Gallery, Hotels, Bar, Lounge, Restaurants, Office, Reception, Kitchen Area, Bathroom. Packed with utmost care for mint fresh condition delivery. Art prints are of very specialized fine ink quality and color gamut to maintain color depth and brilliance, direct sunlight should be avoided. Please confirm size before placing any order. TERMS & CONDITIONS B"]

# Apply the same cleaning as the training texts before hashing: the model was
# trained on clean_text output, so raw text would be encoded differently.
custom_one_hot_texts = [one_hot(clean_text(text), vocab_size) for text in custom_texts]
custom_embedded_doc = pad_sequences(custom_one_hot_texts, padding='pre', maxlen=max_len)

custom_predictions = model2.predict(custom_embedded_doc)

predicted_classes = np.argmax(custom_predictions, axis=1)
class_probabilities = np.max(custom_predictions, axis=1)

# Report each text separately. The former `if predicted_classes==3:` compared a
# whole array and only worked because exactly one text was supplied; with more
# inputs the truth value of the array is ambiguous and raises ValueError.
for i in range(len(custom_texts)):
    if predicted_classes[i] == 3:
        print("Household")
    else:
        print("Others")
    print(f"Predicted Class: {predicted_classes[i]}, Probability: {class_probabilities[i]}")


# References

https://thinkinfi.com/fasttext-word-embeddings-python-implementation/

https://saturncloud.io/blog/how-to-use-scikitlearns-multiclass-roc-auc-score-for-model-evaluation/#:~:text=The%20Multiclass%20ROC%20AUC%20score%20is%20a%20metric%20that%20combines,that%20perfectly%20predicts%20all%20classes

https://www.scikit-yb.org/en/latest/api/classifier/rocauc.html