In [16]:
import pandas as pd

from sklearn.utils import shuffle
df = pd.read_csv(
    'https://raw.githubusercontent.com/sugatagh/E-commerce-Text-Classification/main/Dataset/ecommerceDataset.csv', 
    names = ['label', 'description']
)
df = df[['description', 'label']]




In [17]:
df.drop_duplicates(inplace = True) 

df = df.dropna(subset=['description'])

df.reset_index(drop = True, inplace = True)

df = shuffle(df)

df = df[0:1000]

### EDA

In [18]:
# Calculate the length of each text
df['text_length'] = df['description'].apply(len)

# Get the unique categories
categories = df['label'].unique()

In [19]:
import plotly.graph_objects as go
import plotly.express as px


# Create a histogram for each category
fig = go.Figure()
for category in categories:
    fig.add_trace(go.Histogram(x=df[df['label'] == category]['text_length'], 
                               name=category))

# Update layout for better visualization
fig.update_layout(barmode='stack',
                  xaxis_title='Text Length',
                  yaxis_title='Count',
                  title='Histogram of Text Length in Each Category',
                  autosize=False,
                  width=1000,
                  height=600)

fig.show()

In [20]:
fig = px.box(df, x='label', y='text_length', color='label')

# Update layout for better visualization
fig.update_layout(title='Box Plot of Text Length for Each Category',
                  autosize=False,
                  width=1000,
                  height=600)

fig.show()

In [21]:
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    df_out = df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]
    return df_out

# Apply the function to each category
df_clean = pd.concat([remove_outliers(sub_df, 'text_length') for category, sub_df in df.groupby('label')])

In [22]:
# Create a histogram for each category
fig = go.Figure()
for category in categories:
    fig.add_trace(go.Histogram(x=df_clean[df_clean['label'] == category]['text_length'], 
                               name=category))

# Update layout for better visualization
fig.update_layout(barmode='stack',
                  xaxis_title='Text Length',
                  yaxis_title='Count',
                  title='Histogram of Text Length in Each Category',
                  autosize=False,
                  width=1000,
                  height=600)

fig.show()

In [23]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def cat_summary_with_graph(dataframe, col_name):
    # Define colors
    colors = ['#494BD3', '#E28AE2', '#F1F481', '#79DB80', '#DF5F5F',
              '#69DADE', '#C2E37D', '#E26580', '#D39F49', '#B96FE3']
    
    # Create subplots
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Countplot', 'Percentages'),
                        specs=[[{"type": "xy"}, {'type': 'domain'}]])
    
    # Countplot
    value_counts = dataframe[col_name].value_counts()
    fig.add_trace(go.Bar(y=value_counts.values.tolist(),
                         x=[str(i) for i in value_counts.index],
                         text=value_counts.values.tolist(),
                         textfont=dict(size=15),
                         name=col_name,
                         textposition='auto',
                         showlegend=False,
                         marker=dict(color=colors,
                                     line=dict(color='#DBE6EC', width=1))),
                  row=1, col=1)
    
    # Pie chart
    fig.add_trace(go.Pie(labels=value_counts.keys(),
                         values=value_counts.values,
                         textfont=dict(size=20),
                         textposition='auto',
                         showlegend=False,
                         name=col_name,
                         marker=dict(colors=colors)),
                  row=1, col=2)
    
    # Update layout
    fig.update_layout(title={'text': col_name,
                             'y': 0.9,
                             'x': 0.5,
                             'xanchor': 'center',
                             'yanchor': 'top'},
                      template='plotly_white')
    
    fig.show()


# Call the function
cat_summary_with_graph(df_clean, 'label')


In [24]:

from typing import Iterable
from nltk.util import ngrams
from collections import Counter

class GetNgramsFrequency:
    
    def __init__(self, text_list : Iterable) -> None:
        self.text_list = text_list
        self.unigram = None
        self.bigram = None
        self.trigram = None
        
    def find_n_gram(self, size, top_n):
        ngrams_all = []
        for document in self.text_list:
            tokens = document.split()
            if len(tokens) <= size:
                continue
            else:
                output = list(ngrams(tokens, size))
            for ngram in output:
                ngrams_all.append(" ".join(ngram))
        cnt_ngram = Counter()
        for word in ngrams_all:
            cnt_ngram[word] += 1
        df = pd.DataFrame.from_dict(cnt_ngram, orient='index').reset_index()
        df = df.rename(columns={'index':'words', 0:'count'})
        df = df.sort_values(by='count', ascending=False)
        df = df.head(top_n)
        df = df.sort_values(by='count')
        return(df)
    

    def generate_n_grams(self, top_n):
        
        self.unigram = self.find_n_gram(size=1, top_n=top_n)
        self.bigram = self.find_n_gram(size=2, top_n=top_n)
        self.trigram = self.find_n_gram(size=3, top_n=top_n)
        
        
    def plot_distribution(self):
        fig = make_subplots(rows=1, cols=3, subplot_titles=('Unigrams', 'Bigrams', 'Trigrams'))
        fig.add_trace(go.Bar(x=self.unigram['count'], y=self.unigram['words'], orientation='h', marker=dict(opacity=0.5)), row=1, col=1)
        fig.add_trace(go.Bar(x=self.bigram['count'], y=self.bigram['words'], orientation='h', marker=dict(opacity=0.5)), row=1, col=2)
        fig.add_trace(go.Bar(x=self.trigram['count'], y=self.trigram['words'], orientation='h', marker=dict(opacity=0.5)), row=1, col=3)
        fig.update_layout(height=700, width=1500, showlegend=False)
        fig.update_xaxes(title_text='Count', row=1, col=1)
        fig.update_xaxes(title_text='Count', row=1, col=2)
        fig.update_xaxes(title_text='Count', row=1, col=3)
        fig.show()

In [25]:
n_gram_analyzer = GetNgramsFrequency(text_list=df_clean["description"])
n_gram_analyzer.generate_n_grams(top_n=10)

In [26]:
n_gram_analyzer.plot_distribution()

### Text Cleaning

In [27]:
from utils import TextCleaner
from utils import TextPreprocess

In [28]:
text_cleaner = TextCleaner()

In [29]:
df_clean["cleaned_text"] = df_clean["description"].apply(text_cleaner)

In [30]:
processor = TextPreprocess()

In [31]:
df_clean["cleaned_text_processed"] = df_clean["cleaned_text"].apply(processor)

In [32]:
df_clean.drop(['description',"text_length","cleaned_text"], axis= 1 , inplace=True)
df_clean.rename({"cleaned_text_processed" : "text"}, inplace=True)
df_clean.reset_index(inplace=True)

In [36]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [37]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [38]:
encoder = LabelEncoder()
df_clean.label = encoder.fit_transform(df_clean.label)

df_clean.head()

Unnamed: 0,index,label,cleaned_text_processed
0,12943,0,sapien brief histori humankind review recommen...
1,11862,0,ibp 2019 specialist offic market offic scale p...
2,12292,0,offici toefl ibt test vol 2 dvd author educ te...
3,15388,0,anim shape modern medicin one health histori m...
4,15588,0,atla osteopath techniqu


In [39]:
df_clean.drop(['index'], axis= 1 , inplace=True)

In [40]:
train_X, test_X, train_Y, test_Y = train_test_split(df_clean['cleaned_text_processed'], df_clean['label'], train_size = 0.7, shuffle = True)

In [41]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")



IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



In [42]:
train_tokens = tokenizer(list(train_X), padding = True, truncation=True)
test_tokens = tokenizer(list(test_X), padding = True, truncation=True)

In [43]:
train_tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [44]:
print(train_tokens['input_ids'][0])
print(tokenizer.decode(train_tokens['input_ids'][0]))

[101, 5871, 2158, 3295, 2511, 112, 176, 16423, 3254, 16156, 5871, 2158, 3501, 14516, 10182, 1197, 8228, 2105, 1197, 2774, 5871, 2158, 5871, 2158, 8228, 2105, 1197, 2774, 3501, 14516, 10182, 1197, 3254, 16156, 5871, 2158, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [45]:
class TokenData(Dataset):
    def __init__(self, train = False):
        if train:
            self.text_data = train_X
            self.tokens = train_tokens
            self.labels = list(train_Y)
        else:
            self.text_data = test_X
            self.tokens = test_tokens
            self.labels = list(test_Y)

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [46]:
batch_size = 32
train_dataset = TokenData(train = True)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = TokenData(train = False)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

In [48]:
from torch.optim import AdamW
from transformers import BertForSequenceClassification
bert_model = BertForSequenceClassification.from_pretrained('bert-base-cased') # Pre-trained model
optimizer = AdamW(bert_model.parameters(), lr=1e-5) # Optimization function
loss_fn = torch.nn.CrossEntropyLoss() # Loss function

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
num_epochs = 3
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
# device = "cpu"
bert_model.to(device) # Transfer model to GPU if available

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [50]:
for epoch in range(num_epochs):
    print("Epoch: ",(epoch + 1))
    # TRAINING BLOCK STARTS
    bert_model.train()
    for i,batch in enumerate(train_loader):    
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Setting the gradients to zero
        optimizer.zero_grad()
        
        # Passing the data to the model
        outputs = bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
        
        # The logits will be used for measuring the loss
        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])

        # Calculating the gradient for the loss function
        loss.backward()
        
        # Optimizing the parameters of the bert model
        optimizer.step()

        # Calculating the running loss for logging purposes
        train_batch_loss = loss.item()
        train_last_loss = train_batch_loss / batch_size

        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))
    # Logging epoch-wise training loss
    print(f"\nTraining epoch {epoch + 1} loss: ",train_last_loss)
    # TRAINING BLOCK ENDS 

    # TESTING BLOCK STARTS
    bert_model.eval()
    correct = 0
    test_pred = []
    for i, batch in enumerate(test_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # We don't need gradients for testing
        with torch.no_grad():
            outputs = bert_model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'])
        
        # Logits act as predictions
        logits = outputs.logits
        
        # Calculating total batch loss using the logits and labels
        loss = loss_fn(logits, batch['labels'])
        test_batch_loss = loss.item()
        
        # Calculating the mean batch loss
        test_last_loss = test_batch_loss / batch_size
        print('Testing batch {} loss: {}'.format(i + 1, test_last_loss))
        
        # Comparing the predicted target with the labels in the batch
        correct += (logits.argmax(1) == batch['labels']).sum().item()
        print("Testing accuracy: ",correct/((i + 1) * batch_size))
    
    print(f"\nTesting epoch {epoch + 1} last loss: ",test_last_loss)
    # TESTING BLOCK ENDS

Epoch:  1


IndexError: Target 3 is out of bounds.

In [52]:
for i,batch in enumerate(train_loader):    
    batch = {k: v.to(device) for k, v in batch.items()}
    print(batch)
    break
    
        

{'input_ids': tensor([[  101,  1137,  1968,  ...,     0,     0,     0],
        [  101, 12488, 10401,  ...,     0,     0,     0],
        [  101,  2936,  1782,  ...,     0,     0,     0],
        ...,
        [  101,  1112,  1358,  ...,     0,     0,     0],
        [  101,  1821,  1403,  ...,     0,     0,     0],
        [  101,  8468,  4578,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([3, 2, 0, 3, 0, 1, 2, 3, 0, 2, 1, 1, 3, 0, 2, 2, 0, 3, 0, 3, 2, 3, 2, 2,
        1, 0, 3, 2, 2, 2, 2, 3])}
