In [119]:
import warnings
warnings.filterwarnings('ignore')
import wandb
import pandas as pd
pd.set_option('max_colwidth',None)
from transformers import DistilBertModel, DistilBertTokenizer
import torch

In [2]:
# Read the training CSV from the local raw-data folder. On Kaggle the same file
# lives at ../input/data-for-aspect-based-sentimental-analysis/train.csv.
data = pd.read_csv(filepath_or_buffer='../data/raw/train.csv')

In [3]:
# Show ten randomly drawn rows as a quick sanity check of the raw data.
data.sample(n=10)

Unnamed: 0,text,aspect,label
2513,provide timely and better quality products,quality products,0
3052,"please feel free to reach out anytime if you have any questions, suggestions, or feedback!",free,1
3622,i was about to download a video project i barely finished and the site went blank and my work is gone.,download,0
1600,"i can not get the previews to load whether its with istock videos from yur library, or my own",preview,0
134,Productivity app doesn't have the ability to send push notifications on time...,push notifications,0
2837,1 ️ Dark Mode.,Dark Mode,1
2954,"Excellent App, I'm sorry I just didn't meet him before",meet,1
1476,"love this app , i use lotion (linux version of notion) and notion app which can be used together to sync data and write notes.",notion app,2
1103,i just purchased the plan and i would like you to have it activated please,purchased,1
677,the android app is so slow it is borderline unusable.,app,0


In [4]:
# Print the frame summary: index range, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4000 non-null   object
 1   aspect  4000 non-null   object
 2   label   4000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 93.9+ KB


In [5]:
# First five aspect terms (swap the column name to 'text' to preview the reviews).
data['aspect'].head(n=5)

0       cancelled
1            Milk
2    notification
3            view
4            load
Name: aspect, dtype: object

In [6]:
# First five rows in file order: text, aspect and label side by side.
data.head(n=5)

Unnamed: 0,text,aspect,label
0,can you check whether its cancelled completely?,cancelled,1
1,cannot rely on both milk delivery and grocery items.,Milk,0
2,"I get no notification, however the app is really fine",notification,0
3,"Love this app, but would love it even more if Gantt charts and Calendar view were available on iPhone!",view,1
4,it does not let me load a clip on the scene,load,0


In [103]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each review text with one aspect term.

    Every item holds the tokenizer encoding of the (text, aspect) pair, padded
    or truncated to exactly ``max_len`` tokens, together with the raw strings
    and, when labels were supplied, the label as a tensor.

    Args:
        text: Indexable sequence of review strings.
        aspect: Indexable sequence of aspect strings, aligned with ``text``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer).
        max_len: Fixed token length of every encoded pair.
        labels: Optional indexable sequence of integer class ids aligned with
            ``text``. Leave as ``None`` for unlabeled (inference) data.
    """

    def __init__(self, text, aspect, tokenizer, max_len, labels=None):
        self.text = text
        self.aspect = aspect
        self.label = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self) -> int:
        """Return the number of (text, aspect) pairs."""
        return len(self.text)

    def __getitem__(self, idx) -> dict:
        """Encode the pair at ``idx``.

        Returns:
            A dict with keys ``'texts-aspect'`` (the raw strings),
            ``'input_ids'`` and ``'attention_mask'`` (1-D tensors of length
            ``max_len``) and, only when labels were given, ``'labels'``
            (a scalar long tensor).
        """
        texts = self.text[idx]
        aspects = self.aspect[idx]

        encodings = self.tokenizer.encode_plus(
            texts,
            aspects,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length` is deprecated; `padding`/`truncation` are the
            # supported spelling. Truncation guarantees every item has the same
            # length, which the default collate function needs to stack a batch.
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        item = {
            # The original literal listed this key twice; only the later (dict)
            # value ever survived, so that is the one kept.
            'texts-aspect': {'texts': texts, 'aspects': aspects},
            # return_tensors='pt' yields shape (1, max_len); drop the leading
            # axis so the DataLoader produces (batch, max_len) batches.
            'input_ids': encodings['input_ids'].squeeze(0),
            'attention_mask': encodings['attention_mask'].squeeze(0),
        }

        # Labels are optional so the same class can serve unlabeled data.
        if self.label is not None:
            item['labels'] = torch.tensor(self.label[idx], dtype=torch.long)

        return item

In [100]:
def dataloader(df, tokenizer, max_len, batch_size, shuffle=True):
    """Wrap a dataframe of (text, aspect, label) rows in a batched DataLoader.

    Args:
        df: DataFrame with 'text', 'aspect' and 'label' columns.
        tokenizer: Tokenizer handed to ``Dataset`` for encoding each pair.
        max_len: Token length handed to ``Dataset``.
        batch_size: Number of items per batch.
        shuffle: Whether to reshuffle the rows every epoch. Defaults to
            ``True`` (the previous hard-coded behaviour); pass ``False`` for
            validation or inference loaders where a stable order is wanted.

    Returns:
        A ``torch.utils.data.DataLoader`` over the wrapped rows.
    """
    # Convert the columns to plain arrays so items are fetched by position,
    # independent of whatever index the (possibly shuffled) frame carries.
    dataset = Dataset(
        text=df['text'].to_numpy(),
        aspect=df['aspect'].to_numpy(),
        labels=df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [9]:
# Shuffle with a fixed seed so the train/validation split is identical after
# every kernel restart; without it each run trains and validates on different rows.
shuffled_data = data.sample(frac=1, random_state=42)
# First 3500 shuffled rows train the model, the remainder is held out for validation.
train_data, val_data = shuffled_data.iloc[:3500], shuffled_data.iloc[3500:]

In [107]:
# Load the tokenizer that matches the 'distilbert-base-uncased' checkpoint used
# for the encoder later in this notebook.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [108]:
# NOTE(review): this import sits mid-notebook; moving it to the first import
# cell would keep all dependencies visible in one place.
import joblib

# Serialise the tokenizer object to 'tokenizer.joblib' in the working directory.
# NOTE(review): joblib pickles the Python object, so loading it presumably needs
# a compatible transformers version; the library's own `save_pretrained` is the
# documented persistence route. Confirm what the consuming code expects before changing.
joblib.dump(tokenizer,'tokenizer.joblib')

['tokenizer.joblib']

In [109]:
# DistilBERT's position-embedding table has 512 rows (see the printed model:
# `position_embeddings: Embedding(512, 768)`), so sequences padded to the former
# length of 1500 index past it in the encoder. 512 is the longest usable length.
MAX_LEN = 512
BATCH_SIZE = 32

train_dataloader = dataloader(train_data, tokenizer, MAX_LEN, BATCH_SIZE)
val_dataloader = dataloader(val_data, tokenizer, MAX_LEN, BATCH_SIZE)

In [120]:
# Load the pretrained DistilBERT encoder. The `Model` class defined later in
# this notebook reads this module-level name inside its __init__.
BertModel = DistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [122]:
# Serialise the encoder object to 'distillbert.joblib' (joblib is imported in
# the tokenizer-saving cell above).
# NOTE(review): this pickles the whole module object; `save_pretrained` is the
# library's documented way to persist a model. Confirm what the loading side expects.
joblib.dump(BertModel,'distillbert.joblib')

['distillbert.joblib']

In [126]:
class Model(torch.nn.Module):
    """DistilBERT encoder followed by a two-layer classification head.

    The hidden state of the first token of each sequence is passed through
    dropout, a 128-unit ReLU layer, dropout again, and a final linear layer;
    the output is log-probabilities over ``n_classes``.

    Args:
        n_classes: Number of output classes.
    """

    def __init__(self, n_classes):
        super().__init__()
        # Uses the module-level pretrained encoder, so every Model instance
        # created in this session shares (and fine-tunes) the same weights.
        self.distillbert = BertModel
        self.dropout = torch.nn.Dropout(p=0.2)
        self.h1 = torch.nn.Linear(self.distillbert.config.hidden_size, 128)
        self.output = torch.nn.Linear(128, n_classes)

    def forward(self, input_ids, attention_mask) -> torch.Tensor:
        """Return log-probabilities of shape (batch, n_classes).

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Mask passed straight through to the encoder.
        """
        encoder_out = self.distillbert(input_ids=input_ids, attention_mask=attention_mask)
        # DistilBertModel has no pooler: its first (and only default) output is
        # the last hidden state, so unpacking two values fails. Index 0 works for
        # both tuple and ModelOutput returns; position 0 along the sequence axis
        # is the first token, used here as the sequence summary.
        x = encoder_out[0][:, 0]
        x = self.dropout(x)
        x = torch.nn.functional.relu(self.h1(x))
        x = self.dropout(x)
        # log_softmax output: pair with NLLLoss (not CrossEntropyLoss) downstream.
        x = torch.nn.functional.log_softmax(self.output(x), dim=1)

        return x

In [127]:
# Three output classes, matching the label values 0, 1 and 2 seen in the data.
model = Model(n_classes=3)

In [128]:
# `parameters` is referenced without being called, so the cell displays the
# bound method's repr, which embeds the printed module tree of `model`.
model.parameters

<bound method Module.parameters of Model(
  (distillbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 