# Sentiment Analysis with Deep Learning using BERT

## Exploratory Data Analysis and Preprocessing

In [3]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [4]:
df = pd.read_csv('train.csv',encoding='unicode_escape')

In [5]:
df = df[['text', 'sentiment']]

In [6]:
df

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive


In [7]:
df.isna().sum()

text         1
sentiment    0
dtype: int64

In [8]:
df.dropna(inplace=True)
df.isna().sum()

text         0
sentiment    0
dtype: int64

In [9]:
df.columns

Index(['text', 'sentiment'], dtype='object')

In [10]:
df

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive


In [11]:
import re
def normalize_text(text):
    if isinstance(text, str):  # 텍스트가 문자열인지 확인
        text = text.lower()  # 모든 문자를 소문자로 변환
        text = re.sub(r'[^\w\s]', '', text)  # 단어 문자와 공백을 제외한 모든 문자 제거
        text = re.sub(r'\s+', ' ', text).strip()  # 중복된 공백 제거 및 양 끝 공백 제거
    else:
        text = str(text)  # 문자열이 아닌 경우 문자열로 변환
    return text
df['text'] = df['text'].apply(normalize_text)

In [12]:
df

Unnamed: 0,text,sentiment
0,id have responded if i were going,neutral
1,sooo sad i will miss you here in san diego,negative
2,my boss is bullying me,negative
3,what interview leave me alone,negative
4,sons of why couldnt they put them on the relea...,negative
...,...,...
27476,wish we could come see u on denver husband los...,negative
27477,ive wondered about rake to the client has made...,negative
27478,yay good for both of you enjoy the break you p...,positive
27479,but it was worth it,positive


In [13]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/20223203/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
def remove_stopwords(text):
    if isinstance(text, str):
        words = text.split()
        filtered_words = [word for word in words if word.lower() not in stopwords.words('english')]
        filtered_text = ' '.join(filtered_words)
    else:
        filtered_text = ''
    return filtered_text
df['text'] = df['text'].apply(remove_stopwords)

In [16]:
df

Unnamed: 0,text,sentiment
0,id responded going,neutral
1,sooo sad miss san diego,negative
2,boss bullying,negative
3,interview leave alone,negative
4,sons couldnt put releases already bought,negative
...,...,...
27476,wish could come see u denver husband lost job ...,negative
27477,ive wondered rake client made clear net dont f...,negative
27478,yay good enjoy break probably need hectic week...,positive
27479,worth,positive


In [17]:
possible_labels = df.sentiment.unique()
possible_labels

array(['neutral', 'negative', 'positive'], dtype=object)

In [18]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

label_dict

{'neutral': 0, 'negative': 1, 'positive': 2}

In [27]:
# df['label'] = df.sentiment.replace(label_dict)
df_test['label'] = df_test['sentiment'].replace(label_dict)
df_test['label'] = df_test['label'].infer_objects(copy=False)

  df_test['label'] = df_test['sentiment'].replace(label_dict)


In [29]:
df

Unnamed: 0,text,sentiment,label
0,id responded going,neutral,0
1,sooo sad miss san diego,negative,1
2,boss bullying,negative,1
3,interview leave alone,negative,1
4,sons couldnt put releases already bought,negative,1
...,...,...,...
27476,wish could come see u denver husband lost job ...,negative,1
27477,ive wondered rake client made clear net dont f...,negative,1
27478,yay good enjoy break probably need hectic week...,positive,2
27479,worth,positive,2


 test dataset도 동일하게 전처리

In [21]:
df_test = pd.read_csv('test.csv',encoding='unicode_escape')

In [22]:
# 텍스트 열과 감정 열에서 NaN 값 확인
print(df_test['text'].isna().sum())       # text 열의 NaN 개수 확인
print(df_test['sentiment'].isna().sum())  # sentiment 열의 NaN 개수 확인

1281
1281


In [23]:
# NaN 값 삭제
df_test.dropna(subset=['text', 'sentiment'], inplace=True)

# 텍스트 열이 비어 있는 행 삭제
df_test = df_test[df_test['text'].str.strip() != '']

# 텍스트 데이터 전처리
df_test['text'] = df_test['text'].apply(normalize_text)
df_test['text'] = df_test['text'].apply(remove_stopwords)


# 필요한 열만 유지
df_test = df_test[['text', 'sentiment']]

#레이블 변환
df_test['label'] = df_test.sentiment.replace(label_dict)

  df_test['label'] = df_test.sentiment.replace(label_dict)


In [24]:
df_test

Unnamed: 0,text,sentiment,label
0,last session day httptwitpiccom67ezh,neutral,0
1,shanghai also really exciting precisely skyscr...,positive,2
2,recession hit veronique branquinho quit compan...,negative,1
3,happy bday,positive,2
4,httptwitpiccom4w75p like,positive,2
...,...,...,...
3529,3 im tired cant sleep try,negative,1
3530,alone old house thanks net keeps alive kicking...,positive,2
3531,know mean little dog sinking depression wants ...,negative,1
3532,_sutra next youtube video gonna love videos,positive,2


## Loading Tokenizer and Encoding our Data

In [25]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [30]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

토큰화 및 데이터 인코딩

In [31]:
encoded_data_train = tokenizer.batch_encode_plus(
    df.text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_test = tokenizer.batch_encode_plus(
    df_test.text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df.label.values)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(df_test.label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [32]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

test_data = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [33]:
len(dataset_train)

27480

---------





## Setting up BERT Pretrained Model

In [34]:
from transformers import BertForSequenceClassification

In [35]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Creating Data Loaders

In [36]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [37]:
batch_size = 32

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_test = DataLoader(test_data, batch_size=batch_size, shuffle=True)

## Setting Up Optimiser and Scheduler

In [38]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [39]:
optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8)



In [40]:
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

## Defining our Performance Metrics

Accuracy metric approach originally used in accuracy function in [this tutorial](https://mccormickml.com/2019/07/22/BERT-fine-tuning/#41-bertforsequenceclassification).

In [41]:
import numpy as np

In [42]:
from sklearn.metrics import f1_score

In [43]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [44]:
def accuracy_per_class_and_overall(preds, labels):
    # Invert the label dictionary to map labels to class names
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    # Flatten the predictions and labels
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    total_correct = 0
    total_count = 0

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        class_correct = len(y_preds[y_preds == label])
        class_total = len(y_true)
        total_correct += class_correct
        total_count += class_total

        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {class_correct}/{class_total} ({class_correct/class_total:.2%})\n')

    # Calculate overall accuracy
    overall_accuracy = total_correct / total_count
    print(f'Overall Accuracy: {total_correct}/{total_count} ({overall_accuracy:.2%})\n')

    return overall_accuracy

## Creating our Training Loop

Approach adapted from an older version of HuggingFace's `run_glue.py` script. Accessible [here](https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128).

In [45]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [46]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [47]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [48]:
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})


    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/859 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.7098664466174573


Epoch 2:   0%|          | 0/859 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.562000338495541


Epoch 3:   0%|          | 0/859 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.49661810519184585


Epoch 4:   0%|          | 0/859 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.44395392925865573


Epoch 5:   0%|          | 0/859 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.4060633751250396


In [53]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [54]:
model.load_state_dict(torch.load('finetuned_BERT_epoch_5.model', map_location=torch.device('cpu')))

<All keys matched successfully>

In [55]:
_, predictions, true_test = evaluate(dataloader_test)

In [57]:
accuracy_per_class_and_overall(predictions, true_test)

Class: neutral
Accuracy: 1045/1430 (73.08%)

Class: negative
Accuracy: 759/1001 (75.82%)

Class: positive
Accuracy: 885/1103 (80.24%)

Overall Accuracy: 2689/3534 (76.09%)



0.7608941709111489

In [64]:
# 에폭별로 모델을 로드하여 테스트
for i in range(4):
    epoch = i+1
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

    model.to(device)
    model.load_state_dict(torch.load(f'finetuned_BERT_epoch_{epoch}.model', map_location=torch.device('cpu')))
    print(f'{epoch} epoch results')
    _, predictions, true_test = evaluate(dataloader_test)
    accuracy_per_class_and_overall(predictions, true_test)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1 epoch results
Class: neutral
Accuracy: 1057/1430 (73.92%)

Class: negative
Accuracy: 695/1001 (69.43%)

Class: positive
Accuracy: 908/1103 (82.32%)

Overall Accuracy: 2660/3534 (75.27%)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2 epoch results
Class: neutral
Accuracy: 1029/1430 (71.96%)

Class: negative
Accuracy: 770/1001 (76.92%)

Class: positive
Accuracy: 885/1103 (80.24%)

Overall Accuracy: 2684/3534 (75.95%)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3 epoch results
Class: neutral
Accuracy: 1008/1430 (70.49%)

Class: negative
Accuracy: 799/1001 (79.82%)

Class: positive
Accuracy: 872/1103 (79.06%)

Overall Accuracy: 2679/3534 (75.81%)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4 epoch results
Class: neutral
Accuracy: 1047/1430 (73.22%)

Class: negative
Accuracy: 770/1001 (76.92%)

Class: positive
Accuracy: 870/1103 (78.88%)

Overall Accuracy: 2687/3534 (76.03%)

