https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis

In [1]:
!pip install numpy pandas scikit-learn transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

## 讀取資料

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf


In [4]:
df_train = pd.read_csv("twitter_training.csv",  header=None, names=['id','entity','sentiment','text'])
df_train

Unnamed: 0,id,entity,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [5]:
df_train = df_train.drop(['id', 'entity'], axis=1)
df_train = df_train.dropna()
df_train

Unnamed: 0,sentiment,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
74677,Positive,Just realized that the Windows partition of my...
74678,Positive,Just realized that my Mac window partition is ...
74679,Positive,Just realized the windows partition of my Mac ...
74680,Positive,Just realized between the windows partition of...


In [6]:
df_train_POS = df_train[df_train["sentiment"] == "Positive"]
df_train_NEG = df_train[df_train["sentiment"] == "Negative"]
df_train_NEU = df_train[df_train["sentiment"] == "Neutral"]

In [7]:
df_train_POS

Unnamed: 0,sentiment,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
74677,Positive,Just realized that the Windows partition of my...
74678,Positive,Just realized that my Mac window partition is ...
74679,Positive,Just realized the windows partition of my Mac ...
74680,Positive,Just realized between the windows partition of...


In [8]:
df_train_NEG

Unnamed: 0,sentiment,text
24,Negative,the biggest dissappoinment in my life came out...
25,Negative,The biggest disappointment of my life came a y...
26,Negative,The biggest disappointment of my life came a y...
27,Negative,the biggest dissappoinment in my life coming o...
28,Negative,For the biggest male dissappoinment in my life...
...,...,...
74665,Negative,Nvidia really delayed the 3070 by 2 weeks.
74666,Negative,Nvidia did delay by 3070 2 weeks.
74667,Negative,Nvidia really delayed the 3070 several weeks.
74668,Negative,Nvidia really only delayed the 3070 2 flight w...


In [9]:
df_train_NEU

Unnamed: 0,sentiment,text
12,Neutral,"Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM..."
13,Neutral,"Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM..."
14,Neutral,"Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM..."
15,Neutral,"Rock-Hard La Vita, RARE BUT POWERFUL, HANDSOME..."
16,Neutral,"Live Rock - Hard music La la Varlope, RARE & t..."
...,...,...
74659,Neutral,"Nvidia plans to release its 2017 ""Crypto Craze..."
74660,Neutral,"Nvidia does not want to give up its ""cryptoins..."
74661,Neutral,Nvidia doesn’t intend to give away its 2017 ad...
74662,Neutral,Nvidia therefore doesn ’ t want to give up its...


## 對每個情感類別進行取樣

In [10]:
df_train_POS = df_train_POS.sample(n=min(1000, len(df_train_POS)), random_state=42)
df_train_NEG = df_train_NEG.sample(n=min(1000, len(df_train_NEG)), random_state=42)
df_train_NEU = df_train_NEU.sample(n=min(2000, len(df_train_NEU)), random_state=42)

In [11]:
df_train = pd.concat([df_train_POS, df_train_NEG, df_train_NEU])
df_train

Unnamed: 0,sentiment,text
7614,Positive,"My birthday consisted of crab legs, watching A..."
54738,Positive,An excellent thread
822,Positive,. LIVE NOW!. . I feel like I have not streame...
27356,Positive,Procrastination with punk gets a bit ridiculou...
38029,Positive,Did you guys play Herdstein? because I'm obses...
...,...,...
39612,Neutral,People eat eight spiders in their sleep per ye...
67700,Neutral,Join Johnson & Johnson Healthy Essentials Rewa...
36459,Neutral,Slack files EU competition complaint against H...
52910,Neutral,. LIVE NOW


## 文本數據和標籤分開

In [12]:
X = df_train['text']
y = df_train['sentiment']

## 標籤進行one-hot編碼


In [13]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)

# 將y轉換為numpy並調整形狀
y_reshaped = y.values.reshape(-1, 1)

# 對標籤進行one hot編碼
onehot_encoded = encoder.fit_transform(y_reshaped)
onehot_encoded



array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

## 分割訓練集和測試集

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, onehot_encoded, test_size=0.2, random_state=42)


## Callback function

In [15]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [16]:
check_point = tf.keras.callbacks.ModelCheckpoint(
    '/content/checkpoint.keras',  # 指定檔案路徑和檔名
    monitor="val_loss",
    verbose=0,
    save_best_only=False,
    save_weights_only=False,
    mode="auto",
    save_freq="epoch"
)

In [17]:
def scheduler(epoch, lr):
    if epoch < 12:
        return float(lr)
    else:
        return float(lr * tf.math.exp(-0.1))

In [18]:
LR_Scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

# MODEL

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [20]:
class TextDataset(Dataset):

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts # 文本數據列表
        self.labels = labels # 標籤列表
        self.tokenizer = tokenizer # 文本數據的分詞器
        self.max_len = max_len # 文本分詞後的最大長度

    def __len__(self):
        return len(self.texts)


    def __getitem__(self, idx):
        text = self.texts[idx] # 索取索引[idx]的文本
        label = self.labels[idx] # 索取索引[idx]的標籤
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,  # 原始文本
            'input_ids': encoding['input_ids'].flatten(),  # 編碼後的input_ids
            'attention_mask': encoding['attention_mask'].flatten(),  # 編碼後的attention mask
            'label': torch.tensor(label, dtype=torch.float)  # 將標籤轉換為浮點型張量
        }

In [21]:
def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    ds = TextDataset(
        texts=texts,
        labels=labels,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, num_workers=4)

# 定義批次大小和文本的最大長度
BATCH_SIZE = 16
MAX_LEN = 128

# 使用create_data_loader函數創建訓練數據加載器
train_data_loader = create_data_loader(X_train.to_numpy(), y_train, tokenizer, MAX_LEN, BATCH_SIZE)

# 使用create_data_loader函數創建測試數據加載器
test_data_loader = create_data_loader(X_test.to_numpy(), y_test, tokenizer, MAX_LEN, BATCH_SIZE)




In [22]:
# 定義模型
class SentimentClassifier(nn.Module):

    def __init__(self, bert_model):
        super(SentimentClassifier, self).__init__()
        self.bert_model = bert_model  # 預訓練的BERT模型
        self.drop = nn.Dropout(p=0.3)  # dropout層，防止過擬合
        self.fc1 = nn.Linear(3072, 128)  # 全連接層，輸入大小3072，輸出大小128
        self.fc2 = nn.Linear(128, 3)  # 全連接層，輸入大小128，輸出大小3（對應3個類別）
        self.softmax = nn.Softmax(dim=1)  # softmax層，用於多類分類

    def forward(self, input_ids, attention_mask):
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.hidden_states[-4:]  # 獲取最後四層隱藏狀態
        concat_hidden_states = torch.cat(hidden_states, dim=-1)  # 將隱藏狀態在最後一個維度上拼接
        pooled_output = torch.mean(concat_hidden_states, dim=1)  # 對隱藏狀態取平均，得到池化輸出
        x = self.drop(pooled_output)  # 應用dropout
        x = self.fc1(x)  # 通過第一個全連接層
        x = torch.relu(x)  # 使用ReLU激活函數
        x = self.fc2(x)  # 通過第二個全連接層
        return self.softmax(x)  # 使用softmax得到分類概率

# 初始化模型
model = SentimentClassifier(bert_model)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss().to('cuda' if torch.cuda.is_available() else 'cpu')


In [23]:
# 訓練模型

def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['label'].to(device)

        # 前向傳播
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # 獲取預測結果
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, torch.max(labels, 1)[1])

        # 計算正確預測的數量
        correct_predictions += torch.sum(preds == torch.max(labels, 1)[1])
        losses.append(loss.item())

        # 反向傳播和優化
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # 返回訓練準確率和平均損失
    return correct_predictions.double() / n_examples, np.mean(losses)

In [24]:
# 評估模型
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['label'].to(device)

            # 前向傳播
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # 獲取預測結果
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, torch.max(labels, 1)[1])

            # 計算正確預測的數量
            correct_predictions += torch.sum(preds == torch.max(labels, 1)[1])
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)


In [25]:
EPOCHS = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 訓練和評估模型
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        len(X_train)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        test_data_loader,
        loss_fn,
        device,
        len(X_test)
    )

    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()


Epoch 1/3
----------


  self.pid = os.fork()


Train loss 0.9458254635334015 accuracy 0.5821875
Val loss 0.8845552742481232 accuracy 0.67375

Epoch 2/3
----------
Train loss 0.8020833551883697 accuracy 0.7471875
Val loss 0.8522906434535981 accuracy 0.68625

Epoch 3/3
----------
Train loss 0.7331362965703011 accuracy 0.815625
Val loss 0.8508601212501525 accuracy 0.6875



In [26]:
# 預測和評估模型
y_pred = []
y_true = []


model = model.eval()

with torch.no_grad():
    for data in test_data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)

        y_pred.extend(preds.cpu().numpy())
        y_true.extend(torch.max(labels, 1)[1].cpu().numpy())

print(classification_report(y_true, y_pred, target_names=encoder.categories_[0]))

              precision    recall  f1-score   support

    Negative       0.69      0.66      0.67       213
     Neutral       0.71      0.77      0.74       378
    Positive       0.64      0.57      0.60       209

    accuracy                           0.69       800
   macro avg       0.68      0.67      0.67       800
weighted avg       0.69      0.69      0.69       800

