In [None]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from modules import *

import warnings
warnings.filterwarnings("ignore")

In [13]:
X,y=load_data('./data/labeledTrainData.tsv',colname=['review','sentiment'])
X=X.iloc[:, -1].apply(preprocess_data)

************** Loading Data ************


Summary of data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB
No of Rows: 25000
No of Columns: 3

Data View: Last 3 Instances

              id  sentiment                                             review
24997  "10905_3"          0  "Guy is a loser. Can't get girls, needs to bui...
24998  "10194_3"          0  "This 30 minute documentary Buñuel made in the...
24999   "8478_8"          1  "I saw this movie as a child and it broke my h...

Class Counts(label, row): Total
1    12500
0    12500
Name: sentiment, dtype: int64

Data View: First 5 Instances

         id  sentiment                                             review
0  "5814_8"   

In [16]:
import torch
from torch import nn
from gensim.models import Word2Vec

class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path="./model/word2vec_train_data.model"):
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []

    def get_w2v_model(self):
        # Load the previously trained word2vec model
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size

    # Add "<PAD>" or "<UNK>" to the embedding
    def add_embedding(self, word):
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)

    def make_embedding(self, load=True):
        print("Get embedding,loading word2vec model ...")
        self.get_w2v_model()
        # 製作一個 word2idx 的 dictionary
        # 製作一個 idx2word 的 list
        # 製作一個 word2vector 的 list
        for i, word in enumerate(self.embedding.wv.vocab):
            print('get words #{}'.format(i+1), end='\r')
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding[word])
        print('')
        self.embedding_matrix = torch.tensor(self.embedding_matrix)

        # Add "<PAD>" and "<UNK>" to the embedding
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix

    # Make each sentence the same length
    def pad_sequence(self, sentence):
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == self.sen_len
        return sentence

    def sentence_word2idx(self):
        # Convert the words in the sentence to the corresponding index
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            print('sentence count #{}'.format(i+1), end='\r')
            sentence_idx = []
            for word in sen:
                if (word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            # Make each sentence the same length
            sentence_idx = self.pad_sequence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    def labels_to_tensor(self, y):
        # labels --> tensor
        y = [int(label) for label in y]

        return torch.LongTensor(y)

## Dataset

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.data = X
        self.label = y
    def __getitem__(self, idx):
        if self.label is None: return self.data[idx]
        return self.data[idx], self.label[idx]
    def __len__(self):
        return len(self.data)

## Model

In [None]:
class LSTM_Net(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.2):
        super(LSTM_Net, self).__init__()
        # embedding layer
        self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
        # Initialize the embedding using word2vec
        self.embedding.weight = torch.nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False
        self.embedding_dim = embedding.size(1)

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential( nn.Dropout(dropout),
                                         nn.Linear(hidden_dim, 1),
                                         nn.Sigmoid() )
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs, None)
        # x 的 dimension (batch, seq_len, hidden_size)
        # 取用 LSTM 最後一層的 hidden state
        x = x[:, -1, :]
        x = self.classifier(x)
        return x

## Train

In [14]:
from tqdm import tqdm
from torch import optim

def evaluation(outputs, labels):
    #outputs => probability (float)
    #labels => labels
    outputs[outputs>=0.5] = 1
    outputs[outputs<0.5] = 0
    correct = torch.sum(torch.eq(outputs, labels)).item()
    return correct

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

preprocess = Preprocess(X, sen_len=200, w2v_path='./model/word2vec_train_data.model')
embedding = preprocess.make_embedding(load=True)
X = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)

split_num = int(len(X)*0.8)
X_train, X_val, y_train, y_val = X[:split_num], X[split_num:], y[:split_num], y[split_num:]

# dataloader
train_dataset = MyDataset(X=X_train, y=y_train)
val_dataset = MyDataset(X=X_val, y=y_val)


model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=250, num_layers=1, dropout=0.4)


batch=128
train_loader=DataLoader(dataset = train_dataset,batch_size =batch,shuffle = True,num_workers = 0)
val_loader=DataLoader(dataset = val_dataset,batch_size =batch,shuffle = True,num_workers = 0)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model=model.cuda()
best_acc=0
for epoch in range(30):
    model.train()
    total_loss=0
    for i,(inputs, labels) in tqdm(enumerate(train_loader)):
         optimizer.zero_grad()
         inputs = inputs.to(device, dtype=torch.long)
         labels = labels.to(device, dtype=torch.float)
         outputs = model(inputs)
         outputs = outputs.squeeze()
         loss = criterion(outputs, labels)
         loss.backward()
         optimizer.step()
         total_loss += loss.item()

    print(f'epoch{epoch} train ','Loss:{:.5f}'.format(total_loss/len(train_loader)))

    #eval
    model.eval()
    with torch.no_grad():
        total_loss, total_acc = 0, 0
        for i, (inputs, labels) in tqdm(enumerate(val_loader)):
            inputs = inputs.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            correct = evaluation(outputs, labels)
            total_acc += (correct / batch)
            total_loss += loss.item()
        print(f'epoch{epoch} eval ','Loss:{:.5f}'.format(total_loss/len(val_loader)),'acc:{:.5f}'.format(total_acc/len(val_loader)))
        if total_acc/len(val_loader)>best_acc:
            best_acc=total_acc/len(val_loader)
            print('save')
            torch.save(model, "model/LSTM.model")

Get embedding,loading word2vec model ...
get words #21353
total words: 21355
sentence count #25000

157it [00:08, 19.44it/s]


epoch0 train  Loss:0.69390


40it [00:00, 57.38it/s]


epoch0 eval  Loss:0.69181 acc:0.50898
save


157it [00:07, 19.91it/s]


epoch1 train  Loss:0.69021


40it [00:00, 56.81it/s]


epoch1 eval  Loss:0.68911 acc:0.50508


157it [00:08, 19.52it/s]


epoch2 train  Loss:0.68801


40it [00:00, 55.19it/s]


epoch2 eval  Loss:0.68474 acc:0.51562
save


157it [00:07, 19.71it/s]


epoch3 train  Loss:0.68316


40it [00:00, 56.36it/s]


epoch3 eval  Loss:0.69686 acc:0.49375


157it [00:07, 19.70it/s]


epoch4 train  Loss:0.69068


40it [00:00, 54.53it/s]


epoch4 eval  Loss:0.69640 acc:0.48281


157it [00:08, 19.57it/s]


epoch5 train  Loss:0.69147


40it [00:00, 55.70it/s]


epoch5 eval  Loss:0.69150 acc:0.49883


157it [00:08, 19.55it/s]


epoch6 train  Loss:0.68823


40it [00:00, 54.92it/s]


epoch6 eval  Loss:0.69156 acc:0.50293


157it [00:08, 19.44it/s]


epoch7 train  Loss:0.69041


40it [00:00, 55.40it/s]


epoch7 eval  Loss:0.69097 acc:0.50371


157it [00:08, 19.34it/s]


epoch8 train  Loss:0.68821


40it [00:00, 55.32it/s]


epoch8 eval  Loss:0.68051 acc:0.51113


157it [00:08, 19.38it/s]


epoch9 train  Loss:0.68793


40it [00:00, 55.71it/s]


epoch9 eval  Loss:0.69292 acc:0.49785


157it [00:08, 19.34it/s]


epoch10 train  Loss:0.69236


40it [00:00, 54.34it/s]


epoch10 eval  Loss:0.68958 acc:0.49062


157it [00:08, 19.25it/s]


epoch11 train  Loss:0.68206


40it [00:00, 54.87it/s]


epoch11 eval  Loss:0.61951 acc:0.73281
save


157it [00:08, 19.20it/s]


epoch12 train  Loss:0.69775


40it [00:00, 54.71it/s]


epoch12 eval  Loss:0.68999 acc:0.50918


157it [00:08, 19.11it/s]


epoch13 train  Loss:0.66671


40it [00:00, 55.04it/s]


epoch13 eval  Loss:0.56573 acc:0.73574
save


157it [00:08, 19.06it/s]


epoch14 train  Loss:0.56236


40it [00:00, 54.89it/s]


epoch14 eval  Loss:0.44720 acc:0.80059
save


157it [00:08, 18.93it/s]


epoch15 train  Loss:0.37751


40it [00:00, 50.80it/s]


epoch15 eval  Loss:0.31776 acc:0.85313
save


157it [00:08, 19.05it/s]


epoch16 train  Loss:0.30897


40it [00:00, 54.64it/s]


epoch16 eval  Loss:0.29147 acc:0.86191
save


157it [00:08, 18.94it/s]


epoch17 train  Loss:0.29130


40it [00:00, 52.77it/s]


epoch17 eval  Loss:0.29135 acc:0.86387
save


157it [00:08, 18.87it/s]


epoch18 train  Loss:0.27727


40it [00:00, 54.13it/s]


epoch18 eval  Loss:0.28570 acc:0.86504
save


157it [00:08, 18.87it/s]


epoch19 train  Loss:0.26312


40it [00:00, 53.78it/s]


epoch19 eval  Loss:0.28093 acc:0.86582
save


157it [00:08, 18.92it/s]


epoch20 train  Loss:0.25151


40it [00:00, 53.05it/s]


epoch20 eval  Loss:0.30123 acc:0.85566


157it [00:08, 18.79it/s]


epoch21 train  Loss:0.24547


40it [00:00, 53.90it/s]


epoch21 eval  Loss:0.32212 acc:0.84414


157it [00:08, 18.81it/s]


epoch22 train  Loss:0.23792


40it [00:00, 54.35it/s]


epoch22 eval  Loss:0.30768 acc:0.86523


157it [00:08, 18.82it/s]


epoch23 train  Loss:0.22989


40it [00:00, 53.21it/s]


epoch23 eval  Loss:0.29044 acc:0.86562


157it [00:08, 18.78it/s]


epoch24 train  Loss:0.20672


40it [00:00, 53.62it/s]


epoch24 eval  Loss:0.28864 acc:0.86172


157it [00:08, 18.82it/s]


epoch25 train  Loss:0.19460


40it [00:00, 53.62it/s]


epoch25 eval  Loss:0.30705 acc:0.86602
save


157it [00:08, 18.86it/s]


epoch26 train  Loss:0.17160


40it [00:00, 53.98it/s]


epoch26 eval  Loss:0.35446 acc:0.84785


157it [00:08, 18.83it/s]


epoch27 train  Loss:0.15798


40it [00:00, 53.91it/s]


epoch27 eval  Loss:0.31890 acc:0.85859


157it [00:09, 16.22it/s]


epoch28 train  Loss:0.13695


40it [00:00, 53.91it/s]


epoch28 eval  Loss:0.39633 acc:0.85273


157it [00:08, 18.66it/s]


epoch29 train  Loss:0.11417


40it [00:00, 53.55it/s]

epoch29 eval  Loss:0.40642 acc:0.85313





In [18]:
import os

## test
test_data=pd.read_csv("data/testData.tsv",sep = "\t")
X = test_data['review'].apply(preprocess_data)
X = list(X)
preprocess = Preprocess(X, sen_len=200)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = MyDataset(X=test_x, y=None)
test_loader = DataLoader(dataset = test_dataset,batch_size =batch,shuffle = False,num_workers = 0)

print('\nload model ...')
model = torch.load(os.path.join('model', 'LSTM.model'))
model.eval()
ret_output = []
with torch.no_grad():
    for i, inputs in enumerate(test_loader):
        inputs = inputs.to(device, dtype=torch.long)
        outputs = model(inputs)
        outputs = outputs.squeeze()
        ret_output += outputs.float().tolist()
tmp = pd.DataFrame({"id":test_data['id'],"sentiment":ret_output})
print("save csv ...")
tmp.to_csv(os.path.join('out', 'LSTM_all.csv'), index=False)
print("Finish Predicting")

Get embedding,loading word2vec model ...
get words #21353
total words: 21355
sentence count #25000
load model ...
save csv ...
Finish Predicting
