In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from modules import *

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\QYH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\QYH\AppD

In [8]:
review1=list(pd.read_csv("data/labeledTrainData.tsv",sep = "\t")['review'].apply(lambda x: x.strip().split(" ")))
review2=list(pd.read_csv("data/testData.tsv",sep = "\t")['review'].apply(lambda x: x.strip().split(" ")))

In [10]:
review3=list(pd.read_csv("data/unlabeledTrainData.tsv",sep = "\t",error_bad_lines=False)['review'].apply(lambda x: x.strip().split(" ")))

b'Skipping line 43043: expected 2 fields, saw 3\n'


In [11]:
import os
from gensim.models import Word2Vec
model = Word2Vec(review1+review2+review3, size=250, window=5, min_count=5, workers=12, iter=10, sg=1)
print("saving model ...")
#model.save(os.path.join(model, 'word2vec_all.model'))
model.save(os.path.join('model', 'word2vec_all.model'))

saving model ...


TypeError: expected str, bytes or os.PathLike object, not Word2Vec

In [13]:
import torch
from torch import nn

class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path=os.path.join('model', 'word2vec_all.model')):
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []

    def get_w2v_model(self):
        # Load the previously trained word2vec model
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size

    # Add "<PAD>" or "<UNK>" to the embedding
    def add_embedding(self, word):
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)

    def make_embedding(self, load=True):
        print("Get embedding,loading word2vec model ...")
        self.get_w2v_model()
        # 製作一個 word2idx 的 dictionary
        # 製作一個 idx2word 的 list
        # 製作一個 word2vector 的 list
        for i, word in enumerate(self.embedding.wv.vocab):
            print('get words #{}'.format(i+1), end='\r')
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding[word])
        print('')
        self.embedding_matrix = torch.tensor(self.embedding_matrix)

        # Add "<PAD>" and "<UNK>" to the embedding
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix

    # Make each sentence the same length
    def pad_sequence(self, sentence):
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == self.sen_len
        return sentence

    def sentence_word2idx(self):
        # Convert the words in the sentence to the corresponding index
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            print('sentence count #{}'.format(i+1), end='\r')
            sentence_idx = []
            for word in sen:
                if (word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            # Make each sentence the same length
            sentence_idx = self.pad_sequence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    def labels_to_tensor(self, y):
        # labels --> tensor
        y = [int(label) for label in y]

        return torch.LongTensor(y)

## Dataset

In [33]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.data = X
        self.label = y
    def __getitem__(self, idx):
        if self.label is None: return self.data[idx]
        return self.data[idx], self.label[idx]
    def __len__(self):
        return len(self.data)

## Model

In [15]:
class LSTM_Net(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.2):
        super(LSTM_Net, self).__init__()
        # embedding layer
        self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
        # Initialize the embedding using word2vec
        self.embedding.weight = torch.nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False
        self.embedding_dim = embedding.size(1)

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential( nn.Dropout(dropout),
                                         nn.Linear(hidden_dim, 1),
                                         nn.Sigmoid() )
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs, None)
        # x 的 dimension (batch, seq_len, hidden_size)
        # 取用 LSTM 最後一層的 hidden state
        x = x[:, -1, :]
        x = self.classifier(x)
        return x

## Train

In [37]:
df= pd.read_csv("data/labeledTrainData.tsv",sep = "\t")
X = df['review'].apply(lambda x: x.strip().split(" "))
X = list(X)
y = df["sentiment"]
y = list(y)

In [38]:
from tqdm import tqdm
from torch import optim

def evaluation(outputs, labels):
    #outputs => probability (float)
    #labels => labels
    outputs[outputs>=0.5] = 1
    outputs[outputs<0.5] = 0
    correct = torch.sum(torch.eq(outputs, labels)).item()
    return correct

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


preprocess = Preprocess(X, sen_len=200, w2v_path=os.path.join('model', 'word2vec_all.model'))
embedding = preprocess.make_embedding(load=True)
X = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)

split_num = int(len(X)*0.8)
X_train, X_val, y_train, y_val = X[:split_num], X[split_num:], y[:split_num], y[split_num:]

# dataloader
train_dataset = MyDataset(X=X_train, y=y_train)
val_dataset = MyDataset(X=X_val, y=y_val)


model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=250, num_layers=1, dropout=0.4)


batch=128
train_loader=DataLoader(dataset = train_dataset,batch_size =batch,shuffle = True,num_workers = 0)
val_loader=DataLoader(dataset = val_dataset,batch_size =batch,shuffle = True,num_workers = 0)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model=model.cuda()
best_acc=0
for epoch in range(30):
    model.train()
    total_loss=0
    for i,(inputs, labels) in tqdm(enumerate(train_loader)):
         optimizer.zero_grad()
         inputs = inputs.to(device, dtype=torch.long)
         labels = labels.to(device, dtype=torch.float)
         outputs = model(inputs)
         outputs = outputs.squeeze()
         loss = criterion(outputs, labels)
         loss.backward()
         optimizer.step()
         total_loss += loss.item()

    print(f'epoch{epoch} train ','Loss:{:.5f}'.format(total_loss/len(train_loader)))

    #eval
    model.eval()
    with torch.no_grad():
        total_loss, total_acc = 0, 0
        for i, (inputs, labels) in tqdm(enumerate(val_loader)):
            inputs = inputs.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            correct = evaluation(outputs, labels)
            total_acc += (correct / batch)
            total_loss += loss.item()
        print(f'epoch{epoch} eval ','Loss:{:.5f}'.format(total_loss/len(val_loader)),'acc:{:.5f}'.format(total_acc/len(val_loader)))
        if total_acc/len(val_loader)>best_acc:
            best_acc=total_acc/len(val_loader)
            print('save')
            torch.save(model, "model/LSTM_all_data.model")

Get embedding,loading word2vec model ...
get words #121343
total words: 121345
sentence count #25000

157it [00:08, 19.05it/s]


epoch0 train  Loss:0.69125


40it [00:00, 57.21it/s]


epoch0 eval  Loss:0.69233 acc:0.49414
save


157it [00:08, 19.46it/s]


epoch1 train  Loss:0.68559


40it [00:00, 56.09it/s]


epoch1 eval  Loss:0.69258 acc:0.50313
save


157it [00:08, 19.24it/s]


epoch2 train  Loss:0.69332


40it [00:00, 56.25it/s]


epoch2 eval  Loss:0.69405 acc:0.50078


157it [00:07, 19.71it/s]


epoch3 train  Loss:0.69334


40it [00:00, 56.64it/s]


epoch3 eval  Loss:0.69262 acc:0.49316


157it [00:07, 19.74it/s]


epoch4 train  Loss:0.69046


40it [00:00, 56.41it/s]


epoch4 eval  Loss:0.68440 acc:0.51953
save


157it [00:08, 19.00it/s]


epoch5 train  Loss:0.69187


40it [00:00, 55.53it/s]


epoch5 eval  Loss:0.68345 acc:0.55195
save


157it [00:08, 19.11it/s]


epoch6 train  Loss:0.68318


40it [00:00, 54.71it/s]


epoch6 eval  Loss:0.68089 acc:0.53301


157it [00:08, 19.54it/s]


epoch7 train  Loss:0.68507


40it [00:00, 55.63it/s]


epoch7 eval  Loss:0.67301 acc:0.57441
save


157it [00:08, 19.19it/s]


epoch8 train  Loss:0.63069


40it [00:00, 55.02it/s]


epoch8 eval  Loss:0.45124 acc:0.79199
save


157it [00:08, 18.83it/s]


epoch9 train  Loss:0.60003


40it [00:00, 55.78it/s]


epoch9 eval  Loss:0.51941 acc:0.74258


157it [00:08, 19.39it/s]


epoch10 train  Loss:0.42220


40it [00:00, 55.16it/s]


epoch10 eval  Loss:0.40947 acc:0.80508
save


157it [00:08, 18.94it/s]


epoch11 train  Loss:0.38146


40it [00:00, 55.17it/s]


epoch11 eval  Loss:0.35922 acc:0.82852
save


157it [00:08, 19.11it/s]


epoch12 train  Loss:0.35195


40it [00:00, 55.24it/s]


epoch12 eval  Loss:0.34473 acc:0.83359
save


157it [00:08, 19.00it/s]


epoch13 train  Loss:0.32337


40it [00:00, 55.17it/s]


epoch13 eval  Loss:0.30513 acc:0.85078
save


157it [00:08, 18.96it/s]


epoch14 train  Loss:0.30523


40it [00:00, 54.77it/s]


epoch14 eval  Loss:0.33088 acc:0.83867


157it [00:08, 19.28it/s]


epoch15 train  Loss:0.29747


40it [00:00, 55.17it/s]


epoch15 eval  Loss:0.30296 acc:0.85391
save


157it [00:08, 18.82it/s]


epoch16 train  Loss:0.27831


40it [00:00, 54.79it/s]


epoch16 eval  Loss:0.29719 acc:0.85527
save


157it [00:08, 18.77it/s]


epoch17 train  Loss:0.27631


40it [00:00, 55.39it/s]


epoch17 eval  Loss:0.32063 acc:0.84688


157it [00:08, 19.11it/s]


epoch18 train  Loss:0.26101


40it [00:00, 54.13it/s]


epoch18 eval  Loss:0.29177 acc:0.85625
save


157it [00:08, 18.58it/s]


epoch19 train  Loss:0.25505


40it [00:00, 53.55it/s]


epoch19 eval  Loss:0.29225 acc:0.85449


157it [00:08, 19.11it/s]


epoch20 train  Loss:0.24442


40it [00:00, 54.12it/s]


epoch20 eval  Loss:0.30176 acc:0.85879
save


157it [00:08, 18.79it/s]


epoch21 train  Loss:0.22847


40it [00:00, 54.64it/s]


epoch21 eval  Loss:0.31361 acc:0.85156


157it [00:08, 19.00it/s]


epoch22 train  Loss:0.22017


40it [00:00, 53.76it/s]


epoch22 eval  Loss:0.30613 acc:0.85801


157it [00:08, 18.96it/s]


epoch23 train  Loss:0.20730


40it [00:00, 54.42it/s]


epoch23 eval  Loss:0.32652 acc:0.85273


157it [00:08, 19.02it/s]


epoch24 train  Loss:0.19683


40it [00:00, 54.42it/s]


epoch24 eval  Loss:0.35362 acc:0.85000


157it [00:08, 18.96it/s]


epoch25 train  Loss:0.18054


40it [00:00, 54.49it/s]


epoch25 eval  Loss:0.38449 acc:0.84512


157it [00:08, 18.96it/s]


epoch26 train  Loss:0.16862


40it [00:00, 54.05it/s]


epoch26 eval  Loss:0.36737 acc:0.84473


157it [00:08, 19.02it/s]


epoch27 train  Loss:0.15364


40it [00:00, 54.38it/s]


epoch27 eval  Loss:0.37971 acc:0.83848


157it [00:08, 18.99it/s]


epoch28 train  Loss:0.13722


40it [00:00, 54.34it/s]


epoch28 eval  Loss:0.41170 acc:0.83789


157it [00:08, 18.90it/s]


epoch29 train  Loss:0.12925


40it [00:00, 54.27it/s]

epoch29 eval  Loss:0.46033 acc:0.83496





In [42]:
## test
test_data=pd.read_csv("data/testData.tsv",sep = "\t")
X = test_data['review'].apply(lambda x: x.strip().split(" "))
X = list(X)
preprocess = Preprocess(X, sen_len=200)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = MyDataset(X=test_x, y=None)
test_loader = DataLoader(dataset = test_dataset,batch_size =batch,shuffle = False,num_workers = 0)
print('\nload model ...')
model = torch.load(os.path.join('model', 'LSTM_all_data.model'))
model.eval()
ret_output = []
with torch.no_grad():
    for i, inputs in enumerate(test_loader):
        inputs = inputs.to(device, dtype=torch.long)
        outputs = model(inputs)
        outputs = outputs.squeeze()
        ret_output += outputs.float().tolist()
tmp = pd.DataFrame({"id":test_data['id'],"sentiment":ret_output})
print("save csv ...")
tmp.to_csv(os.path.join('out', 'LSTM_all.csv'), index=False)
print("Finish Predicting")

Get embedding,loading word2vec model ...
get words #121343
total words: 121345
sentence count #25000
load model ...
save csv ...
Finish Predicting
