数据预处理

In [None]:
import pandas as pd
# Read the raw training and test splits from the working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Preview the first rows of the training frame as the cell output.
train_data.head()

（1）将所有的大小写统一为小写字母

In [2]:
def lowercase_text(text):
    """Return ``text`` with every cased character converted to lowercase."""
    return text.lower()

# Lowercase the tweet text of both splits, overwriting the 'text' column.
train_data['text'] = train_data['text'].apply(lowercase_text)
test_data['text'] = test_data['text'].apply(lowercase_text)

In [3]:
train_data['text'].head()

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    just got sent this photo from ruby #alaska as ...
Name: text, dtype: object

（2）去除噪声（即非字母的无用字符）

In [4]:
import re
import string
# Removing punctuation, html tags, symbols, numbers, etc.
def remove_noise(text):
    """Strip markup, links, punctuation and digit-bearing words from ``text``.

    The steps run in this order:
      1. drop square-bracketed spans such as ``[link]``;
      2. drop URLs starting with ``http://``, ``https://`` or ``www.``;
      3. drop HTML-like tags (``<...>``);
      4. drop every character listed in ``string.punctuation``;
      5. replace each newline with a space so that words on adjacent lines
         are not glued together;
      6. drop every word that contains at least one digit.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace left behind by removed spans is not
        collapsed.
    """
    # Raw string literals keep regex escapes (\[ \S \w \d) from being parsed
    # as invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A space (not the empty string) keeps "fire\nhelp" as two words.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [5]:
# Clean the tweet text of both splits, overwriting the 'text' column.
train_data['text'] = train_data['text'].apply(remove_noise)
test_data['text'] = test_data['text'].apply(remove_noise)

In [6]:
train_data['text'].head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3     people receive wildfires evacuation orders in...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

（3）去除停用词（即冠词、介词、副词或连词等）

In [7]:
from nlppreprocess import NLP
import nltk

# Build the nlppreprocess pipeline with its default arguments.
# NOTE(review): the section heading says this step removes stop words; the
# exact set of transformations depends on the NLP() defaults - confirm.
nlp = NLP()

# Run the pipeline over the text column of both splits, overwriting it.
train_data['text'] = train_data['text'].apply(nlp.process)
test_data['text'] = test_data['text'].apply(nlp.process) 

（4）将所有词转换为词干形式（词干提取）

In [8]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

def stemming(text):
    """Stem each whitespace-separated word of ``text`` and join the stems with single spaces."""
    return ' '.join(stemmer.stem(word) for word in text.split())

# Replace every word in the text column of both splits with its stem.
train_data['text'] = train_data['text'].apply(stemming)
test_data['text'] = test_data['text'].apply(stemming)

In [9]:
from sklearn.model_selection import train_test_split
# Persist the fully cleaned training frame, then create the train/validation
# split and write every split to CSV for the torchtext loaders.
train_data.to_csv("train_clean_1.csv", index=False)

# A fixed random_state makes the 90/10 shuffle split (and therefore the CSV
# files written below) identical on every Restart & Run All. 1234 is the same
# value the notebook later assigns to SEED for torch.
train, val = train_test_split(train_data, test_size=0.1, random_state=1234)
train.to_csv("train_1.csv", index=False)
val.to_csv("val_1.csv", index=False)
test_data.to_csv("test_1.csv", index=False)

In [10]:
import spacy
import torch
from torchtext import data, datasets
from torchtext.vocab import Vectors
from torch.nn import init

# Seed value passed to the torch RNG calls below.
SEED = 1234

# Seed the torch generators (CPU and CUDA) and request deterministic cuDNN
# kernels so repeated runs are comparable.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
# NOTE(review): the later cells visible in this notebook use a separately
# defined lowercase `device` rather than this constant.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [11]:
spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text):
    """Split ``text`` into a list of token strings with the spaCy English tokenizer."""
    return [tok.text for tok in spacy_en.tokenizer(text)]

# By default a Field expects its input to be a sequence of words and maps each
# word to an integer; that mapping is called the vocab. For a field that is
# already numeric and is not a sequence, pass use_vocab=False and
# sequential=False instead.
LABEL = data.LabelField(dtype=torch.float)

TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)



In [12]:
# Load the train/validation CSVs written earlier. Each (name, field) pair is
# matched to a CSV column by position; columns paired with None are presumably
# ignored by torchtext - confirm against the installed version.
# skip_header=True is passed because the CSVs start with a header row.
train,val = data.TabularDataset.splits(
        path='.',train='train_1.csv',validation='val_1.csv', format='csv',skip_header=True,
        fields=[('id',None),('keyword',None),('location', None), ('text', TEXT),('target',LABEL)])

# The test CSV is loaded with a text field only; no 'target' field is declared.
test = data.TabularDataset('test_1.csv', format='csv',skip_header=True,
        fields=[('id',None),('keyword',None),('location', None), ('text', TEXT)])


查看生成的dataset：

In [13]:
# Inspect the first test example: its repr, its attribute names, and its tokens.
print(test[0])
print(test[0].__dict__.keys())
print(test[0].text)


<torchtext.data.example.Example object at 0x000002035D6DD978>
dict_keys(['text'])
['just', 'happen', 'terribl', 'car', 'crash']


In [14]:
# Build the text vocabulary from the training split only, capped at
# max_size=25000, and attach the "glove.6B.100d" pretrained vectors.
# unk_init=torch.Tensor.normal_ is the initializer passed for tokens that have
# no pretrained vector.
TEXT.build_vocab(train, max_size=25000, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)
# Build the label vocabulary from the training split.
LABEL.build_vocab(train)

In [15]:
# Report the sizes of the two vocabularies built above.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 12502
Unique tokens in LABEL vocabulary: 2


In [16]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create one iterator per split; batches are placed on `device`.
# sort=False is passed explicitly, so no sort key is supplied here.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, val, test), 
    batch_size=BATCH_SIZE,
    device=device,
    sort=False)

In [17]:
import torch.nn as nn
import torch.nn.functional as F
class RNN(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> dropout -> linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state per direction.
        output_dim: Number of outputs produced by the linear head.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
        pad_idx: Vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # nn.LSTM only applies dropout between stacked layers; a non-zero value
        # with a single layer has no effect and only emits a warning.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        # The head consumes one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return logits of shape [batch size, output_dim].

        Args:
            text: Token indices of shape [sent len, batch size].
        """
        embedded = self.dropout(self.embedding(text))  # [sent len, batch size, emb dim]
        output, (hidden, cell) = self.rnn(embedded)
        # output = [sent len, batch size, hid dim * num directions]
        # hidden = [num layers * num directions, batch size, hid dim]
        # cell   = [num layers * num directions, batch size, hid dim]

        if self.bidirectional:
            # Last layer: forward state is hidden[-2], backward state is hidden[-1].
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]
        hidden = self.dropout(hidden)  # [batch size, hid dim * num directions]

        # No squeeze here: squeezing dim 0 would drop the batch dimension for a
        # batch of one and break callers that then call .squeeze(1).
        return self.fc(hidden)

In [18]:
# Model hyperparameters. INPUT_DIM is the vocabulary size; EMBEDDING_DIM is
# 100 to match the "glove.6B.100d" vectors requested when the vocab was built.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
# Index of the padding token, passed to the embedding layer as padding_idx.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, 
            N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

In [19]:
def count_parameters(model):
    """Return the total element count of ``model``'s parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,560,857 trainable parameters


In [20]:
# Copy the vectors attached to the vocabulary into the embedding layer.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Overwrite the rows for the unknown and padding tokens with zeros.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0857, -0.2220,  0.1657,  ..., -0.0743,  0.7581, -0.3424],
        ...,
        [ 0.9511, -0.4373,  0.4993,  ...,  0.6739,  0.2300,  0.4647],
        [ 1.4546,  0.4323, -1.0947,  ...,  1.4238,  0.5110,  0.3144],
        [ 0.4593,  0.0251,  0.0319,  ..., -0.0136, -0.5006,  0.5798]])


In [21]:
import torch.optim as optim

# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())
# The loss takes raw logits; the model's forward pass applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)


In [22]:
def binary_accuracy(preds, y):
    """Return the fraction of correct predictions in a batch as a tensor.

    ``preds`` are raw logits: they are passed through a sigmoid and rounded to
    0/1 before being compared with ``y``. For example, 8 correct out of 10
    yields 0.8, not 8.
    """
    probabilities = torch.sigmoid(preds)
    hard_labels = torch.round(probabilities)
    # Cast the boolean matches to float so they can be summed and divided.
    matches = (hard_labels == y).float()
    return matches.sum() / len(matches)

In [23]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Puts the model in training mode, then for every batch computes the loss on
    ``batch.text`` / ``batch.target``, back-propagates and steps the optimizer.

    Returns:
        A ``(mean loss, mean accuracy)`` tuple averaged over the batches.

    Note: this definition rebinds the notebook-level name ``train``, which an
    earlier cell assigns to the training dataset.
    """
    model.train()
    loss_total = 0
    acc_total = 0

    for batch in iterator:
        optimizer.zero_grad()
        targets = batch.target
        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, targets)
        acc = binary_accuracy(logits, targets)
        loss.backward()
        optimizer.step()

        loss_total += loss.item()
        acc_total += acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [24]:
def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating any weights.

    Puts the model in evaluation mode and disables gradient tracking, then
    accumulates the loss and accuracy on ``batch.text`` / ``batch.target``.

    Returns:
        A ``(mean loss, mean accuracy)`` tuple averaged over the batches.
    """
    model.eval()
    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for batch in iterator:
            targets = batch.target
            logits = model(batch.text).squeeze(1)
            loss_total += criterion(logits, targets).item()
            acc_total += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [25]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        A ``(minutes, seconds)`` tuple of ints, both truncated toward zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - minutes * 60)
    return minutes, seconds

In [None]:
N_EPOCHS = 100
# Lowest validation loss seen so far; starts at infinity so the first epoch
# always produces a checkpoint.
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves, so 'lstm-model.pt'
    # always holds the best weights seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'lstm-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 4s
	Train Loss: 0.563 | Train Acc: 71.76%
	 Val. Loss: 0.500 |  Val. Acc: 75.84%
Epoch: 02 | Epoch Time: 0m 4s
	Train Loss: 0.501 | Train Acc: 77.21%
	 Val. Loss: 0.447 |  Val. Acc: 79.76%
Epoch: 03 | Epoch Time: 0m 4s
	Train Loss: 0.457 | Train Acc: 79.80%
	 Val. Loss: 0.427 |  Val. Acc: 81.52%
Epoch: 04 | Epoch Time: 0m 4s
	Train Loss: 0.430 | Train Acc: 80.81%
	 Val. Loss: 0.418 |  Val. Acc: 81.60%
Epoch: 05 | Epoch Time: 0m 4s
	Train Loss: 0.395 | Train Acc: 82.80%
	 Val. Loss: 0.424 |  Val. Acc: 81.92%
Epoch: 06 | Epoch Time: 0m 4s
	Train Loss: 0.374 | Train Acc: 84.32%
	 Val. Loss: 0.395 |  Val. Acc: 83.86%


In [None]:
# model.load_state_dict(torch.load('lstm-model.pt'))
# test_loss, test_acc = evaluate(model, test_iterator, criterion)
# print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
# Restore the checkpoint saved by the training loop. map_location lets a
# checkpoint written on a GPU machine load on whatever `device` is in use.
model.load_state_dict(torch.load('lstm-model.pt', map_location=device))
# Inference must run with dropout disabled, regardless of the mode the model
# was left in (e.g. after an interrupted training epoch).
model.eval()

TARGET=[]
test_source = pd.read_csv('test.csv')
ID=test_source['id'].values
# Gradients are not needed for prediction.
with torch.no_grad():
    # Iterating the dataset yields one example at a time (not a batch).
    for batch in test:
        indexed = [TEXT.vocab.stoi[t] for t in batch.text]
        if not indexed:
            # Preprocessing can leave a tweet with no tokens; the LSTM rejects
            # zero-length sequences, so feed a single unknown token instead.
            indexed = [TEXT.vocab.stoi[TEXT.unk_token]]
        tensor = torch.LongTensor(indexed).to(device)
        tensor = tensor.unsqueeze(1)  # [sent len, batch size = 1]
        prediction = torch.sigmoid(model(tensor))
        TARGET.append(round(prediction.item()))
submission = pd.DataFrame({'id': ID, 'target':TARGET })
submission.to_csv(r'pytorch_lstm.csv', index=False)
print('successful')