## 1. 전처리

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data, datasets
from torchtext.data import TabularDataset

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

import gensim

import string
import re

from spellchecker import SpellChecker

In [2]:
# Load the raw train and test splits from CSV files under ./train_test_data/.
train = pd.read_csv("./train_test_data/train.csv")
test = pd.read_csv("./train_test_data/test.csv")

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Report the size of each split, then stack train on top of test so that the
# cleaning steps below run once over both.
print(train.shape)
print(test.shape)
# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it the
# test rows keep their own labels (0, 1, 2, ...), which collide with the train
# rows' labels and make label-based lookups on `df` ambiguous.
df = pd.concat([train, test], ignore_index=True)
print(df.shape)

(7613, 5)
(3263, 4)
(10876, 5)


In [5]:
def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` link removed.

    A link is matched from its prefix up to (not including) the next whitespace
    character and replaced with the empty string.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Quick check of remove_URL on a sentence that ends with a link.
example = "New competition launched: http://www.kaggle.com/c/nlp-getting-started"

remove_URL(example)

'New competition launched: '

In [6]:
# Strip links from every row of the text column.
df['text'] = df['text'].apply(remove_URL)

In [7]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

# Quick check of remove_html on a small multi-line HTML snippet.
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

print(remove_html(example))


Real or Fake
Kaggle 
getting started



In [8]:
# Strip HTML tags from every row of the text column.
df['text'] = df['text'].apply(remove_html)

In [9]:
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table that maps a character to None deletes that character.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)

# Quick check of remove_punct: the '#' should disappear.
example = "I am a #king"
print(remove_punct(example))

I am a king


In [10]:
# Remove punctuation from every row of the text column.
df['text'] = df['text'].apply(remove_punct)

In [11]:
def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only symbol blocks are matched. The previous character class contained the
    single range U+24C2..U+1F251, which also covers CJK ideographs, Hangul,
    Kana and many other scripts, so ordinary non-Latin text was deleted along
    with the emoji. Every range below lies inside what the old class matched,
    so nothing is removed now that was not removed before.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each run of matched characters replaced by ``''``.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs, flags
        "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U000024C2"             # circled latin capital letter M
        "\U0000FE0F"             # variation selector-16 (emoji presentation)
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r'', text)

# Quick check of remove_emoji: the two trailing emoji should be removed.
remove_emoji("Omg another Earthquake 😔😔")

'Omg another Earthquake '

In [12]:
# Remove emoji from every row of the text column.
df['text'] = df['text'].apply(remove_emoji)

In [13]:
# Discard the three named columns; the frame shown in the next cell keeps text and target.
df = df.drop(columns=["id", "keyword", "location"])

In [14]:
# Preview the cleaned frame after dropping the unused columns.
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this earthquake Ma...,1.0
1,Forest fire near La Ronge Sask Canada,1.0
2,All residents asked to shelter in place are be...,1.0
3,13000 people receive wildfires evacuation orde...,1.0
4,Just got sent this photo from Ruby Alaska as s...,1.0


In [15]:
# Lower-case the whole text column, then preview the result.
df["text"] = df["text"].str.lower()
df.head()

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1.0
1,forest fire near la ronge sask canada,1.0
2,all residents asked to shelter in place are be...,1.0
3,13000 people receive wildfires evacuation orde...,1.0
4,just got sent this photo from ruby alaska as s...,1.0


In [16]:
# Delete every run of digits from the text column, then preview the result.
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to a literal match, under which the pattern would be
# searched for as the four characters "\d+" and nothing would be removed.
# The raw string avoids the invalid "\d" escape of a normal string literal.
df["text"] = df["text"].str.replace(r"\d+", "", regex=True)
df.head()

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1.0
1,forest fire near la ronge sask canada,1.0
2,all residents asked to shelter in place are be...,1.0
3,people receive wildfires evacuation orders in...,1.0
4,just got sent this photo from ruby alaska as s...,1.0


In [17]:
def correct_spellings(text, spell=None):
    """Replace words unknown to the spell checker with its best suggestion.

    Args:
        text: Whitespace-separated input string.
        spell: Optional object exposing ``unknown(words)`` and
            ``correction(word)`` (e.g. a ``SpellChecker``). When omitted, one
            ``SpellChecker`` is created on first use and reused by later calls,
            instead of building a new one for every row.

    Returns:
        The words of ``text`` joined by single spaces, with each unknown word
        replaced by the checker's suggestion. A word is left unchanged when the
        checker offers no suggestion.
    """
    if spell is None:
        # Reuse a single checker across calls; this function is applied row by
        # row over the whole frame, so per-call construction is wasted work.
        spell = getattr(correct_spellings, "_shared_spell", None)
        if spell is None:
            spell = SpellChecker()
            correct_spellings._shared_spell = spell

    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            suggestion = spell.correction(word)
            # NOTE(review): recent pyspellchecker releases appear to return None
            # when there is no candidate - confirm against the installed version.
            # Falling back to the original word keeps " ".join from raising.
            corrected_text.append(word if suggestion is None else suggestion)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

# Quick check of correct_spellings on a sentence with two misspelled words.
text = "corect me plese"
print(correct_spellings(text))

correct me please


In [None]:
# Spell-correct every row of the text column.
df['text'] = df['text'].apply(correct_spellings)

In [19]:
# Split the cleaned frame back into its two parts by position: the first 7613
# rows are the train rows (train.shape printed above), the rest are the test rows.
n_train_rows = 7613
train_data = df[:n_train_rows]
test_data = df[n_train_rows:]
print(train_data.shape)
print(test_data.shape)
# Write next to the raw CSVs: the torchtext loading cell reads
# ./train_test_data/train_data_fin.csv and ./train_test_data/test_data_fin.csv,
# so saving into the working directory left that cell without its input files.
train_data.to_csv("./train_test_data/train_data_fin.csv", index=False)
test_data.to_csv("./train_test_data/test_data_fin.csv", index=False)

(7613, 2)
(3263, 2)


## 2. torchtext로 데이터 넘기기

In [18]:
def tokenizer(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return its result."""
    tokens = word_tokenize(text)
    return tokens

In [19]:
# Field for the tweet text: sequential, tokenized with `tokenizer`, batch-first.
TEXT = data.Field(sequential=True, tokenize=tokenizer, batch_first=True)
# Field for the label: non-sequential and flagged as the target.
LABEL = data.Field(sequential=False, batch_first = True, is_target = True)

In [20]:
# Read the cleaned CSVs into torchtext datasets. The header row is skipped and
# the two CSV columns are bound, in order, to the TEXT and LABEL fields.
train_data_torchtext, test_data_torchtext = TabularDataset.splits(
    path=".",
    train="./train_test_data/train_data_fin.csv",
    test="./train_test_data/test_data_fin.csv",
    format="csv",
    fields=[("text", TEXT), ("label", LABEL)],
    skip_header=True,
)

In [21]:
# Print the number of examples in the train and test datasets.
print("훈련 샘플의 개수 :", len(train_data_torchtext))
print("테스트 샘플의 개수 :", len(test_data_torchtext))

훈련 샘플의 개수 : 7613
테스트 샘플의 개수 : 3263


In [22]:
# Show the attributes (tokenized text and raw label string) of the first training example.
print(vars(train_data_torchtext[0]))

{'text': ['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all'], 'label': '1.0'}


In [23]:
# Build the text vocabulary from the training dataset with min_freq=5 and
# request the "glove.6B.300d" pretrained vectors for it.
TEXT.build_vocab(train_data_torchtext, min_freq=5, vectors="glove.6B.300d")
# Build the label vocabulary from the same dataset.
LABEL.build_vocab(train_data_torchtext)

In [24]:
# Print the number of entries in the text and label string-to-index maps.
print(len(TEXT.vocab.stoi))
print(len(LABEL.vocab.stoi))

2802
3


In [25]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu와 cuda 중 다음 기기로 학습함:", DEVICE)

cpu와 cuda 중 다음 기기로 학습함: cuda


In [26]:
# Split the training dataset 80/20 into train and validation parts. The name
# train_data_torchtext is rebound to the smaller training part.
# NOTE(review): no random state is passed, so the split presumably differs
# between runs - confirm and pin it if reproducibility matters.
train_data_torchtext, val_data_torchtext = train_data_torchtext.split(split_ratio=0.8)

In [27]:
# Batch iterators: shuffled mini-batches of 64 for train and validation, and a
# single unshuffled batch holding the whole test dataset.
train_iterator, val_iterator  = data.BucketIterator.splits((train_data_torchtext, val_data_torchtext),
                                                           batch_size = 64, shuffle = True, sort=False)
test_iterator = data.BucketIterator(test_data_torchtext, batch_size=len(test_data_torchtext), shuffle=False)

In [28]:
# Print how many mini-batches each iterator yields per pass.
print("훈련 데이터의 미니 배치의 개수 :", len(train_iterator))
print("검증 데이터의 미니 배치의 개수 :", len(val_iterator))
print("테스트 데이터의 미니 배치의 개수 :", len(test_iterator))

훈련 데이터의 미니 배치의 개수 : 96
검증 데이터의 미니 배치의 개수 : 24
테스트 데이터의 미니 배치의 개수 : 1


In [29]:
# Pull one batch from the train iterator and one from the test iterator and
# print the shape of their text tensors.
batch = next(iter(train_iterator))
print(batch.text.shape)
Batch = next(iter(test_iterator))
print(Batch.text.shape)

torch.Size([64, 27])
torch.Size([3263, 31])


In [30]:
# Re-create the three iterators with the same arguments as the earlier iterator cell.
# NOTE(review): presumably done to start from fresh iterators after the batch
# inspection above - confirm whether this repetition is still needed.
train_iterator, val_iterator  = data.BucketIterator.splits((train_data_torchtext, val_data_torchtext),
                                                           batch_size = 64, shuffle = True, sort=False)
test_iterator = data.BucketIterator(test_data_torchtext, batch_size=len(test_data_torchtext), shuffle=False)

In [31]:
# Sizes passed to the model: number of vocabulary entries and number of output classes.
vocab_size = len(TEXT.vocab)
n_classes = 2

## 3. 모델 만들기

In [32]:
class LSTM(nn.Module):
    """Bidirectional LSTM classifier: embedding -> BiLSTM -> dropout -> linear.

    Args:
        n_layers: Number of stacked LSTM layers.
        hidden_dim: Hidden size of each LSTM direction.
        n_vocab: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        n_classes: Number of output logits.
        dropout_p: Dropout probability applied before the output layer.
    """

    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p = 0.2):
        super(LSTM, self).__init__()
        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=n_layers, batch_first=True, bidirectional=True)
        # The two directions are concatenated, hence hidden_dim * 2 input features.
        self.out = nn.Linear(hidden_dim*2, n_classes, bias=True)

    def forward(self, x):
        """Map a batch-first tensor of token ids to logits of shape (batch, n_classes)."""
        x = self.embed(x)
        x, _ = self.lstm(x)
        # Keep only the output at the last time step.
        # NOTE(review): for the backward direction this position has seen only
        # the final token, and with padded batches it may be a pad position -
        # consider using the final hidden states instead.
        h_t = x[:, -1, :]
        # nn.Dropout is not in-place: its result must be assigned, otherwise
        # dropout is never applied to the features fed to the classifier.
        h_t = self.dropout(h_t)
        logit = self.out(h_t)
        return logit

In [33]:
# Build the classifier (n_layers=3, hidden_dim=300, n_vocab=vocab_size,
# embed_dim=300) on the selected device, with an Adam optimizer over its parameters.
model = LSTM(3, 300, vocab_size, 300, n_classes).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())

In [34]:
# Smoke test: run a random 5x4 batch of token ids through the model and print the output shape.
sample = torch.LongTensor(5, 4).random_(0, 10).to(DEVICE)
sample_out = model(sample)
print(sample_out.shape)

torch.Size([5, 2])


In [35]:
def train(model, optimizer, train_iter):
    """Run one pass of gradient updates over every batch in ``train_iter``.

    Args:
        model: Module called on the batch text tensor to produce logits.
        optimizer: Optimizer stepped once per batch.
        train_iter: Iterable of batches exposing ``.text`` and ``.label``.
    """
    model.train()
    for minibatch in train_iter:
        inputs = minibatch.text.to(DEVICE)
        targets = minibatch.label.to(DEVICE)
        # Shift the label ids down by one, in place, before computing the loss.
        targets.data.sub_(1)
        optimizer.zero_grad()
        batch_loss = F.cross_entropy(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

In [36]:
def evaluate(model, val_iterator):
    """Compute the average loss and accuracy of ``model`` over ``val_iterator``.

    Args:
        model: Module called on the batch text tensor to produce logits.
        val_iterator: Iterable of batches exposing ``.text`` and ``.label``,
            with a ``.dataset`` attribute whose length is the example count.

    Returns:
        ``(avg_loss, avg_accuracy)`` as Python floats: the summed cross-entropy
        divided by the dataset size, and the percentage of correct predictions.
    """
    model.eval()
    corrects, total_loss = 0, 0.0
    # No gradients are needed for evaluation; no_grad avoids building the
    # autograd graph for every validation batch.
    with torch.no_grad():
        for batch in val_iterator:
            x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
            # Shift the label ids down by one, in place, as the training step does.
            y.data.sub_(1)
            logit = model(x)
            loss = F.cross_entropy(logit, y, reduction="sum")
            total_loss += loss.item()
            # .item() keeps the running count a plain int instead of a device tensor.
            corrects += (logit.max(1)[1].view(y.size()) == y).sum().item()

    size = len(val_iterator.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

## 4. 사전 훈련된 워드 임베딩 사용하기 

In [37]:
# Fetch the vectors attached to the text vocabulary and print their shape.
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([2802, 300])


In [38]:
# Overwrite the model's embedding weights, in place, with the pretrained vectors.
model.embed.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 0.0748, -0.5569,  0.2973,  ...,  0.1697, -0.4709, -0.2061],
        [ 0.0009,  0.3403,  0.1606,  ...,  0.5594, -0.1802, -0.6959],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')

## 5. 모델 훈련하기 

In [None]:
# Train for n_epochs epochs and checkpoint the weights whenever the validation
# loss improves on the best value seen so far.
best_val_loss = None
n_epochs = 100
# Create the checkpoint directory once, up front, rather than checking it on every save.
os.makedirs("BiLSTM_glove300", exist_ok=True)
# range(1, n_epochs + 1) runs exactly n_epochs epochs; range(n_epochs + 1) ran one extra.
for epoch in range(1, n_epochs + 1):
    train(model, optimizer, train_iterator)
    val_loss, val_accuracy = evaluate(model, val_iterator)

    print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (epoch, val_loss, val_accuracy))

    # Compare against None explicitly: `not best_val_loss` would also be true
    # for a best loss of exactly 0.0 and let any later epoch overwrite it.
    if best_val_loss is None or val_loss < best_val_loss:
        torch.save(model.state_dict(), "./BiLSTM_glove300/textclassificator.pt")
        best_val_loss = val_loss

## 6. test data에 모델 적용하기

In [64]:
# Load the weights saved by the training loop back into the model.
model.load_state_dict(torch.load("./BiLSTM_glove300/textclassificator.pt"))

<All keys matched successfully>

In [65]:
def test_result(model, test_iterator):
    """Predict a class index for every example yielded by ``test_iterator``.

    Args:
        model: Module called on the batch text tensor to produce logits.
        test_iterator: Iterable of batches exposing ``.text``.

    Returns:
        A list of 0-dim tensors, one predicted class index per example, in the
        order the iterator yields them.
    """
    model.eval()
    total_result = []
    # Inference only: no_grad avoids building an autograd graph for the test batch.
    with torch.no_grad():
        for batch in test_iterator:
            # Labels are not read here; the previous version moved and shifted
            # them without ever using the result.
            x = batch.text.to(DEVICE)
            logit = model(x)
            result = logit.max(1)[1]
            # Extending a list with a 1-D tensor appends its elements one by one.
            total_result += result

    return total_result

In [66]:
# Predict on the test iterator, then print the prediction count and the first ten predictions.
fin_result = test_result(model, test_iterator)
print(len(fin_result))
print(fin_result[:10])

3263
[tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0')]


In [68]:
# Turn the list of 0-dim prediction tensors into a single-column frame of
# plain Python numbers, then print the first ten rows.
predicted_labels = [prediction.item() for prediction in fin_result]
final_result_csv = pd.DataFrame({"target": predicted_labels})
print(final_result_csv[:10])

   target
0       1
1       1
2       1
3       1
4       1
5       1
6       0
7       0
8       0
9       0


In [69]:
# Keep only the id column of the raw test frame for the submission file.
# Selecting the column (instead of dropping the other three) makes this cell
# safe to re-run: a second drop would raise KeyError on the missing columns.
test = test[["id"]]
test.head()

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11


In [70]:
# Place the ids and the predicted targets side by side (column-wise concat), then preview.
final = pd.concat([test, final_result_csv], axis = 1)
final.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [71]:
# Make sure the output directory exists; to_csv does not create missing directories.
os.makedirs("./result", exist_ok=True)
# Write the submission file without the frame's index column.
final.to_csv("./result/result_BiLSTM_glove300.csv", index=False)