In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import urllib.request
import numpy as np
import re


# **PYTORCH 사용**

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F


**함수 preprocess_text** <br>
입력 데이터 전처리 작업:
한글, 숫자, 영어를 제외한 글자는 모두 '_'로 대치

In [4]:
def preprocess_text(text):
   
    # input  text ='이 SF영화 재미^^;; 없는데.....왜 7점이지???'
    # output text ='$이_SF영화_재미_없는데_왜_7점이지$'
    
    non_alpha_numeric_hangul = re.compile('[^0-9a-zA-Z\u3131-\u3163\uac00-\ud7a3]')
    SS = '$'  # Sentence Separator
    
    t = non_alpha_numeric_hangul.sub('_', text)
    text = re.sub(r'_+', '_', t)
    return SS+text+SS

**함수 prepare_data_file** <br>

입력 파일 형식에 맞추어 data와 target 값 읽어오기

---

**id	document	label**<br>
6270596	굳 ㅋ	1 <br>
9274899	GDNTOPCLASSINTHECLUB	0 <br>
8544678	뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아	0<br>
6825595	지루하지는 않은데 완전 막장임... 돈주고 보기에는....	0

In [5]:
def prepare_data_file(FILE_PATH):
    
    # file load from URL
    with urllib.request.urlopen(FILE_PATH) as f:
        lines = f.read().decode('utf-8').split('\n')

    data, target = [], []
    for l in lines[1:]:   # 첫번째 줄은 skip
        try:
            _, text, label = l.strip().split('\t')   # ID\treview\tlabel 
        except ValueError:
            pass
        text = text.strip()
        if text == '': continue
        data.append(preprocess_text(text))
        target.append(int(label))
            
    return data, target

**함수 extract_features** <br>
Bigram feature 추출하여 feature_dict로 만들어 return <br>
MAX_FEATURES: 사용할 feature 갯수
```
$이_SF영화_재미_없는데_왜_7점이지$ 
($이) (이_) (_S) (SF) (F영) (영화) (화_) (_재) (재미) .. (이지) (지$)
```


In [6]:
def extract_features(data, MAX_FEATURES):
    
    FEATURES = dict()
      
    for line in data:
        uni_list = list(line)
        bi_list = [''.join(uni_list[z:z+2]) for z in range(0, len(uni_list)-1)]
        for bigram in bi_list:
            if bigram in FEATURES:
                FEATURES[bigram] += 1
            else:
                FEATURES[bigram] = 1
    
    features_list = [(x, f) for (x, f) in FEATURES.items()]
    features_list.sort(reverse=True, key=lambda z:z[1])
    features_dict = dict()
    for (idx, (x, y)) in enumerate(features_list[:MAX_FEATURES]):
        features_dict[x] = idx

    return features_dict

**함수 make_feature_vevtor** <br>
입력 문장을 고정된 크기의 Feature Vector로 변환<br>

```
$이_SF영화_재미_없는데_왜_7점이지$ 
($이) (이_) (_S) (SF) (F영) (영화) (화_) (_재) (재미) .. (이지) (지$)
[1, 0, 0, 1, 0, 0, ...., 1, 0, 0]
```


In [7]:
def make_feature_vector(feature_set, data, target):

    fv_base = [0 for _ in range(0, len(feature_set))]
    feature_list = []
    for (x, label) in zip(data, target):
        uni_list = list(x)
        fv = fv_base[:]
        bi_list = [''.join(uni_list[z:z+2]) for z in range(0, len(uni_list)-1)]
        for bigram in bi_list: 
            if bigram in feature_set:
                fv[feature_set[bigram]] = 1.0   # 여기 수정되었음 (integer-->float)
        feature_list.append(fv + [label])
    feature_list = np.array(feature_list)
    np.random.shuffle(feature_list)

    # pytorch의 TENSOR로 변환    
    feature_tensor = torch.tensor(feature_list[:, :-1], dtype=torch.float)
    target_tensor = torch.tensor(feature_list[:, -1], dtype=torch.long)    
    
    return feature_tensor, target_tensor

**학습데이터, 평가데이터 읽어오기**

In [8]:
# NAVER MOVIE REVIEW corpus로부터 데이터 loading
TRAIN_FILE = 'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt'
TEST_FILE  = 'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt'

print('prepare_data_file START...')
train_data, train_target = prepare_data_file(TRAIN_FILE)
test_data, test_target = prepare_data_file(TEST_FILE)
print('prepare_data_file END...')

prepare_data_file START...
prepare_data_file END...


**학습데이터로부터 MAX_FEATURES 개의 bigram feature 추출**

In [9]:
# Bigram Feature 추출
print('extract_features START...')
MAX_FEATURES = 1000  # 사용할 feature 개수
feature_set = extract_features(train_data, MAX_FEATURES)
with open('/content/drive/MyDrive/IntroAI/features.out', 'w', encoding='utf8') as fo:
    fo.write('\n'.join([x+'\t'+str(idx) for x, idx in feature_set.items()]))
print('extract_features END...')

extract_features START...
extract_features END...


**입력 파일을 고정된 크기의 feature vector로 변화**

In [10]:
# 입력 파일을 feature vector로 변환
print('make_feature_vector START...')
x_train, y_train = make_feature_vector(feature_set, train_data, train_target)
x_test,  y_test  = make_feature_vector(feature_set, test_data, test_target)
print('make_feature_vector END...')

make_feature_vector START...
make_feature_vector END...


# **MLP 딥러닝 모델 생성 및 학습**

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [12]:
# Network
class MLPNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1000, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 2)


    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x


model = MLPNet().to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
loss_fn = nn.CrossEntropyLoss()

In [13]:
batch_size = 32
total_batch = int(len(x_train) / batch_size)

for epoch in range(20):  # EPOCH 20
    
    # training
    for idx in range(total_batch):

        x = x_train[batch_size*idx:batch_size*(idx+1)]
        y = y_train[batch_size*idx:batch_size*(idx+1)]

        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)
        loss = loss_fn(out, y)        
        loss.backward()
        optimizer.step()

        if idx % 1000 == 0:
            print('epoch: ', epoch, '  batch_idx: ', idx, loss.item())


epoch:  0   batch_idx:  0 0.6914579272270203
epoch:  0   batch_idx:  1000 0.4556080996990204
epoch:  0   batch_idx:  2000 0.455923467874527
epoch:  0   batch_idx:  3000 0.41243255138397217
epoch:  0   batch_idx:  4000 0.46265119314193726
epoch:  1   batch_idx:  0 0.2888001501560211
epoch:  1   batch_idx:  1000 0.38514095544815063
epoch:  1   batch_idx:  2000 0.42409127950668335
epoch:  1   batch_idx:  3000 0.38725563883781433
epoch:  1   batch_idx:  4000 0.40799957513809204
epoch:  2   batch_idx:  0 0.26871952414512634
epoch:  2   batch_idx:  1000 0.36208614706993103
epoch:  2   batch_idx:  2000 0.37289559841156006
epoch:  2   batch_idx:  3000 0.3314369022846222
epoch:  2   batch_idx:  4000 0.3696926534175873
epoch:  3   batch_idx:  0 0.2446535974740982
epoch:  3   batch_idx:  1000 0.3451700806617737
epoch:  3   batch_idx:  2000 0.30348825454711914
epoch:  3   batch_idx:  3000 0.29172563552856445
epoch:  3   batch_idx:  4000 0.28541404008865356
epoch:  4   batch_idx:  0 0.1968127936124

### **정확도 평가**

In [14]:
batch_size = 32
total_batch = len(x_test) // batch_size
rest = 1 if len(x_test) % batch_size else 0 

# testing
total_cnt = 0
correct_cnt = 0

for idx in range(total_batch + rest):

    if idx == total_batch: 
      b, e = batch_size * idx, len(x_test)
    else: 
      b, e = batch_size * idx, batch_size * (idx+1)
    x = x_test[b:e]
    y = y_test[b:e]

    x, y = x.to(device), y.to(device)
    out = model(x)
    _, pred_label = torch.max(out, 1)
    total_cnt += len(x)
    correct_cnt += (pred_label == y).sum().item()

print('total_count: ', total_cnt, '  accuracy: ', correct_cnt*1.0/total_cnt)


total_count:  49998   accuracy:  0.7904916196647865


### **수행해 보기**

In [16]:
# 수행해 보기
text = '시간낭비 이딴걸 왜 봄?'   
text = preprocess_text(text)
one_test, _ = make_feature_vector(feature_set, [text], [0])
one_test = one_test.to(device)
result = model(one_test)
_, pred_label = torch.max(result, 1)
print(text, '==> ', ['Negative', 'Positive'][pred_label])

$시간낭비_이딴걸_왜_봄_$ ==>  Negative
