In [1]:
import pandas as pd
import numpy as np
import re
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader


import random
import os

from sklearn.metrics import f1_score,accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split


from model import DNN
from dataloader import DNNDataset

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [2]:
params = {
    'EPOCHS':2,
    'LEARNING_RATE':1e-7,
    'BATCH_SIZE':128,
    'SEED':45,
    'MAX_FEATURES': 15000,
    'TRAIN_DATA_PATH': '../../make_data/preprocessed_data/article_train.csv',
    'VALID_DATA_PATH': '../../make_data/preprocessed_data/article_valid.csv',
    'TEST_DATA_PATH': '../../make_data/preprocessed_data/article_test.csv',
    'SAVE_PATH':'../Models/TF-IDF_Article.pt'
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(params['SEED']) # Seed 고정

In [4]:
train_data = pd.read_csv(params['TRAIN_DATA_PATH'],index_col=[0])
valid_data = pd.read_csv(params['VALID_DATA_PATH'],index_col=[0])
test_data = pd.read_csv(params['TEST_DATA_PATH'],index_col=[0])

train_data = train_data.sample(frac=1).reset_index(drop=True)
valid_data = valid_data.sample(frac=1).reset_index(drop=True)
test_data = test_data.sample(frac=1).reset_index(drop=True)

In [5]:
train_data.reset_index(drop=True,inplace=True)
valid_data.reset_index(drop=True,inplace=True)
test_data.reset_index(drop=True,inplace=True)

In [6]:
print(len(train_data))
print(len(train_data.drop_duplicates(['sent_jo'])))

print(len(valid_data))
print(len(valid_data.drop_duplicates(['sent_jo'])))

print(len(test_data))
print(len(test_data.drop_duplicates(['sent_jo'])))

print(train_data['label'].value_counts()/len(train_data))
print(valid_data['label'].value_counts()/len(valid_data))
print(test_data['label'].value_counts()/len(test_data))

64591
64591
11851
11851
15123
15123
0    0.91889
1    0.08111
Name: label, dtype: float64
0    0.944562
1    0.055438
Name: label, dtype: float64
0    0.956292
1    0.043708
Name: label, dtype: float64


In [7]:
tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True, ngram_range=(1, 2), max_features=params['MAX_FEATURES'], binary=False)
tfidf.fit(train_data['sent_jo'])

TfidfVectorizer(max_features=15000, ngram_range=(1, 2), sublinear_tf=True)

In [8]:
train_dataset = DNNDataset(train_data,tfidf)
train_dataloader = DataLoader(train_dataset,batch_size=params['BATCH_SIZE'])

valid_dataset = DNNDataset(valid_data,tfidf)
valid_dataloader = DataLoader(valid_dataset,batch_size=params['BATCH_SIZE'])

test_dataset = DNNDataset(test_data,tfidf)
test_dataloader = DataLoader(test_dataset,batch_size=params['BATCH_SIZE'])

In [9]:
def train(model, train_loader,valid_loader, device):

    model.to(device)

    criterion = nn.BCELoss()
    optimizer = optim.AdamW(model.parameters(),lr=1e-3)
    
    best_score = 0
    best_model = "None"
    for epoch_num in range(1,params["EPOCHS"]+1):

        model.train()
        
        train_loss = []
        for data in tqdm(train_loader):
            
            train_texts = data['text']
            train_labels = data['label']
            train_label = train_labels.to(device)
            train_text = train_texts.to(device)
            
            optimizer.zero_grad()

            output = model(train_text)  
            
            output = output.reshape(-1)
            batch_loss = criterion(output.to(torch.float32), train_label.to(torch.float32)) 
            train_loss.append(batch_loss.item())
            
            batch_loss.backward()
            optimizer.step()

        val_loss, val_acc, val_f1 = validation(model, criterion, valid_loader, device)
        print(f'Epoch [{epoch_num}], Train Loss : [{np.mean(train_loss) :.5f}] \
              Val Loss : [{np.mean(val_loss) :.5f}] Val Accuracy Score : [{val_acc:.5f}] Val F1 Score : [{val_f1:.5f}]')
        
        val_score = val_f1
        if best_score < val_score:
            best_model = model
            best_score = val_score
        
    return best_model                         

In [10]:
def competition_metric(true, pred):
    return accuracy_score(true,pred),f1_score(true, pred, average="macro")

def validation(model,criterion,valid_loader, device):
    model.eval()

    val_loss = []
    model_preds = []
    true_labels = []  
    with torch.no_grad():
        for data in tqdm(valid_loader):
            
            valid_texts = data['text']
            valid_labels = data['label']
          
            valid_label = valid_labels.to(device)
            valid_text = valid_texts.to(device)

            output = model(valid_text)    
            
            output = output.reshape(-1)
            batch_loss = criterion(output.to(torch.float32), valid_label.to(torch.float32)) 
            val_loss.append(batch_loss.item())
            
            output[output>0.5] = 1
            output[output<=0.5] = 0
            model_preds += output.detach().cpu().numpy().tolist()
            true_labels += valid_label.detach().cpu().numpy().tolist()
        val_acc, val_f1 = competition_metric(true_labels, model_preds)
    return val_loss, val_acc, val_f1    

In [11]:
def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    
    test_predict = []
    with torch.no_grad():
        for data in tqdm(test_loader):
            
            test_texts = data['text']
            test_text = test_texts.to(device)

            output = model(test_text)    
            
            output[output>0.5] = 1
            output[output<=0.5] = 0
            test_predict += output.detach().cpu().numpy().tolist()
    test_predict = torch.Tensor(test_predict)
    test_predict = test_predict.reshape(-1)
    print('Done.')
    return test_predict

In [12]:
model = DNN()
model.eval()

infer_model = train(model, train_dataloader, valid_dataloader, device)

100%|██████████| 505/505 [00:12<00:00, 41.13it/s]
100%|██████████| 93/93 [00:01<00:00, 75.11it/s]


Epoch [1], Train Loss : [0.17303]               Val Loss : [0.11328] Val Accuracy Score : [0.96692] Val F1 Score : [0.82205]


100%|██████████| 505/505 [00:11<00:00, 42.91it/s]
100%|██████████| 93/93 [00:01<00:00, 75.72it/s]

Epoch [2], Train Loss : [0.10872]               Val Loss : [0.12044] Val Accuracy Score : [0.96253] Val F1 Score : [0.81747]





In [13]:
test_labels = test_data['label']
test_preds = inference(infer_model,test_dataloader,device)

print(competition_metric(test_labels,test_preds))

print(confusion_matrix(test_labels,test_preds))

100%|██████████| 119/119 [00:01<00:00, 75.62it/s]

Done.
(0.9611188256298353, 0.779508223061707)
[[14130   332]
 [  256   405]]





In [14]:
labels = train_data['label']
preds = inference(infer_model,train_dataloader,device)

print(competition_metric(labels,preds))

print(confusion_matrix(labels,preds))

100%|██████████| 505/505 [00:06<00:00, 73.06it/s]

Done.
(0.9682927962099983, 0.8845489174446675)
[[58777   575]
 [ 1473  3766]]





In [15]:
labels = valid_data['label']
preds = inference(infer_model,valid_dataloader,device)

print(competition_metric(labels,preds))

print(confusion_matrix(labels,preds))

100%|██████████| 93/93 [00:01<00:00, 74.99it/s]

Done.
(0.9625348071892668, 0.8174680549218898)
[[10986   208]
 [  236   421]]





In [16]:
torch.save(infer_model.state_dict(),params['SAVE_PATH'])