In [1]:
import pandas as pd
import numpy as np
import re
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

import random
import os
import time
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split


from model import DNN
from dataloader import DNNDataset

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [2]:
# Central run configuration read by the cells below.
params = dict(
    EPOCHS=2,                # number of passes over the training loader in train()
    LEARNING_RATE=1e-3,      # learning rate handed to AdamW
    BATCH_SIZE=16,           # batch size for every DataLoader
    SEED=45,                 # seed for seed_everything() and train_test_split()
    DATA_PATH='../../make_data/preprocessed_data/good_bad_df.csv',  # input CSV
    SAVE_PATH='../Models/TF-IDF_Sentence.pt',                       # state_dict output file
    MAX_FEATURES=15000,      # vocabulary cap for the TF-IDF vectorizer
)

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED``, and configures
    cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker subprocesses).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick a different kernel from run to run, which
    # defeats the determinism requested above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

seed_everything(params['SEED']) # Fix all RNG seeds before any data splitting or model creation

In [4]:
# Load the preprocessed table; the first CSV column is used as the index.
data = pd.read_csv(params['DATA_PATH'],index_col=[0])

# Collapse the raw labels to binary: 1 -> 0, then 2 -> 1.
# The order matters: remapping 2 -> 1 first would fold both classes into 0.
data.loc[data['ad_label']==1,'ad_label']=0
data.loc[data['ad_label']==2,'ad_label']=1

# 80 % train; the remaining 20 % is split evenly into validation and test.
train_data, test_data = train_test_split(data,test_size=0.2,random_state=params['SEED'],shuffle=True)
valid_data, test_data = train_test_split(test_data,test_size=0.5,random_state=params['SEED'],shuffle=True)

# Shuffle each split with an explicit seed so that re-running this cell always
# yields the same row order (previously this depended on the global NumPy RNG
# state), and give every split a fresh 0..n-1 index.
train_data = train_data.sample(frac=1,random_state=params['SEED']).reset_index(drop=True)
valid_data = valid_data.sample(frac=1,random_state=params['SEED']).reset_index(drop=True)
test_data = test_data.sample(frac=1,random_state=params['SEED']).reset_index(drop=True)

In [5]:
# For each split: total row count, then row count after dropping duplicate summaries.
for split in (train_data, valid_data, test_data):
    print(len(split))
    print(len(split.drop_duplicates(['summary'])))

# Class balance (fraction of rows per label) for each split.
for split in (train_data, valid_data, test_data):
    print(split['ad_label'].value_counts()/len(split))

6867
6867
858
858
859
859
0    0.696665
1    0.303335
Name: ad_label, dtype: float64
0    0.715618
1    0.284382
Name: ad_label, dtype: float64
0    0.704307
1    0.295693
Name: ad_label, dtype: float64


In [6]:
# Word-level uni/bi-gram TF-IDF with sublinear term-frequency scaling.
vectorizer_options = dict(
    analyzer='word',
    sublinear_tf=True,
    ngram_range=(1, 2),
    max_features=params['MAX_FEATURES'],
    binary=False,
)
tfidf = TfidfVectorizer(**vectorizer_options)
# Fit on the training split only, so the vocabulary never sees validation/test text.
tfidf.fit(train_data['summary'])

TfidfVectorizer(max_features=15000, ngram_range=(1, 2), sublinear_tf=True)

In [7]:
def _build_loader(frame):
    """Wrap ``frame`` in a DNNDataset and return ``(dataset, dataloader)``.

    The loader is created without shuffling; later cells compare predictions
    from these loaders against the DataFrame label columns by position.
    """
    dataset = DNNDataset(frame, tfidf)
    loader = DataLoader(dataset, batch_size=params['BATCH_SIZE'])
    return dataset, loader


train_dataset, train_dataloader = _build_loader(train_data)
valid_dataset, valid_dataloader = _build_loader(valid_data)
test_dataset, test_dataloader = _build_loader(test_data)

In [8]:
def train(model, train_loader,valid_loader, device):
    """Train ``model`` with BCE loss and AdamW, keeping the best-validation weights.

    Runs ``params["EPOCHS"]`` epochs. After each epoch the model is scored with
    ``validation``; whenever the validation F1 improves, a CPU copy of the
    weights is stored. When training ends, those weights are loaded back into
    ``model`` so the returned model really is the best epoch, not simply the
    last one.

    Args:
        model: Network whose output is fed to ``nn.BCELoss`` after being
            flattened with ``reshape(-1)``.
        train_loader: Iterable of batches indexable by ``'text'`` and ``'label'``.
        valid_loader: Loader forwarded to ``validation``.
        device: Device the model and batches are moved to.

    Returns:
        The same ``model`` object, carrying the weights of the epoch with the
        highest validation F1 (or its current weights if no epoch was run).
    """
    model.to(device)

    criterion = nn.BCELoss()
    optimizer = optim.AdamW(model.parameters(),lr=params['LEARNING_RATE'])

    # CUDA events only exist on a GPU; fall back to wall-clock timing on CPU
    # so the function also works when no GPU is available.
    use_cuda_timer = torch.device(device).type == 'cuda'

    best_score = float('-inf')
    best_state = None
    for epoch_num in range(1,params["EPOCHS"]+1):

        if use_cuda_timer:
            starter = torch.cuda.Event(enable_timing=True)
            ender = torch.cuda.Event(enable_timing=True)
        timings=np.zeros((len(train_loader),1))

        model.train()

        train_loss = []
        for cnt, data in enumerate(tqdm(train_loader)):

            if use_cuda_timer:
                starter.record()
            else:
                tic = time.perf_counter()

            train_label = data['label'].to(device)
            train_text = data['text'].to(device)

            optimizer.zero_grad()

            output = model(train_text)

            # Flatten so predictions line up element-wise with the label vector.
            output = output.reshape(-1)
            batch_loss = criterion(output.to(torch.float32), train_label.to(torch.float32))
            train_loss.append(batch_loss.item())

            batch_loss.backward()
            optimizer.step()

            # Per-batch time in milliseconds.
            if use_cuda_timer:
                ender.record()
                torch.cuda.synchronize()
                timings[cnt] = starter.elapsed_time(ender)
            else:
                timings[cnt] = (time.perf_counter() - tic) * 1000.0
        val_loss, val_acc, val_f1 = validation(model, criterion, valid_loader, device)

        time.sleep(1)
        print(f'Epoch [{epoch_num}], Train Loss : [{np.mean(train_loss) :.5f}] \
              Val Loss : [{np.mean(val_loss) :.5f}] Val Accuracy Score : [{val_acc:.5f}] Val F1 Score : [{val_f1:.5f}]')
        print('time per batch',np.mean(timings))
        val_score = val_f1
        if best_score < val_score:
            best_score = val_score
            # Snapshot the weights by value. Keeping a reference to ``model``
            # would be overwritten by the following epochs.
            best_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in model.state_dict().items()
            }

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

In [9]:
def competition_metric(true, pred):
    """Return ``(accuracy, macro-averaged F1)`` for true vs. predicted labels."""
    accuracy = accuracy_score(true, pred)
    macro_f1 = f1_score(true, pred, average="macro")
    return accuracy, macro_f1

def validation(model,criterion,valid_loader, device):
    """Evaluate ``model`` on ``valid_loader`` without tracking gradients.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        criterion: Loss applied to the flattened float32 outputs and labels.
        valid_loader: Iterable of batches indexable by ``'text'`` and ``'label'``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(losses, accuracy, macro_f1)`` where ``losses`` is the list of
        per-batch loss values and the two scores come from ``competition_metric``.
    """
    model.eval()

    losses, predictions, targets = [], [], []
    with torch.no_grad():
        for batch in tqdm(valid_loader):
            features = batch['text'].to(device)
            labels = batch['label'].to(device)

            # One score per sample, flattened to match the label vector.
            scores = model(features).reshape(-1)

            loss = criterion(scores.to(torch.float32), labels.to(torch.float32))
            losses.append(loss.item())

            # Binarise in place at the 0.5 threshold (exactly 0.5 maps to 0).
            scores[scores>0.5] = 1
            scores[scores<=0.5] = 0
            predictions.extend(scores.detach().cpu().numpy().tolist())
            targets.extend(labels.detach().cpu().numpy().tolist())
        accuracy, macro_f1 = competition_metric(targets, predictions)
    return losses, accuracy, macro_f1

In [10]:
def inference(model, test_loader, device):
    """Predict binary labels for every batch in ``test_loader`` and time the forward pass.

    Args:
        model: Network to run; it is moved to ``device`` and put in eval mode.
        test_loader: Iterable of batches indexable by ``'text'``.
        device: Device used for the forward pass.

    Returns:
        1-D ``torch.Tensor`` of 0./1. predictions in loader order (threshold
        0.5; a score of exactly 0.5 maps to 0).
    """
    # CUDA events only exist on a GPU; fall back to wall-clock timing on CPU
    # so the function also works when no GPU is available.
    use_cuda_timer = torch.device(device).type == 'cuda'
    if use_cuda_timer:
        starter = torch.cuda.Event(enable_timing=True)
        ender = torch.cuda.Event(enable_timing=True)
    timings=np.zeros((len(test_loader),1))

    model.to(device)
    model.eval()

    test_predict = []
    with torch.no_grad():
        for rep, data in enumerate(tqdm(test_loader)):
            if use_cuda_timer:
                starter.record()
            else:
                tic = time.perf_counter()

            test_text = data['text'].to(device)

            output = model(test_text)

            # Forward-pass time for this batch, in milliseconds.
            if use_cuda_timer:
                ender.record()
                # Wait for the GPU so the recorded events are complete.
                torch.cuda.synchronize()
                timings[rep] = starter.elapsed_time(ender)
            else:
                timings[rep] = (time.perf_counter() - tic) * 1000.0

            # Binarise in place at the 0.5 threshold.
            output[output>0.5] = 1
            output[output<=0.5] = 0
            test_predict += output.detach().cpu().numpy().tolist()
    # Flatten the per-batch (possibly nested) lists into one prediction vector.
    test_predict = torch.Tensor(test_predict)
    test_predict = test_predict.reshape(-1)

    # NOTE(review): presumably gives the tqdm bar time to flush before printing — confirm.
    time.sleep(3)
    print('Done.')

    print('time per batch',np.mean(timings))
    return test_predict

In [11]:
# Instantiate the network; train() moves it to the device and switches it
# to training mode at the start of every epoch.
model = DNN()
model.eval()

infer_model = train(
    model=model,
    train_loader=train_dataloader,
    valid_loader=valid_dataloader,
    device=device,
)

100%|██████████| 430/430 [00:10<00:00, 41.52it/s]
100%|██████████| 54/54 [00:00<00:00, 425.83it/s]


Epoch [1], Train Loss : [0.15018]               Val Loss : [0.06831] Val Accuracy Score : [0.97669] Val F1 Score : [0.97122]
time per batch 22.59809587168139


100%|██████████| 430/430 [00:09<00:00, 43.80it/s]
100%|██████████| 54/54 [00:00<00:00, 431.54it/s]


Epoch [2], Train Loss : [0.02572]               Val Loss : [0.07732] Val Accuracy Score : [0.98019] Val F1 Score : [0.97557]
time per batch 21.385485804358193


In [12]:
# Held-out test split: predictions are compared with the label column by position.
test_labels = test_data['ad_label']
test_preds = inference(model=infer_model, test_loader=test_dataloader, device=device)

# (accuracy, macro-F1), then the confusion matrix.
print(competition_metric(true=test_labels, pred=test_preds))

print(confusion_matrix(test_labels, test_preds))

100%|██████████| 54/54 [00:00<00:00, 428.36it/s]


Done.
time per batch 0.9949250353707207
(0.9837019790454016, 0.9803904303473825)
[[599   6]
 [  8 246]]


In [13]:
# Validation split: predictions are compared with the label column by position.
valid_labels = valid_data['ad_label']
valid_preds = inference(model=infer_model, test_loader=valid_dataloader, device=device)

# (accuracy, macro-F1), then the confusion matrix.
print(competition_metric(true=valid_labels, pred=valid_preds))

print(confusion_matrix(valid_labels, valid_preds))

100%|██████████| 54/54 [00:00<00:00, 429.25it/s]


Done.
time per batch 1.0038151089791898
(0.9801864801864801, 0.9755692714832463)
[[607   7]
 [ 10 234]]


In [14]:
# Training split: predictions are produced in loader order and later compared
# with the label column by position.
train_labels = train_data['ad_label']
train_preds = inference(model=infer_model, test_loader=train_dataloader, device=device)

100%|██████████| 430/430 [00:01<00:00, 428.30it/s]


Done.
time per batch 1.003320855040883


In [15]:
# (accuracy, macro-F1) on the training split, then its confusion matrix.
print(competition_metric(true=train_labels, pred=train_preds))

print(confusion_matrix(train_labels, train_preds))

(0.9989806320081549, 0.998794226283158)
[[4780    4]
 [   3 2080]]


In [16]:
# Persist only the weights (state_dict) to the configured path; the DNN class
# itself is needed again to load them.
torch.save(infer_model.state_dict(),params['SAVE_PATH'])

In [17]:
# Display the module structure of the trained network.
infer_model

DNN(
  (fc1): Linear(in_features=15000, out_features=10000, bias=True)
  (fc2): Linear(in_features=10000, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=1, bias=True)
  (dropout5): Dropout(p=0.5, inplace=False)
  (dropout2): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)

In [18]:
# Total number of scalar weights that receive gradients.
trainable_param_count = sum(
    weight.numel() for weight in infer_model.parameters() if weight.requires_grad
)
print("infermodel : ", trainable_param_count)

infermodel :  155196305
