In [1]:
import torch
from torch.utils import data
import numpy as np
import pandas as pd
import os
import re
import pickle

from collections import defaultdict

from konlpy.tag import Mecab
# Single Mecab instance, reused further down through mecab.morphs to split text into tokens.
mecab = Mecab()

In [2]:
# Read both splits from the parent directory.
train_df, test_df = map(pd.read_csv, ('../train_df.csv', '../test_df.csv'))

# Cell result: (rows, columns) of each split.
(train_df.shape, test_df.shape)

((149995, 2), (49997, 2))

In [3]:
# Index *labels* of the first row for each distinct value of column 'X'.
train_idx = train_df.drop_duplicates('X').index
test_idx = test_df.drop_duplicates('X').index

# .index holds labels, so select with the label-based .loc. The positional .iloc
# only gives the same rows while the frame still has its default 0..n-1 index.
# reset_index(drop=True) renumbers the surviving rows from 0 and returns a new
# frame, so no in-place mutation is needed.
train_df = train_df.loc[train_idx].reset_index(drop=True)
test_df = test_df.loc[test_idx].reset_index(drop=True)

# Cell result: shapes after de-duplication.
train_df.shape, test_df.shape

((146182, 2), (49157, 2))

In [4]:
# Strip every run of characters that is not an ASCII letter, a digit, a Hangul
# syllable (가-힣) or whitespace. The pattern is a raw string so that `\s` reaches
# the regex engine as written; in a normal string literal it is an invalid escape
# sequence that newer Python versions warn about. Compiled once, used for both splits.
NON_TEXT_PATTERN = re.compile(r'[^A-Za-z0-9가-힣\s]+')

train_df['X'] = train_df['X'].apply(lambda text: NON_TEXT_PATTERN.sub('', text))
test_df['X'] = test_df['X'].apply(lambda text: NON_TEXT_PATTERN.sub('', text))

In [5]:
# Tokenize each cleaned text; every 'token' entry is whatever mecab.morphs returns for that row.
train_df['token'] = train_df['X'].apply(mecab.morphs)
test_df['token'] = test_df['X'].apply(mecab.morphs)

In [6]:
# Keep only rows whose token list has more than 5 entries, then renumber the rows from 0.
train_df = train_df[train_df['token'].apply(len) > 5].reset_index(drop=True)
test_df = test_df[test_df['token'].apply(len) > 5].reset_index(drop=True)

In [7]:
# Vocabulary: token -> integer id, built from the training split only.
# Ids 0 and 1 are reserved for the two special tokens; every other token gets
# the next free id in first-seen order (2, 3, 4, ...).
# NOTE: this is a defaultdict(int), so *indexing* it with a missing key returns 0
# and inserts that key; use `in` / `.get()` for lookups that must not add entries.
word_to_idx_dict = defaultdict(int)
word_to_idx_dict['<pad>'] = 0
word_to_idx_dict['<unk>'] = 1

for sentence_tokens in train_df['token'].tolist():
    for tok in sentence_tokens:
        if tok not in word_to_idx_dict:
            # Current size == next unused id, because ids are dense from 0.
            word_to_idx_dict[tok] = len(word_to_idx_dict)

In [8]:
# Convert token lists to id lists.
# word_to_idx_dict is a defaultdict(int): indexing it with a token that is not in
# the vocabulary would return 0 (the '<pad>' id) and also insert that token into
# the dict. Look up with .get() instead, so unseen tokens map to the '<unk>' id
# and the vocabulary is left unchanged.
unk_idx = word_to_idx_dict['<unk>']

train_df['num_token'] = train_df['token'].apply(
    lambda tokens: [word_to_idx_dict.get(tok, unk_idx) for tok in tokens]
)
test_df['num_token'] = test_df['token'].apply(
    lambda tokens: [word_to_idx_dict.get(tok, unk_idx) for tok in tokens]
)

In [9]:
# Write both processed frames to CSV in the working directory (row index omitted).
for frame, csv_name in ((train_df, "korean_mr_train_df.csv"), (test_df, "korean_mr_test_df.csv")):
    frame.to_csv(csv_name, index=False)

# Pickle the token -> id vocabulary alongside them.
with open('korean_mr_word_to_idx_dict.pickle', 'wb') as vocab_file:
    pickle.dump(word_to_idx_dict, vocab_file)

In [10]:
def padding(num_token, max_len=20, pad_idx=0):
    """Return a new list of exactly ``max_len`` ids built from ``num_token``.

    Sequences longer than ``max_len`` are cut off at the end; shorter ones are
    right-padded with ``pad_idx``. The input is never modified, so the caller's
    list stays as it was.

    Args:
        num_token: sequence of token ids.
        max_len: target length of the result (default 20).
        pad_idx: id appended to short sequences (default 0).

    Returns:
        A new list of length ``max_len``.
    """
    # Slicing copies, so the padding below never touches the caller's list.
    clipped = list(num_token[:max_len])
    return clipped + [pad_idx] * (max_len - len(clipped))

In [11]:
# Bring every id sequence to the fixed length produced by padding().
train_df['num_token'] = train_df['num_token'].apply(padding)
test_df['num_token'] = test_df['num_token'].apply(padding)

In [12]:
# Convert the id sequences ('num_token') and the 'y' column of each split into tensors.
train_X, train_y = (torch.tensor(train_df[col].tolist()) for col in ('num_token', 'y'))
test_X, test_y = (torch.tensor(test_df[col].tolist()) for col in ('num_token', 'y'))

In [13]:
# Serialize the four tensors into the working directory, one file each.
for tensor, file_name in (
    (train_X, 'train_X.pt'),
    (train_y, 'train_y.pt'),
    (test_X, 'test_X.pt'),
    (test_y, 'test_y.pt'),
):
    torch.save(tensor, file_name)