In [3]:
import pandas as pd 
import torch
import re
from collections import defaultdict
import pickle

In [4]:
# Read the raw train / test review splits from CSV, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../english_movie_review_train.csv',
        "../../english_movie_review_test.csv",
    )
)

# Displayed as the cell output: (rows, columns) of each split.
train_df.shape, test_df.shape

((40000, 2), (10000, 2))

In [5]:
# Remove rows whose review text ('X') duplicates an earlier row (the first
# occurrence is kept) and renumber the surviving rows 0..n-1.
#
# The de-duplicated frame is used directly rather than taking its index labels
# and feeding them to the positional `.iloc` indexer: labels and positions only
# coincide while the frame still has its default RangeIndex, so the label ->
# `.iloc` round trip would silently select the wrong rows on any other index.
train_df = train_df.drop_duplicates('X').reset_index(drop=True)
test_df = test_df.drop_duplicates('X').reset_index(drop=True)

# Displayed as the cell output: shapes after de-duplication.
train_df.shape , test_df.shape

((39728, 2), (9986, 2))

In [6]:
# Delete every run of characters that is not an ASCII letter, a digit, a Hangul
# syllable (가-힣) or whitespace. Punctuation is removed outright (not replaced
# by a space), so e.g. "don't" becomes "dont".
#
# The pattern is a raw string: in a normal string literal "\s" is an invalid
# escape sequence, which Python reports as a DeprecationWarning / SyntaxWarning.
# The compiled pattern is the same regular expression as before.
non_text_pattern = re.compile(r'[^A-Za-z0-9가-힣\s]+')

train_df['X'] = train_df['X'].apply(lambda x : non_text_pattern.sub('', x))
test_df['X'] = test_df['X'].apply(lambda x : non_text_pattern.sub('', x))

In [7]:
# Tokenise each cleaned review by splitting on a single space character.
# NOTE: split(" ") (unlike split()) yields empty-string tokens for consecutive
# spaces and does not treat tabs/newlines as separators.
for frame in (train_df, test_df):
    frame['token'] = frame['X'].map(lambda text: text.split(" "))

In [8]:
# Keep only the reviews that have more than 10 tokens, then renumber the
# remaining rows 0..n-1 so later positional/label access stays aligned.
train_df = train_df[train_df['token'].map(len) > 10].reset_index(drop=True)
test_df = test_df[test_df['token'].map(len) > 10].reset_index(drop=True)

In [9]:
# Vocabulary mapping token -> integer id, built from the TRAIN split only.
# Ids 0 and 1 are reserved for the padding and unknown-word markers.
# NOTE: this is a defaultdict(int), so reading a missing key with [] inserts
# that key with the value 0.
word_to_idx_dict = defaultdict(int)
word_to_idx_dict.update({'<pad>': 0, '<unk>': 1})

for tokens in train_df['token']:
    for word in tokens:
        if word not in word_to_idx_dict:
            # The next free id equals the current vocabulary size
            # (2 for the first real word, then 3, 4, ...).
            word_to_idx_dict[word] = len(word_to_idx_dict)

In [10]:
# Convert every token list into a list of vocabulary ids.
#
# Look-ups go through .get() with the '<unk>' id as the fallback. Indexing the
# defaultdict(int) directly would map any word that is not in the training
# vocabulary to 0 (the '<pad>' id) and would also insert that word into the
# vocabulary as a side effect, polluting the dictionary that is pickled later.
unk_idx = word_to_idx_dict['<unk>']

train_df['num_token'] = train_df['token'].apply(lambda x : [word_to_idx_dict.get(i, unk_idx) for i in x])
test_df['num_token'] = test_df['token'].apply(lambda x : [word_to_idx_dict.get(i, unk_idx) for i in x])

In [11]:
# Persist both processed frames as CSV (without the index column) ...
for frame, csv_name in (
    (train_df, "english_mr_train_df.csv"),
    (test_df, "english_mr_test_df.csv"),
):
    frame.to_csv(csv_name, index=False)

# ... and the token -> id vocabulary as a pickle.
with open('english_mr_word_to_idx_dict.pickle', 'wb') as vocab_file:
    pickle.dump(word_to_idx_dict, vocab_file)

In [12]:
# Summary statistics of the number of token ids per training review.
train_df['num_token'].map(len).describe()

count    39720.000000
mean       231.129204
std        171.143503
min         11.000000
25%        126.000000
50%        173.000000
75%        280.000000
max       2278.000000
Name: num_token, dtype: float64

In [13]:
def padding(num_token, max_len=100, pad_value=0) :
    """Return a new list holding ``num_token`` fitted to exactly ``max_len`` items.

    Sequences longer than ``max_len`` are truncated on the right; shorter ones
    are right-padded with ``pad_value``. The input sequence is never modified
    (an in-place ``+=`` would extend the caller's list as a side effect).

    Parameters
    ----------
    num_token : sequence of int
        Token ids of one review.
    max_len : int, default 100
        Length of the returned list.
    pad_value : int, default 0
        Id appended to sequences shorter than ``max_len``.

    Returns
    -------
    list
        A fresh list of length ``max_len``.
    """
    # Slicing copies at most max_len items, so the caller's list stays untouched.
    clipped = list(num_token[:max_len])
    # The multiplier is 0 when the sequence already fills max_len.
    return clipped + [pad_value] * (max_len - len(clipped))

In [14]:
# Pad / truncate every id sequence in both splits with padding().
for frame in (train_df, test_df):
    frame['num_token'] = frame['num_token'].apply(padding)

In [15]:
# Turn each split into a pair of tensors: the token-id lists from 'num_token'
# become the inputs (X) and the values of the 'y' column become the targets.
train_X, train_y = (
    torch.tensor(train_df[column].tolist()) for column in ('num_token', 'y')
)
test_X, test_y = (
    torch.tensor(test_df[column].tolist()) for column in ('num_token', 'y')
)

In [16]:
# Write the four tensors to disk, one .pt file each.
# NOTE(review): these files use the 'english_mv_' prefix while the CSV / pickle
# artifacts saved by this notebook use 'english_mr_' - confirm this is intended.
for tensor, filename in (
    (train_X, 'english_mv_train_X.pt'),
    (train_y, 'english_mv_train_y.pt'),
    (test_X, 'english_mv_test_X.pt'),
    (test_y, 'english_mv_test_y.pt'),
):
    torch.save(tensor, filename)