In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [2]:
#DATA_PATH = "/content/drive/MyDrive/04_nlp/data/"
DATA_PATH=os.getcwd()

SEED = 42

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device,DATA_PATH

('cuda', 'c:\\NLP_Papers_Review\\04_NLP')

In [3]:
df = pd.read_csv(f"{DATA_PATH}/data/imdb/imdb_dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


- 정답데이터

In [4]:
target = (df["sentiment"] == "positive").astype(int).to_numpy().reshape(-1,1)
target.shape

(50000, 1)

- 단어문서행렬

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
cnt_vec = CountVectorizer(stop_words="english", max_features=5000)
cnt_vec.fit(df["review"])

In [6]:
tdm = cnt_vec.transform(df["review"]).toarray()
tdm.shape

(50000, 5000)

# LDA

In [7]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(10, max_iter=10, random_state=SEED)

In [8]:
# lda_model.fit(tdm) # 엄청 오래걸림, 학습시 성능도 SVD 와 크게 차이가 없음
lda_model.fit(tdm)

In [9]:
import joblib
joblib.dump(lda_model, r'C:\NLP_Papers_Review\04_NLP\data\imdblda_model.pkl')

['C:\\NLP_Papers_Review\\04_NLP\\data\\imdblda_model.pkl']

# LSA

In [10]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_arr = scaler.fit_transform(tdm)

x_arr,x_arr.shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 (50000, 5000))

In [11]:
from sklearn.decomposition import TruncatedSVD
lsa_model = TruncatedSVD(1000, random_state=SEED)
x_train = lsa_model.fit_transform(x_arr)

x_train,x_train.shape

(array([[ 0.43501886, -0.05073099,  0.08929349, ..., -0.01275495,
          0.0365396 , -0.0677514 ],
        [ 0.25385959,  0.11886721, -0.05547474, ..., -0.04536412,
         -0.01319855, -0.00828486],
        [ 0.24155641, -0.02477861, -0.07607653, ..., -0.00273465,
          0.07295558,  0.02785948],
        ...,
        [ 0.23247196, -0.03561189,  0.0335833 , ..., -0.00274768,
          0.01791362,  0.00702074],
        [ 0.21583036,  0.11399629, -0.01325272, ...,  0.01932653,
         -0.01571707, -0.00824943],
        [ 0.3034225 , -0.18358744, -0.08325052, ...,  0.0070974 ,
          0.00333978,  0.00653294]]),
 (50000, 1000))

- 머신러닝 모델에 학습 시켜 교차검증 점수 확인해보기

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
cv = KFold(5, shuffle=True, random_state=SEED)

In [13]:
model = LogisticRegression(random_state=SEED)
scores = cross_val_score(model,x_train,target,cv=cv, scoring="accuracy", n_jobs=-1)
scores.mean()

0.8779

In [14]:
new=np.concatenate([x_train,x_arr], axis=1)
print(new.shape)


model = LogisticRegression(random_state=SEED)
scores = cross_val_score(model,new,target,cv=cv, scoring="accuracy", n_jobs=-1)
scores.mean()

(50000, 6000)


0.88386

In [15]:
np.unique(target)

array([0, 1])

In [16]:
target.shape

(50000, 1)

In [17]:
x_train.shape

(50000, 1000)

# 데이터셋 클래스

In [18]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, x, y=None):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        item = {}
        item["x"] = torch.Tensor(self.x[idx])
        if self.y is not None:
            item["y"] = torch.Tensor(self.y[idx])

        return item

# 모델 클래스

In [19]:
class Net(torch.nn.Module):
    def __init__(self, n_features, drop_rate = 0.5):
        super().__init__()
        self.seq = torch.nn.Sequential(
            torch.nn.Linear(n_features, 512),
            torch.nn.ELU(),
            torch.nn.Dropout(drop_rate),

            torch.nn.Linear(512, 256),
            torch.nn.ELU(),
            torch.nn.Dropout(drop_rate),

            torch.nn.Linear(256, 128),
            torch.nn.ELU(),
            torch.nn.Dropout(drop_rate),

            torch.nn.Linear(128, 64),
            torch.nn.ELU(),
            torch.nn.Dropout(drop_rate),

            torch.nn.Linear(64, 32),
            torch.nn.ELU(),
            torch.nn.Dropout(drop_rate),

            torch.nn.Linear(32, 1),
        )

    def forward(self, x):
        return self.seq(x)

# 학습 loop 함수

In [20]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    epoch_loss = 0
    model.train() # 학습 모드
    for batch in dataloader:
        pred = model( batch["x"].to(device) )
        loss = loss_fn( pred, batch["y"].to(device) )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)
    return epoch_loss

# 검증 또는 테스트 loop 함수

In [21]:
@torch.no_grad()
def test_loop(dataloader, model, loss_fn, device):
    epoch_loss = 0
    pred_list = []
    act_func = torch.nn.Sigmoid()
    model.eval() # 평가 모드
    for batch in dataloader:
        pred = model( batch["x"].to(device) )
        if batch.get("y") is not None:
            loss = loss_fn( pred, batch["y"].to(device) )
            epoch_loss += loss.item()

        pred = act_func(pred) # logit 값을 확률로 변환
        pred = pred.to("cpu").numpy() # cpu 이동후 ndarray 로변환
        pred_list.append(pred)

    epoch_loss /= len(dataloader)
    pred = np.concatenate(pred_list)
    return epoch_loss, pred

# 하이퍼 파라미터 정의

In [22]:
batch_size = 32 # 배치 사이즈
loss_fn = torch.nn.BCEWithLogitsLoss() # 손실 객체
epochs = 100 # 최대 가능한 에폭수
n_splits = 5 # cv 에서 K 개수
n_features = x_train.shape[1] # 피처 개수

# 학습

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
DATA_PATH

'c:\\NLP_Papers_Review\\04_NLP'

In [25]:
is_holdout = False
reset_seeds(SEED) # 재현을 위해 시드고정
best_score_list = []
for i, (tri, vai) in enumerate( cv.split(x_train) ):
    # 학습용 데이터로더 객체
    train_dt = ReviewDataset(x_train[tri], target[tri])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # 검증용 데이터로더 객체
    valid_dt = ReviewDataset(x_train[vai], target[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    # 모델 객체와 옵티마이저 객체 생성
    model = Net(n_features).to(device)
    optimizer = torch.optim.Adam( model.parameters() )

    best_score = 0 # 현재 최고 점수
    patience = 0 # 조기 종료 조건을 주기 위한 변수
    for epoch in tqdm(range(epochs)):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)

        pred = (pred > 0.5).astype(int)
        score = accuracy_score(target[vai], pred)

        #print(train_loss, valid_loss, score)
        
        if score > best_score:
            best_score = score # 최고 점수 업데이트
            patience = 0
            torch.save(model.state_dict(), f"{DATA_PATH}/data/imdb/"+f"model_svd_{i}.pth") # 최고 점수 모델 가중치 저장

        patience += 1
        if patience == 5:
            break

    print(f"{i}번째 폴드 최고 ACC: {best_score}")
    best_score_list.append(best_score)
    if is_holdout:
        break

  0%|          | 0/100 [00:00<?, ?it/s]

0번째 폴드 최고 ACC: 0.8764


  0%|          | 0/100 [00:00<?, ?it/s]

1번째 폴드 최고 ACC: 0.8768


  0%|          | 0/100 [00:00<?, ?it/s]

2번째 폴드 최고 ACC: 0.8766


  0%|          | 0/100 [00:00<?, ?it/s]

3번째 폴드 최고 ACC: 0.8785


  0%|          | 0/100 [00:00<?, ?it/s]

4번째 폴드 최고 ACC: 0.8744


In [None]:
np.mean(best_score_list)

0.8754000000000002