In [None]:
!pip install transformers
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertTokenizerFast, TFBertForSequenceClassification, RobertaTokenizer, TFRobertaForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split


# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV files loaded later in this notebook are read from
# /content/ directly, not from the mount point -- confirm the mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the training and test CSVs, then report how many rows each one holds.
traindata, testdata = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

for caption, frame in (("훈련용 개수:", traindata), ("테스트용 개수:", testdata)):
    print(caption, len(frame))


훈련용 개수: 192000
테스트용 개수: 48000


In [None]:
# Clean the data: remove duplicated texts and rows with missing values.

# Training set: drop rows whose `text` repeats an earlier row, then drop every
# row that contains at least one null.
traindata = (
    traindata
    .drop_duplicates(subset=['text'])
    .dropna(how='any')
)

# Test set: only rows containing a null are dropped (no de-duplication).
testdata = testdata.dropna(how='any')

print('훈련 데이터 수:', len(traindata))
print('테스트 데이터 수:', len(testdata))


훈련 데이터 수: 186983
테스트 데이터 수: 48000


In [None]:
# Integer-encode the text with the pretrained RoBERTa tokenizer.
tokenizer = RobertaTokenizer.from_pretrained('textattack/roberta-base-MNLI')

X_train_list = traindata['text'].tolist()
X_test_list = testdata['text'].tolist()
y_train = traindata['sentiment'].tolist()

# Hold out 20% of the training rows as a validation set. The split is seeded
# so the validation set -- and therefore the validation metrics that drive
# early stopping -- is the same on every Restart & Run All.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    X_train_list,
    y_train,
    test_size=.2,
    random_state=42,
)

# Tokenize each split. padding=True pads every sequence to the longest one in
# the list being tokenized, and truncation=True clips sequences that exceed
# the model's maximum input length.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Show the first tokenized example.
print(train_encodings['input_ids'][0])  # token ids
print(train_encodings['attention_mask'][0])  # attention mask

[0, 605, 3277, 7, 213, 192, 2014, 7416, 53, 24, 12905, 29, 45, 816, 4558, 583, 259, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

"\n# 소문자 변환 모델 토크나이저\ntokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\nprint(tokenizer.tokenize())\n\nmodel. = TFBertModel.from_pretrained('bert-base-uncased', from_pt=True)\n"

In [None]:
# Build the tf.data datasets and load the classifier.


def _as_dataset(encodings, labels=None):
    """Wrap tokenizer output (plus labels, when given) in a tf.data.Dataset."""
    features = dict(encodings)
    if labels is None:
        return tf.data.Dataset.from_tensor_slices(features)
    return tf.data.Dataset.from_tensor_slices((features, labels))


# Labelled datasets used for fitting and validation.
train_dataset = _as_dataset(train_encodings, train_labels)
val_dataset = _as_dataset(val_encodings, val_labels)

# Three-class sequence classifier initialised from the PyTorch checkpoint.
model = TFRobertaForSequenceClassification.from_pretrained(
    "textattack/roberta-base-MNLI", num_labels=3, from_pt=True
)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# Unlabelled dataset for inference on the test texts.
test_encodings = tokenizer(X_test_list, truncation=True, padding=True)
test_dataset = _as_dataset(test_encodings)


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [None]:
# Compile, fine-tune with early stopping, then predict on the test set.

# Single source of truth for the batch size used by every dataset below.
BATCH_SIZE = 16

model.compile(
    optimizer=optimizer,
    # The loss is configured for raw logits (from_logits=True) and integer
    # class labels (sparse categorical cross-entropy).
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Stop when validation accuracy has not improved for 3 consecutive epochs and
# roll the model back to the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

# The datasets are batched explicitly, so `batch_size` is NOT passed to fit():
# Keras documents that it must not be specified for dataset inputs, and
# depending on the version it is either ignored or rejected.
# Only the training data is shuffled; shuffling the validation data would cost
# time without being needed for evaluation.
model.fit(
    train_dataset.shuffle(10000).batch(BATCH_SIZE),
    epochs=10,
    validation_data=val_dataset.batch(BATCH_SIZE),
    callbacks=[early_stopping],
)

predictions = model.predict(test_dataset.batch(BATCH_SIZE))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Reduce the model output to one predicted class index per test row: take the
# position of the largest logit along axis 1.
logits = predictions.logits
predicted_classes = np.argmax(logits, axis=1)

# Assemble the submission table: test ids alongside the predicted class.
submission_columns = {
    "id": testdata['id'],
    "sentiment": predicted_classes,
}
submission = pd.DataFrame(submission_columns)

# Write the table to CSV without the DataFrame index.
submission.to_csv('/content/roberta2.csv', index=False)