In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three competition CSVs in one pass: training set, test set, sample submission.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/contradictory-my-dear-watson/train.csv',
        '/kaggle/input/contradictory-my-dear-watson/test.csv',
        '/kaggle/input/contradictory-my-dear-watson/sample_submission.csv',
    )
)

In [None]:
# Display the training DataFrame using the notebook's rich repr.
train_df

In [None]:
# Count how many training rows there are for each value of the 'language' column.
train_df['language'].value_counts()

In [None]:
!pip install transformers


In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification

# Load the pretrained BERT tokenizer and a 3-label sequence classification model
# from the same checkpoint, so the vocabulary and the weights always match.
checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = TFBertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)


In [None]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

# Extract the 'label' column of the training frame as a NumPy array.
train_labels = train_df['label'].to_numpy()

# Data encoding
def encode_sentences(data, tokenizer, max_length=128):
    """Tokenize premise/hypothesis pairs into fixed-length model inputs.

    Args:
        data: DataFrame with 'premise' and 'hypothesis' columns; each row is
            encoded as one sentence pair.
        tokenizer: Object exposing ``encode_plus(text, text_pair, **kwargs)``
            that returns a mapping with 'input_ids' and 'attention_mask'.
        max_length: Length every pair is padded or truncated to. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of NumPy arrays with one entry
        per row of ``data``, in row order.
    """
    input_ids, attention_masks = [], []
    # Iterate the two columns in lockstep; this avoids iterrows(), which
    # materializes a Series for every row just to read two fields.
    for premise, hypothesis in zip(data['premise'], data['hypothesis']):
        encoded = tokenizer.encode_plus(premise, hypothesis, add_special_tokens=True,
                                        max_length=max_length, padding='max_length',
                                        return_attention_mask=True, return_tensors='tf',
                                        truncation=True)
        # Assumes the returned values carry a leading batch axis of size 1
        # (one pair was encoded); [0] selects that single sequence.
        input_ids.append(encoded['input_ids'][0])
        attention_masks.append(encoded['attention_mask'][0])
    return np.array(input_ids), np.array(attention_masks)

# Build the encoded datasets for the training and test frames.
train_input_ids, train_attention_masks = encode_sentences(data=train_df, tokenizer=tokenizer)
test_input_ids, test_attention_masks = encode_sentences(data=test_df, tokenizer=tokenizer)

# Split the encoded training data into training and validation sets:
# 20% is held out, with a fixed seed so the split is reproducible.
(X_train_ids, X_val_ids,
 X_train_masks, X_val_masks,
 y_train, y_val) = train_test_split(
    train_input_ids,
    train_attention_masks,
    train_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# Compile the model: Adam at a 3e-5 learning rate, with sparse categorical
# cross-entropy computed directly on the model's logits.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

# Train the model for 4 epochs in batches of 32, passing the validation split
# as validation_data.
history = model.fit(
    x=[X_train_ids, X_train_masks],
    y=y_train,
    epochs=4,
    batch_size=32,
    validation_data=([X_val_ids, X_val_masks], y_val),
)



In [None]:
# Run the model on the encoded test set to obtain predictions
predictions = model.predict([test_input_ids, test_attention_masks])

# Convert the predicted logits to class indices (0, 1, 2) by taking the argmax along axis 1
predicted_labels = np.argmax(predictions['logits'], axis=1)
predicted_labels
# Print evaluation results (only applicable when labels exist for the test data)
#from sklearn.metrics import accuracy_score
#true_labels = ...  # actual labels for the test data (hypothetical)
#print("Accuracy:", accuracy_score(true_labels, predicted_labels))


In [None]:
# Print the number of logit rows (one per scored test example), then display the raw output.
# The logits are read by the 'logits' key, the same access used when computing
# predicted_labels, so this does not depend on positional indexing of the output.
print(len(predictions['logits']))
predictions

In [None]:
# Display the array of predicted class indices for the test set.
predicted_labels

In [None]:
# Load the competition's sample submission file and display it.
sample= pd.read_csv('/kaggle/input/contradictory-my-dear-watson/sample_submission.csv')
sample

In [None]:
# Count occurrences of each value in the sample submission's 'prediction' column.
sample['prediction'].value_counts()

In [None]:
# Build the submission frame, pairing each test id with its predicted class.
# The ids come from test_df, the frame loaded from test.csv and the one whose
# rows were encoded and scored.
submission = pd.DataFrame({'id': test_df['id'], 'prediction': predicted_labels})
submission

In [None]:
# Write the submission frame to submission.csv (relative path), omitting the index column.
submission.to_csv("submission.csv", index=False)