In [29]:
import pandas as pd
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm
import os
import joblib

In [20]:
df = pd.read_csv('/kaggle/input/diplomacy/train_df.csv')

df.head()

Unnamed: 0,messages,sender_labels,receiver_labels,speakers,receivers,absolute_message_index,relative_message_index,seasons,years,game_score,game_score_delta,players,game_id,politeness,negative_sentiment,neutral_sentiment,positive_sentiment,vader_score
0,"Tsk tsk, I told you I was right",False,NOANNOTATION,france,germany,492,117,Winter,1909,7,4,"germany,france",6,3.185824,0.0,1.0,0.0,0.0
1,"Yeah, something tells me that player may be a ...",True,True,england,france,63,8,Spring,1901,3,0,"england,france",1,3.535946,0.0,0.8,0.2,0.926
2,Regardless of which way it falls in the end.,True,True,england,germany,1477,279,Spring,1906,5,0,"germany,england",1,3.207375,0.0,1.0,0.0,0.0
3,Austria is going to hedgehog though so I doubt...,False,True,italy,russia,133,9,Spring,1901,3,-1,"italy,russia",7,3.171482,0.0,1.0,0.0,-0.5009
4,I talked to Austria,True,True,germany,england,1363,94,Fall,1902,5,0,"germany,england",2,3.239397,0.0,1.0,0.0,0.0


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [22]:
train_df = pd.read_csv("/kaggle/input/diplomacy/train_df.csv")
val_df = pd.read_csv("/kaggle/input/diplomacy/val_df.csv")
test_df = pd.read_csv("/kaggle/input/diplomacy/test_df.csv")


In [23]:

x_train = train_df["messages"].tolist()
y_train = train_df["sender_labels"].astype(int).tolist()

x_val = val_df["messages"].tolist()
y_val = val_df["sender_labels"].astype(int).tolist()

x_test = test_df["messages"].tolist()
y_test = test_df["sender_labels"].astype(int).tolist()


In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.to(device)
bert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [25]:
def get_bert_embeddings(texts, tokenizer, model, max_len=128):
    embeddings = []
    for text in tqdm(texts):
        tokens = tokenizer(text, padding='max_length', truncation=True, max_length=max_len, return_tensors='pt')
        tokens = {k: v.to(device) for k, v in tokens.items()}
        with torch.no_grad():
            output = model(**tokens)
        cls_embedding = output.last_hidden_state[:, 0, :]  # [CLS] token
        embeddings.append(cls_embedding.squeeze().cpu().numpy())
    return torch.tensor(embeddings).numpy()

In [26]:
x_train_vec = get_bert_embeddings(x_train, tokenizer, bert_model)
x_val_vec = get_bert_embeddings(x_val, tokenizer, bert_model)
x_test_vec = get_bert_embeddings(x_test, tokenizer, bert_model)

100%|██████████| 12102/12102 [02:05<00:00, 96.71it/s]
  return torch.tensor(embeddings).numpy()
100%|██████████| 1729/1729 [00:17<00:00, 96.42it/s]
100%|██████████| 3458/3458 [00:36<00:00, 94.90it/s]


In [27]:
clf = LogisticRegression(max_iter=1000)
clf.fit(x_train_vec, y_train)


In [30]:
os.makedirs("saved_models", exist_ok=True)
joblib.dump(clf, "saved_models/logistic_regression_bert.joblib")
joblib.dump((x_train_vec, y_train), "saved_models/train_embeddings.joblib")
joblib.dump((x_val_vec, y_val), "saved_models/val_embeddings.joblib")
joblib.dump((x_test_vec, y_test), "saved_models/test_embeddings.joblib")

['saved_models/test_embeddings.joblib']

In [31]:
val_preds = clf.predict(x_val_vec)
test_preds = clf.predict(x_test_vec)

print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print("Test Accuracy:", accuracy_score(y_test, test_preds))
print("\nClassification Report:\n", classification_report(y_test, test_preds))


Validation Accuracy: 0.9421631000578369
Test Accuracy: 0.9517061885482938

Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.05      0.09       162
           1       0.96      1.00      0.98      3296

    accuracy                           0.95      3458
   macro avg       0.67      0.52      0.53      3458
weighted avg       0.93      0.95      0.93      3458



In [None]:
# from transformers import AutoTokenizer

# # Use BERT base or any variant you prefer
# model_name = "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# def tokenize(example):
#     return tokenizer(example["messages"], truncation=True, padding="max_length", max_length=128)

# train_dataset = train_dataset.map(tokenize, batched=True)
# val_dataset = val_dataset.map(tokenize, batched=True)
# test_dataset = test_dataset.map(tokenize, batched=True)

# # Set format for PyTorch
# train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'sender_labels'])
# val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'sender_labels'])
# test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'sender_labels'])


In [None]:
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer('all-MiniLM-L6-v2')
# embeddings = model.encode(df['messages'].tolist(), convert_to_tensor=True)


In [None]:
# from transformers import AutoTokenizer, AutoModel

# tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# model = AutoModel.from_pretrained("roberta-base")


In [None]:
# import gensim.downloader as api

# model = api.load("glove-wiki-gigaword-100")
# embedding = model['hello']  # Get word vector
