In [1]:
!pip install -q gensim

In [None]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import joblib


import gensim.downloader as api
from tqdm import tqdm
import re
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
df = pd.read_csv('/kaggle/input/diplomacy/train_df.csv')

df.head()

In [None]:
train_df = pd.read_csv("/kaggle/input/diplomacy/train_df.csv")
val_df = pd.read_csv("/kaggle/input/diplomacy/val_df.csv")
test_df = pd.read_csv("/kaggle/input/diplomacy/test_df.csv")


In [None]:
X_train_text = train_df['messages'].fillna("")
y_train = train_df['sender_labels'].astype(int)

X_val_text = val_df['messages'].fillna("")
y_val = val_df['sender_labels'].astype(int)

X_test_text = test_df['messages'].fillna("")
y_test = test_df['sender_labels'].astype(int)

In [None]:
glove = api.load('glove-wiki-gigaword-100')

def tokenize(text):
    return re.findall(r'\b\w+\b', text.lower())

def get_glove_embedding(text):
    tokens = tokenize(text)
    vectors = [glove[word] for word in tokens if word in glove]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(glove.vector_size)

In [None]:
X_train = np.array([get_glove_embedding(msg) for msg in tqdm(X_train_text)])
X_val = np.array([get_glove_embedding(msg) for msg in tqdm(X_val_text)])
X_test = np.array([get_glove_embedding(msg) for msg in tqdm(X_test_text)])

In [None]:
np.save("glove_X_train.npy", X_train)
np.save("glove_X_val.npy", X_val)
np.save("glove_X_test.npy", X_test)


In [15]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_val)
print("🔹 Logistic Regression:")
print(classification_report(y_val, y_pred, digits=3))

joblib.dump(logreg, "logreg_model.joblib")


🔹 Logistic Regression:
              precision    recall  f1-score   support

           0      0.000     0.000     0.000        99
           1      0.943     0.999     0.970      1630

    accuracy                          0.942      1729
   macro avg      0.471     0.500     0.485      1729
weighted avg      0.889     0.942     0.915      1729



['logreg_model.joblib']

In [16]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
print("🔹 Random Forest:")
print(classification_report(y_val, y_pred, digits=3))

# Save
joblib.dump(rf, "rf_model.joblib")


🔹 Random Forest:
              precision    recall  f1-score   support

           0      0.500     0.020     0.039        99
           1      0.944     0.999     0.970      1630

    accuracy                          0.943      1729
   macro avg      0.722     0.509     0.505      1729
weighted avg      0.918     0.943     0.917      1729



['rf_model.joblib']

In [17]:
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_val)
print("🔹 SVM:")
print(classification_report(y_val, y_pred, digits=3))

joblib.dump(svm, "svm_model.joblib")


🔹 SVM:
              precision    recall  f1-score   support

           0      0.000     0.000     0.000        99
           1      0.943     1.000     0.971      1630

    accuracy                          0.943      1729
   macro avg      0.471     0.500     0.485      1729
weighted avg      0.889     0.943     0.915      1729



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['svm_model.joblib']

In [18]:
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

X_train_torch = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_torch = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1).to(device)
train_loader = DataLoader(TensorDataset(X_train_torch, y_train_torch), batch_size=64, shuffle=True)

X_val_torch = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_torch = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1).to(device)

mlp_torch = MLP(X_train.shape[1]).to(device)
optimizer = optim.Adam(mlp_torch.parameters(), lr=1e-3)
criterion = nn.BCELoss()

for epoch in range(10):
    mlp_torch.train()
    for xb, yb in train_loader:
        optimizer.zero_grad()
        preds = mlp_torch(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")


mlp_torch.eval()
with torch.no_grad():
    preds = (mlp_torch(X_val_torch) > 0.5).int().cpu().numpy()
    print("🔹 MLP (PyTorch):")
    print(classification_report(y_val, preds, digits=3))

torch.save(mlp_torch.state_dict(), "mlp_torch_model.pt")


Epoch 1: Loss = 0.0678
Epoch 2: Loss = 0.0529
Epoch 3: Loss = 0.5440
Epoch 4: Loss = 0.0546
Epoch 5: Loss = 0.0357
Epoch 6: Loss = 0.0691
Epoch 7: Loss = 0.0664
Epoch 8: Loss = 0.0341
Epoch 9: Loss = 0.0346
Epoch 10: Loss = 0.0449
🔹 MLP (PyTorch):
              precision    recall  f1-score   support

           0      0.000     0.000     0.000        99
           1      0.943     1.000     0.971      1630

    accuracy                          0.943      1729
   macro avg      0.471     0.500     0.485      1729
weighted avg      0.889     0.943     0.915      1729



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
