In [1]:
import torch

embeddings = torch.load('embeddings.pt')
print(len(embeddings))
print(len(embeddings[0]))
print(type(embeddings[0][0]))

7100
17
<class 'torch.Tensor'>


In [2]:
import pandas as pd

df_train = pd.read_csv('ECHR_Dataset/train.csv')
df_train = df_train[['text','label']]
df_train.head(10)

Unnamed: 0,text,label
0,7. On 28 September 1994 the applicant's husban...,0
1,8. The applicant was born in 1974 and lives in...,0
2,"5. The first applicant, Mr Ivan Dvořáček, was ...",1
3,4. The applicant was born in 1959 and lives in...,1
4,6. The applicant was born in 1946.7. On 14 Aug...,1
5,6. The applicant was born in 1939 and lives in...,1
6,4. The applicant lives in Vrlika.5. He is a se...,0
7,8. Mr lives at Höchst in Vorarlberg.9. On 7 No...,1
8,5. The applicant was born in 1961 and is curre...,1
9,"The applicant, Curtis Francis Warren, is a Uni...",0


In [3]:
import numpy as np
avg_embeddings = {}
for i in range(len(embeddings)):
    avg_embeddings[i] = torch.mean(torch.stack(embeddings[i], dim=0).reshape(-1,768), dim=0)

In [4]:
#cat_embeddings = {}
#for i in range(len(embeddings)):
#    cat_embeddings[i] = torch.cat(embeddings[i], dim=-1)

In [5]:
# check all the avgembeddings have the same shape
print(avg_embeddings[0].shape)
for i in range(len(avg_embeddings)):
    if avg_embeddings[i].shape != avg_embeddings[0].shape:
        print(avg_embeddings[i].shape)
        
        

torch.Size([768])


In [6]:
df_train["embedding"] = [ avg_embeddings[i] for i in range(len(avg_embeddings)) ]
df_train.head(10)

Unnamed: 0,text,label,embedding
0,7. On 28 September 1994 the applicant's husban...,0,"[tensor(-0.3084), tensor(-0.2308), tensor(-0.3..."
1,8. The applicant was born in 1974 and lives in...,0,"[tensor(-0.6178), tensor(0.0757), tensor(0.229..."
2,"5. The first applicant, Mr Ivan Dvořáček, was ...",1,"[tensor(-0.4491), tensor(0.0542), tensor(0.306..."
3,4. The applicant was born in 1959 and lives in...,1,"[tensor(-0.8093), tensor(-0.5928), tensor(-0.6..."
4,6. The applicant was born in 1946.7. On 14 Aug...,1,"[tensor(-0.4574), tensor(-0.0582), tensor(0.27..."
5,6. The applicant was born in 1939 and lives in...,1,"[tensor(-0.4897), tensor(-0.2227), tensor(-0.6..."
6,4. The applicant lives in Vrlika.5. He is a se...,0,"[tensor(-0.3480), tensor(-0.0416), tensor(-0.0..."
7,8. Mr lives at Höchst in Vorarlberg.9. On 7 No...,1,"[tensor(-0.5193), tensor(-0.1316), tensor(-0.0..."
8,5. The applicant was born in 1961 and is curre...,1,"[tensor(-0.7300), tensor(-0.3040), tensor(0.02..."
9,"The applicant, Curtis Francis Warren, is a Uni...",0,"[tensor(-0.5262), tensor(-0.3986), tensor(-0.5..."


In [7]:
type(df_train['embedding'])

pandas.core.series.Series

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm

# Define the neural network model
class NN(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(768, 16),
            nn.ReLU(),
            #nn.Dropout(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.flatten(x)
        x = self.linear_relu_stack(x)
        return x

# Assuming df_train['embedding'] contains embeddings as NumPy arrays and df_train['label'] contains corresponding labels

# Convert pandas series to numpy arrays
X = np.vstack(df_train['embedding'])
y = df_train['label'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=23)

# Perform feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Move tensors to GPU if available
device = torch.device('mps')
X_train_tensor = X_train_tensor.to(device)
X_val_tensor = X_val_tensor.to(device)
X_test_tensor = X_test_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)
y_val_tensor = y_val_tensor.to(device)
y_test_tensor = y_test_tensor.to(device)

# Define hyperparameters
input_size = X_train.shape[1]
hidden_size1 = 128
hidden_size2 = 64
num_classes = len(np.unique(y_train))

# Initialize the model
model = NN().to(device)
print(model)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.01)

# Training the model
num_epochs = 20
batch_size = 32

best_val_accuracy = 0.0
for epoch in range(num_epochs):
    model.train()  # Set the model to train mode
    loop = tqdm(enumerate(range(0, len(X_train_tensor), batch_size)), total=len(X_train_tensor)//batch_size)
    for i, batch_start in loop:
        inputs = X_train_tensor[batch_start:batch_start+batch_size]
        targets = y_train_tensor[batch_start:batch_start+batch_size]

        # Forward pass
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Evaluate on validation set
    model.eval()
    with torch.no_grad():
        outputs = model(X_val_tensor)
        loss = criterion(outputs, y_val_tensor)
        predicted = (outputs > 0.5).long()  # Convert probabilities to labels
        val_accuracy = torch.sum(predicted == y_val_tensor).item() / len(y_val_tensor)
    
    print(f'Validation Accuracy after epoch {epoch+1}: {val_accuracy:.4f}')
    print(f'Validation Loss after epoch {epoch+1}: {loss.item():.4f}')

    # Save the model with the best validation accuracy
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_model_state = model.state_dict()

# Load the best model state
model.load_state_dict(best_model_state)

# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    predicted = (outputs > 0.5).long()  # Convert probabilities to labels

# Print classification report
print(classification_report(y_test_tensor.cpu().numpy(), predicted.cpu().numpy()))


NN(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=768, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
    (5): Sigmoid()
  )
)


  0%|          | 0/142 [00:00<?, ?it/s]

: 

In [None]:
import matplotlib.pyplot as plt
# Compute the class distribution
class_distribution = df_train['label'].value_counts()

# Print the class distribution
print("Class Distribution:")
print(class_distribution)

# Plot the class distribution
class_distribution.plot(kind='bar', title='Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()