In [None]:
import warnings
warnings.filterwarnings( 'ignore' )

import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

from utils.utils import sentence_pairs_generation
from utils.sentence_transformer_config import Parameters

In [None]:
# Get parameters first: the seed should come from the same configuration
# object that the evaluation cell reads (`args.seed`), so every stochastic
# step of the notebook agrees on one value.
args = Parameters()

# Fall back to 42 when the configuration does not provide a seed
seed = getattr(args, 'seed', None)
if seed is None:
    seed = 42

# Seed every random number generator used in this notebook
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Seed all GPUs, not only the current device (ignored when CUDA is unavailable)
torch.cuda.manual_seed_all(seed)
# When running on the CuDNN backend, two further options must be set
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Set a fixed value for the hash seed.
# NOTE: string hashing of the running interpreter is fixed at start-up, so
# this only takes effect in processes launched from this kernel.
os.environ["PYTHONHASHSEED"] = str(seed)
# Allow reduced internal precision for float32 matrix multiplications
torch.set_float32_matmul_precision('medium')

### Data

In [None]:
# Load dataset ('&'-separated file; the 'text' and 'label' columns are used below)
df = pd.read_csv(args.dataset_path, sep='&')

# Split Train/Test datasets, keeping the label proportions in both parts.
# An explicit random_state makes the split identical every time this cell
# runs, instead of depending on how much of the global RNG was consumed.
df_train, df_test = train_test_split(
    df,
    test_size=args.test_size,
    shuffle=True,
    stratify=df['label'],
    random_state=seed,
)

# Reset indices (drop=True discards the old index instead of materialising
# it as an 'index' column that then has to be removed again)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Test texts and labels as plain Python lists
testX = df_test['text'].values.tolist()
testY = df_test['label'].values.tolist()

In [None]:
# Number of instances drawn from each class: a fraction of the smallest class
number_training = int( df_train['label'].value_counts().min() * args.number_training_percentage )
if number_training < 1:
    # int() truncates towards zero; an empty training set would only fail
    # later, in a far less obvious place.
    raise ValueError(
        'number_training_percentage is too small: no training instance would be sampled per class'
    )

# Equal samples per class training.
# Iterating over the groups covers every label present in the data (for a
# 0/1 dataset: class 0 first, then class 1) instead of hard-coding the labels.
df_train_sample = pd.concat([
    class_rows.sample(number_training, random_state=seed)
    for _, class_rows in df_train.groupby('label')
])
trainX = df_train_sample['text'].values.tolist()
trainY = df_train_sample['label'].values.tolist()

# Create positive & negative pairs; each iteration receives the examples
# collected so far and returns the updated collection
train_examples = []
for _ in range(args.number_of_iterations):
    train_examples = sentence_pairs_generation(np.array(trainX), np.array(trainY), train_examples)

# Create training loader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=args.batch_size)

### Train Sentence-Transformer

In [None]:
# Create Sentence-Transformer
model = SentenceTransformer(args.model_name)

# Set loss according to the configured name
if args.loss_function == 'CosineSimilarityLoss':
    train_loss = losses.CosineSimilarityLoss(model=model)
elif args.loss_function == 'ContrastiveLoss':
    train_loss = losses.ContrastiveLoss(model=model)
elif args.loss_function == 'OnlineContrastiveLoss':
    train_loss = losses.OnlineContrastiveLoss(model=model)
elif args.loss_function == 'SoftmaxLoss':
    # SoftmaxLoss adds a classification head on top of the embeddings, so it
    # must be told the embedding size and the number of classes; calling it
    # with the model alone raises a TypeError.
    # The training pairs are either positive or negative -> 2 classes.
    # NOTE(review): this loss expects integer class labels on the pairs -
    # confirm what sentence_pairs_generation emits.
    train_loss = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=2,
    )
else:
    # ValueError is still an Exception, so existing handlers keep working
    raise ValueError(f'Not known loss: {args.loss_function!r}')



In [None]:
# Fine-tune the Sentence-Transformer on the generated sentence pairs.
# All training settings are collected in one place before the call.
fit_settings = dict(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=args.epochs,
    # Learning-rate schedule
    scheduler='WarmupLinear',
    warmup_steps=10,
    # Optimizer
    optimizer_class=torch.optim.AdamW,
    optimizer_params={'lr': args.learning_rate},
    weight_decay=0.001,
    # Persistence / reporting
    output_path='sentence_transformer',
    save_best_model=True,
    show_progress_bar=True,
)
model.fit(**fit_settings)

### Evaluation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Embed the training and test texts with the fine-tuned model
trainX_embeddings, testX_embeddings = model.encode(trainX), model.encode(testX)

# Fit a Random Forest on the training embeddings and predict the test labels
RF = RandomForestClassifier(n_jobs=-1, random_state=args.seed).fit(trainX_embeddings, trainY)
pred = RF.predict(testX_embeddings)

# Per-class precision / recall / F1 on the held-out test set
print(classification_report(testY, pred))

In [None]:
import umap

# Setup dimensionality reduction model: project the embeddings to 2-D for
# plotting. The notebook-wide `seed` is used instead of a second hard-coded
# value so that all stochastic steps share one seed.
umap_model = umap.UMAP(n_neighbors=15,
                       n_components=2,
                       metric='cosine',
                       random_state=seed)

# Learn the projection on the training embeddings only, then apply the same
# mapping to the test embeddings
trainX_umap_embeddings = umap_model.fit_transform(trainX_embeddings)
testX_umap_embeddings = umap_model.transform(testX_embeddings)

In [None]:
def _scatter_by_label(ax, embeddings, labels, title):
    """Scatter the first two columns of `embeddings` on `ax`, one series per label.

    Labels are visited in sorted order, so a given label gets the same colour
    in every panel that contains the same set of labels.
    """
    labels = np.asarray(labels)
    for label in np.unique(labels):
        mask = labels == label
        ax.scatter(embeddings[mask, 0], embeddings[mask, 1], label=str(label))
    ax.set_title(title)
    # Without a legend the `label=` passed to scatter is never displayed
    ax.legend(title='label')


# Training instances on the left, testing instances on the right
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(10, 3))
_scatter_by_label(ax_train, trainX_umap_embeddings, trainY, 'Training instances')
_scatter_by_label(ax_test, testX_umap_embeddings, testY, 'Testing instances')
fig.tight_layout()
plt.show()