In [None]:
# Install the third-party packages into the runtime. No versions are pinned,
# so each run picks up whatever release pip resolves at that time.
!pip install transformers
!pip install simpletransformers
!pip install scikit-learn

# Print a short report of the machine this notebook is running on
# (disk usage, CPU details, memory details, and the list of visible GPUs).
!echo "--- Disk ---"
!df -h
!echo ""
!echo "--- CPU ---"
!cat /proc/cpuinfo
!echo ""
!echo "--- Memory ---"
!cat /proc/meminfo
!echo ""
!echo "--- GPU ---"
!nvidia-smi -L

# Mount Google Drive with input data
# (the data-loading cell reads its input from below /content/drive).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)
from sklearn.model_selection import train_test_split
import pandas as pd
import random as rand

# Read the comma-separated correlation sample from the mounted Drive folder
path = '/content/drive/My Drive/Colab Notebooks/Liter/correlations/corsample.txt'
data = pd.read_csv(path, sep=',')

# Overwrite the parsed header with the eleven field names used below
data.columns = [
    'dataid', 'datapath', 'method',
    'nrrows', 'nrvals1', 'nrvals2',
    'type1', 'type2',
    'column1', 'column2', 'correlation',
]
# prepare data for training/testing
def get_label(row, threshold=0.5):
  """Turn a row's correlation value into a binary class label.

  Args:
    row: Mapping-like object (e.g. a DataFrame row) with a 'correlation'
      entry holding a number.
    threshold: Cut-off applied to the absolute correlation. Defaults to 0.5,
      the value that used to be hard-coded.

  Returns:
    1 if the absolute correlation is strictly greater than ``threshold``,
    otherwise 0. A NaN correlation compares as not-greater and yields 0.
  """
  # The sign is irrelevant: strong negative correlation is labelled 1 as well.
  return 1 if abs(row['correlation']) > threshold else 0
# Keep only the rows computed with the Pearson method. The explicit .copy()
# makes `pearson` an independent frame, so adding the 'label' column below
# does not raise pandas' SettingWithCopyWarning and cannot depend on whether
# the boolean filter handed back a view or a copy of `data`.
pearson = data[data['method']=='pearson'].copy()
# Binary target, computed row by row with get_label
pearson['label'] = pearson.apply(get_label, axis=1)
print(pearson[['column1', 'column2', 'label']])

# split data into training and test set
def def_split(data):
  """Randomly split the rows of ``data`` into a training and a test frame.

  The split is made on individual rows (80% train / 20% test) with a fixed
  ``random_state`` of 42, so repeated calls on the same frame give the same
  partition.

  Args:
    data: DataFrame with the columns 'column1', 'column2' and 'label'.

  Returns:
    Tuple ``(train, test)`` of DataFrames, each holding the columns
    'column1', 'column2' and 'label'.
  """
  # Split the frame that was passed in; the previous version ignored the
  # argument and always split the module-level `pearson` frame.
  x_train, x_test, y_train, y_test = train_test_split(
      data[['column1', 'column2']], data['label'],
      test_size=0.2, random_state=42)
  # Re-attach the labels so each split is a single frame again.
  train = pd.concat([x_train, y_train], axis=1)
  test = pd.concat([x_test, y_test], axis=1)
  return train, test

def ds_split(data, test_fraction=0.2, seed=None):
  """Split ``data`` so that all rows of one dataset land in the same split.

  A random subset of the distinct 'dataid' values is chosen as test
  datasets; every row whose 'dataid' is in that subset goes to the test
  frame, all other rows go to the training frame. The input frame is not
  modified.

  Args:
    data: DataFrame with the columns 'dataid', 'column1', 'column2' and
      'label'.
    test_fraction: Share of the distinct datasets to hold out. The number of
      test datasets is ``int(n_datasets * test_fraction)``, so it rounds
      down and may be 0 for small inputs (the test frame is then empty).
    seed: Optional seed for a private random generator. With the default
      ``None`` the shared ``random`` module state is used, exactly as before.

  Returns:
    Tuple ``(train, test)`` of DataFrames restricted to the columns
    'column1', 'column2' and 'label'.
  """
  counts = data['dataid'].value_counts()
  # The dataset ids are the *index* of value_counts(); its values are the
  # per-dataset row counts. The previous version collected the counts here,
  # so test membership was checked against the wrong numbers.
  dataset_ids = counts.index
  nr_vals = len(dataset_ids)
  nr_test_ds = int(nr_vals * test_fraction)

  sampler = rand if seed is None else rand.Random(seed)
  test_idx = sampler.sample(range(0, nr_vals), nr_test_ds)
  test_ds = [dataset_ids[i] for i in test_idx]

  # A local boolean mask replaces the helper 'istest' column that used to be
  # written into the caller's frame.
  is_test = data['dataid'].isin(test_ds)
  out_cols = ['column1', 'column2', 'label']
  train = data.loc[~is_test, out_cols].copy()
  test = data.loc[is_test, out_cols].copy()
  return train, test

# Seed Python's `random` module before splitting: ds_split draws its test
# datasets from that module, so without this the split (and therefore every
# reported metric) changes from run to run even though the model itself is
# created with manual_seed=42.
rand.seed(42)
train, test = ds_split(pearson)

# Rename the columns to text_a / text_b / labels for the classifier
# (presumably the sentence-pair input naming simpletransformers expects;
# confirm against the installed version).
train.columns = ['text_a', 'text_b', 'labels']
test.columns = ['text_a', 'text_b', 'labels']
print(train.head())
print(test.head())

# Ten epochs, batches of 20, fixed seed; an existing output directory may be
# overwritten.
model_args = ClassificationArgs(num_train_epochs=10, train_batch_size=20,
                                overwrite_output_dir=True, manual_seed=42)
# use_cuda=True requests a GPU runtime for the roberta-base model.
model = ClassificationModel("roberta", "roberta-base",
                            use_cuda=True, args=model_args)
# Set after construction, as in the original cell.
model.args.no_save = True
model.train_model(train)

In [None]:
# Evaluate the trained model on the test split and print the first returned
# value; in the recorded output it is a dict with the keys mcc, tp, tn, fp,
# fn and eval_loss. `outputs` and `failures` are bound here but not used in
# this cell.
result, outputs, failures = model.eval_model(test)
print(result)

  0%|          | 0/188 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/24 [00:00<?, ?it/s]

{'mcc': 0.4046093775115696, 'tp': 62, 'tn': 70, 'fp': 25, 'fn': 31, 'eval_loss': 1.9532377347944323}
