In [None]:
!pip install transformers
!pip install simpletransformers
!pip install scikit-learn

!echo "--- Disk ---"
!df -h
!echo ""
!echo "--- CPU ---"
!cat /proc/cpuinfo
!echo ""
!echo "--- Memory ---"
!cat /proc/meminfo
!echo ""
!echo "--- GPU ---"
!nvidia-smi -L

# Mount Google Drive with input data
from google.colab import drive
drive.mount('/content/drive')

In [60]:
from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)
from sklearn.model_selection import train_test_split
import pandas as pd
import random as rand

# load data
path = '/content/drive/My Drive/Colab Notebooks/Liter/correlations/corsample.txt'
data = pd.read_csv(path, sep = ',')
data.columns = ['dataid', 'datapath', 'method', 'nrrows', 'nrvals1', 'nrvals2', 
                'type1', 'type2', 'column1', 'column2', 'correlation']

# prepare data for training/testing
def get_label(row):
  if abs(row['correlation']) > 0.5:
    return 1
  else:
    return 0
pearson = data[data['method']=='pearson']
pearson['label'] = pearson.apply(get_label, axis=1)
print(pearson[['column1', 'column2', 'label']])

# split data into training and test set
def def_split(data):
  x_train, x_test, y_train, y_test = train_test_split(
      pearson[['column1', 'column2']], pearson['label'], 
      test_size=0.2, random_state=42)
  train = pd.concat([x_train, y_train], axis=1)
  test = pd.concat([x_test, y_test], axis=1)
  return train, test

def ds_split(data):
  counts = data['dataid'].value_counts()
  nr_vals = len(counts)
  print(counts)
  print(counts.shape)
  print(counts.iloc[0])
  nr_test_ds = int(nr_vals * 0.2)
  test_idx = rand.sample(range(0,nr_vals), nr_test_ds)
  test_ds = [counts.iloc[i] for i in test_idx]
  def is_test(row):
    if row['dataid'] in test_ds:
      return True
    else:
      return False
  data['istest'] = data.apply(is_test, axis=1)
  train = data[data['istest'] == False]
  test = data[data['istest'] == True]
  return train[['column1', 'column2', 'label']], test[['column1', 'column2', 'label']]

train, test = ds_split(pearson)
train.columns = ['text_a', 'text_b', 'labels']
test.columns = ['text_a', 'text_b', 'labels']
print(train.head())
print(test.head())

model_args = ClassificationArgs(num_train_epochs=10, train_batch_size=20,
                                overwrite_output_dir=True, manual_seed=42)
model = ClassificationModel("roberta", "roberta-base", 
                            use_cuda = True, args=model_args, )
model.args.no_save = True
model.train_model(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


                                                column1  ... label
0                                              location  ...     0
1                                              location  ...     0
2                                                  size  ...     0
3                                                  size  ...     0
4                                                  size  ...     0
...                                                 ...  ...   ...
4820  Percentage of unemployed studying ads in job s...  ...     1
4821  Percentage of unemployed studying ads in job s...  ...     1
4822  Percentage of unemployed studying ads in job s...  ...     1
4823  Percentage of unemployed studying ads in job s...  ...     1
4824  Percentage of unemployed studying ads in job s...  ...     1

[1621 rows x 3 columns]
22    276
21    153
20    105
43     91
45     91
2      91
33     91
39     78
48     78
47     78
53     66
52     66
51     66
40     66
1      35
35     28
41     21
3

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

  0%|          | 0/1433 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/72 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/72 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/72 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/72 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/72 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/72 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/72 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/72 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/72 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/72 [00:00<?, ?it/s]

(720, 0.1451617690329941)

In [61]:
result, outputs, failures = model.eval_model(test)
print(result)

  0%|          | 0/188 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/24 [00:00<?, ?it/s]

{'mcc': 0.4046093775115696, 'tp': 62, 'tn': 70, 'fp': 25, 'fn': 31, 'eval_loss': 1.9532377347944323}
