In [None]:
!pip install transformers
!pip install simpletransformers
!pip install scikit-learn
!pip install wandb

!echo "--- Disk ---"
!df -h
!echo ""
!echo "--- CPU ---"
!cat /proc/cpuinfo
!echo ""
!echo "--- Memory ---"
!cat /proc/meminfo
!echo ""
!echo "--- GPU ---"
!nvidia-smi -L

# Mount Google Drive with input data
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)
from sklearn.model_selection import train_test_split
import numpy as np
import sklearn.metrics as metrics
import pandas as pd
import random as rand
import wandb

# initialize for deterministic results
seed = 0
rand.seed(seed)

# load data
path = '/content/drive/My Drive/Colab Notebooks/Liter/correlations/corresult4.csv'
data = pd.read_csv(path, sep = ',')
data = data.sample(frac=1, random_state=seed)
data.columns = ['dataid', 'datapath', 'nrrows', 'nrvals1', 'nrvals2', 
                'type1', 'type2', 'column1', 'column2', 'method',
                'coefficient', 'pvalue', 'time']

# divide data into subsets
pearson = data[data['method']=='pearson']
spearman = data[data['method']=='spearman']
theilsu = data[data['method']=='theilsu']

# generate and print data statistics
nr_ps = len(pearson.index)
nr_sm = len(spearman.index)
nr_tu = len(theilsu.index)
print(f'#Samples for Pearson: {nr_ps}')
print(f'#Samples for Spearman: {nr_sm}')
print(f'#Samples for Theil\'s u: {nr_tu}')

# |coefficient>0.5| -> label 1
def coefficient_label(row):
  if abs(row['coefficient']) > 0.5:
    return 1
  else:
    return 0
pearson['label'] = pearson.apply(coefficient_label, axis=1)
spearman['label'] = spearman.apply(coefficient_label, axis=1)
theilsu['label'] = theilsu.apply(coefficient_label, axis=1)

rc_p = len(pearson[pearson['label']==1].index)/nr_ps
rc_s = len(spearman[spearman['label']==1].index)/nr_sm
rc_u = len(theilsu[theilsu['label']==1].index)/nr_tu
print(f'Ratio correlated - Pearson: {rc_p}')
print(f'Ratio correlated - Spearman: {rc_s}')
print(f'Ratio correlated - Theil\s u: {rc_u}')

# split data into training and test set
def def_split(data):
  x_train, x_test, y_train, y_test = train_test_split(
      pearson[['column1', 'column2']], pearson['label'],
      test_size=0.2, random_state=seed)
  train = pd.concat([x_train, y_train], axis=1)
  test = pd.concat([x_test, y_test], axis=1)
  return train, test

def ds_split(data):
  counts = data['dataid'].value_counts()
  print(f'Counts: {counts}')
  print(f'Count.index: {counts.index}')
  print(f'Count.index.values: {counts.index.values}')
  print(f'counts.shape: {counts.shape}')
  print(f'counts.iloc[0]: {counts.iloc[0]}')
  nr_vals = len(counts)
  nr_test_ds = int(nr_vals * 0.2)
  print(f'Nr. test data sets: {nr_test_ds}')
  ds_ids = counts.index.values.tolist()
  print(type(ds_ids))
  print(ds_ids)
  test_ds = rand.sample(ds_ids, nr_test_ds)
  print(f'TestDS: {test_ds}')
  def is_test(row):
    if row['dataid'] in test_ds:
      return True
    else:
      return False
  data['istest'] = data.apply(is_test, axis=1)
  train = data[data['istest'] == False]
  test = data[data['istest'] == True]
  print(f'train.shape: {train.shape}')
  print(f'test.shape: {test.shape}')
  print(train)
  print(test)
  return train[['column1', 'column2', 'label']], test[['column1', 'column2', 'label']]

train, test = ds_split(pearson)
train.columns = ['text_a', 'text_b', 'labels']
test.columns = ['text_a', 'text_b', 'labels']
print(train.head())
print(test.head())

model_args = ClassificationArgs(num_train_epochs=10, train_batch_size=40,
                                overwrite_output_dir=True, manual_seed=seed,
                                evaluate_during_training=True, no_save=True,
                                wandb_project='CorrelationPredictionv1')
model = ClassificationModel("roberta", "roberta-base", weight=[1, 2],
                            use_cuda = True, args=model_args)
model.train_model(train_df=train, eval_df=test, acc=metrics.accuracy_score, 
    rec=metrics.recall_score, pre=metrics.precision_score, f1=metrics.f1_score)
wandb.join()
#output_dir='/content/drive/My Drive/Colab Notebooks/Liter/correlations/models'

#Samples for Pearson: 59935
#Samples for Spearman: 59935
#Samples for Theil's u: 119383


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Ratio correlated - Pearson: 0.3370985234003504
Ratio correlated - Spearman: 0.35636940018353214
Ratio correlated - Theil\s u: 0.5734484809394973
Counts: 3598    45
501     45
3348    45
1269    45
3316    45
        ..
4218     1
88       1
1975     1
1783     1
2037     1
Name: dataid, Length: 2764, dtype: int64
Count.index: Int64Index([3598,  501, 3348, 1269, 3316, 3156, 1013,  981, 2279,  294,
            ...
              42,  303, 3269,  399, 2542, 4218,   88, 1975, 1783, 2037],
           dtype='int64', length=2764)
Count.index.values: [3598  501 3348 ... 1975 1783 2037]
counts.shape: (2764,)
counts.iloc[0]: 45
Nr. test data sets: 552
<class 'list'>
[3598, 501, 3348, 1269, 3316, 3156, 1013, 981, 2279, 294, 629, 597, 565, 533, 422, 3412, 469, 486, 518, 550, 582, 614, 2004, 1972, 1062, 3303, 3335, 3605, 3380, 3444, 3494, 3396, 982, 2871, 1189, 630, 598, 566, 534, 502, 470, 3300, 3332, 3364, 3428, 3476, 4276, 3460, 3492, 3524, 3556, 1637, 3812, 3876, 3604, 3572, 3540, 3508, 3573, 35

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


train.shape: (47748, 15)
test.shape: (12187, 15)
        dataid  ... istest
16651      418  ...  False
40486      611  ...  False
227113    4011  ...  False
227179    4011  ...  False
74345     1269  ...  False
...        ...  ...    ...
52620      832  ...  False
170584    3360  ...  False
211543    3702  ...  False
112420    2165  ...  False
122579    2403  ...  False

[47748 rows x 15 columns]
        dataid  ... istest
231947    4120  ...   True
136744    2700  ...   True
236252    4207  ...   True
84506     1436  ...   True
119070    2323  ...   True
...        ...  ...    ...
138084    2722  ...   True
236484    4211  ...   True
168691    3346  ...   True
108631    2089  ...   True
17089      421  ...   True

[12187 rows x 15 columns]
        text_a        text_b  labels
16651      JUL           FEB       1
40486   src_ip  num_pkts_out       0
227113       4             2       1
227179       7             5       1
74345       Vp     Gamma-ray       0
                  text_a   

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

  0%|          | 0/47748 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running Epoch 0 of 10:   0%|          | 0/1194 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/1194 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/1194 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/1194 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/1194 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/1194 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/1194 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/1194 [00:00<?, ?it/s]

In [11]:
import torch
from simpletransformers.classification import ClassificationModel
import sklearn.metrics as metrics

model = ClassificationModel('roberta', '/content/drive/My Drive/Colab Notebooks/Liter/correlations/models/checkpoint-865-epoch-1')
result, outputs, failures = model.eval_model(
    test, acc=metrics.accuracy_score, rec=metrics.recall_score, 
    pre=metrics.precision_score, f1=metrics.f1_score)
print(result)
test_samples = []
for idx, r in test.iterrows():
  test_samples.append([r['text_a'], r['text_b']])
pred = model.predict(test_samples)
test['pred'] = pred[0]
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)
print(test)

  0%|          | 0/9050 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1132 [00:00<?, ?it/s]

{'mcc': 0.5505156735364627, 'tp': 2046, 'tn': 5197, 'fp': 675, 'fn': 1132, 'acc': 0.8003314917127072, 'rec': 0.643801132787917, 'pre': 0.7519294377067255, 'f1': 0.6936768943888794, 'eval_loss': 0.4113608933113147}


  0%|          | 0/9050 [00:00<?, ?it/s]

  0%|          | 0/1132 [00:00<?, ?it/s]

                                                   text_a          text_b  labels  pred
56                                               Vehicles              ID       0     0
110                                                  Year           Month       0     0
118                                 Residential Customers           Month       0     0
121                                 Residential Customers            Year       0     0
126     Residential Customers with Arrears Greater tha...           Month       0     0
...                                                   ...             ...     ...   ...
182653                                              Theft         Robbery       1     1
182656                                              Theft  Street_robbery       1     1
182659                                              Theft          Injury       1     1
182662                                              Theft     Agg_assault       1     1
182665                          