In [298]:
# pair stance classification
import pandas as pd
import os, sys
import importlib
import numpy as np
sys.path.append("..")


import experimenter
from experimenter import evaluation
from experimenter.utils import modeling, training, data, text
importlib.reload(experimenter)
importlib.reload(text)
importlib.reload(modeling)
importlib.reload(evaluation)
importlib.reload(training)
importlib.reload(data)

configs = {"epochs": 5,
           "experiment_output_path": "final_training.json", 
           "log_interval": 1, "model_path": "model.state", 
           "processor": {
                "module": "experimenter.utils.data",
                "class": "PairStanceProvider",
                "params":{"input_path": "/Users/jkhouja/workspace/repo/arabic_media/data/pairs/batch_0_to_15000_pairs_eq_weight_random.csv",
                          "seq_len": {'inp':[50,50], 'label':[1],'mask':[1]}, "batch_size": 4, "splits": [.1, .2, .1, .6], "drop_last":"False",  "shuffle": "True", "vocab_path": "vocab.json"}},
           "model":{
                "module": "experimenter.utils.modeling",
                "class": "RNNPairModel",
                "params":{"embedding_dim": 8, "hidden_dim": 50, "num_classes": 3, "dropout": 0, "max_seq_len": {"eval": 1, "value": "config['processor']['params']['seq_len']['inp'][0]"}}},
           
            "evaluator":{
                "module": "experimenter.evaluation",
                "class": "ListEvaluator",
                "params": {"loss_f": [{"module": 'torch.nn', "class": 'CrossEntropyLoss', "params": {'reduction': 'none'}}]}},
            "optimizer":{
                "module": "torch.optim",
                "class": "Adam",
                "params":{
                    "params": {
                    "eval": 1,
                    "value": "config['model']['model'].parameters()"},
                'lr':  0.001}}
           }


configs2 = {"epochs": 5,
           "experiment_output_path": "final_training.json", 
           "log_interval": 1, "model_path": "model.state", 
           "processor": {
                "module": "experimenter.utils.data",
                "class": "PairStanceProvider",
                "params":{"input_path": "/Users/jkhouja/workspace/repo/arabic_media/data/pairs/batch_0_to_15000_pairs_eq_weight_random.csv",
                          "seq_len": {'inp':[50,50], 'label':[1],'mask':[1]}, "batch_size": 4, "splits": [.1, .2, .1, .6], "drop_last":"False",  "shuffle": "True", "vocab_path": "vocab.json"}},
           "model":{
                "module": "experimenter.utils.modeling",
                "class": "RNNPairModel",
                "params":{"embedding_dim": 8, "hidden_dim": 50, "num_classes": 3, "dropout": 0, "max_seq_len": {"eval": 1, "value": "config['processor']['params']['seq_len']['inp'][0]"}}},
           
            "evaluator":{
                "module": "experimenter.evaluation",
                "class": "ListEvaluator",
                "params": {"loss_f": [{"module": 'torch.nn', "class": 'CrossEntropyLoss', "params": {'reduction': 'none'}}]}},
            "optimizer":{
                "module": "torch.optim",
                "class": "Adam",
                "params":{
                    "params": {
                    "eval": 1,
                    "value": "config['model']['model'].parameters()"},
                'lr':  0.001}}
           }

#unlabeled_data = [{'inp':["يالت رجل", "قال رجلي"],'label':[['agree']],'mask':[1]}]*10


# Initialize two models
trainer_smart = training.BasicTrainer(configs)
trainer_random = training.BasicTrainer(configs2)

# Initialize data from file

val_data = trainer_smart.processor.get_data()[1]
test_data = trainer_smart.processor.get_data()[2]
initial_data = trainer_smart.processor.get_data()[0]

unlabeled_data = trainer_smart.processor.decode(trainer_smart.processor.data_raw[3], list_input=True)

res_smart = trainer_smart.train_model([initial_data, val_data, test_data])
res_random = trainer_random.train_model([initial_data, val_data, test_data])


Will create train, dev, test(s) splits
Total params: 12893
Will create train, dev, test(s) splits
Total params: 12893
Starting training:
Epoch: 1: Train loss (last batch): 1.0959333181381226, validation loss: 1.0971243381500244
Best model saved at: model.state
Epoch: 2: Train loss (last batch): 1.1127394437789917, validation loss: 1.0927706956863403
Best model saved at: model.state
Epoch: 3: Train loss (last batch): 1.2162951231002808, validation loss: 1.0798826217651367
Best model saved at: model.state
Epoch: 4: Train loss (last batch): 1.1723943948745728, validation loss: 1.0638456344604492
Best model saved at: model.state
Epoch: 5: Train loss (last batch): 1.0028953552246094, validation loss: 1.0499619245529175
Best model saved at: model.state
Test loss: 1.0245028734207153
Starting training:
Epoch: 1: Train loss (last batch): 1.0776337385177612, validation loss: 1.0983085632324219
Best model saved at: model.state
Epoch: 2: Train loss (last batch): 1.1173642873764038, validation loss

In [300]:
# Predict on unlabled data
def get_ent(preds):
    res = []
    for example in preds:
        ent = 0
        for p in example:
            ent += -p * np.log2(p)
        res.append(ent)
        
    return res

import random
top_n = 200
num_steps = range(10)
for step in num_steps:
    print("step {}".format(step))
    pred = trainer_smart.predict(unlabeled_data, decode=True)

    outs = [o for p in pred for o in p['meta']]
    ent = get_ent(outs)
    ent = np.array(ent)
    print(ent.mean())
    idx = (-ent).argsort()[:top_n]
    #print("selected examples based on entropy: ")
    #print(idx)
    filtered = [unlabeled_data[s] for s in idx]
    random_d = [unlabeled_data[s] for s in random.sample(range(len(unlabeled_data)), top_n)]

    # Update remaining
    print("Remaining data before / after filtering the learned batch")
    print(len(unlabeled_data))
    unlabeled_data = [s for i, s in enumerate(unlabeled_data) if i not in idx]
    print(len(unlabeled_data))

    #print(random)
    #print(filtered)

    #Retrain
    filtered_encoded = trainer_smart.processor(filtered, data_type='full', list_input= True, as_batches=True)
    random_encoded = trainer_smart.processor(random_d, data_type='full', list_input= True, as_batches=True)
    res_smart = trainer_smart.train_model([filtered_encoded, val_data, test_data])
    res_random = trainer_random.train_model([random_encoded, val_data, test_data])


step 0
1.4463678059057392
Remaining data before / after filtering the learned batch
5052
4852
Starting training:
Epoch: 11: Train loss (last batch): 1.0623605251312256, validation loss: 1.0581321716308594
Best model saved at: model.state
Epoch: 12: Train loss (last batch): 0.7871716618537903, validation loss: 1.0428305864334106
Best model saved at: model.state
Epoch: 13: Train loss (last batch): 1.1186940670013428, validation loss: 1.0667556524276733
Epoch: 14: Train loss (last batch): 1.3207297325134277, validation loss: 1.088484525680542
Epoch: 15: Train loss (last batch): 1.049099326133728, validation loss: 1.0591458082199097
Test loss: 1.0410854816436768
Starting training:
Epoch: 11: Train loss (last batch): 0.9175735712051392, validation loss: 1.0657908916473389
Best model saved at: model.state
Epoch: 12: Train loss (last batch): 1.0900952816009521, validation loss: 1.0391796827316284
Best model saved at: model.state
Epoch: 13: Train loss (last batch): 0.9136359691619873, validati

Best model saved at: model.state
Test loss: 0.9901295900344849
Starting training:
Epoch: 41: Train loss (last batch): 1.2643591165542603, validation loss: 1.0451629161834717
Best model saved at: model.state
Epoch: 42: Train loss (last batch): 1.170201063156128, validation loss: 1.052954912185669
Epoch: 43: Train loss (last batch): 0.956839919090271, validation loss: 1.0390136241912842
Best model saved at: model.state
Epoch: 44: Train loss (last batch): 1.3444180488586426, validation loss: 1.0519754886627197
Epoch: 45: Train loss (last batch): 0.9354345798492432, validation loss: 1.061521053314209
Test loss: 1.0727882385253906
step 7
1.3193084428125657
Remaining data before / after filtering the learned batch
3652
3452
Starting training:
Epoch: 46: Train loss (last batch): 0.9979891777038574, validation loss: 1.013409972190857
Best model saved at: model.state
Epoch: 47: Train loss (last batch): 1.0384202003479004, validation loss: 1.0270806550979614
Epoch: 48: Train loss (last batch): 0

In [320]:
res_smart['results']

{'during_training': {'1': {'train_loss': 1.0959333181381226,
   'val_loss': 1.0971243381500244},
  '2': {'train_loss': 1.1127394437789917, 'val_loss': 1.0927706956863403},
  '3': {'train_loss': 1.2162951231002808, 'val_loss': 1.0798826217651367},
  '4': {'train_loss': 1.1723943948745728, 'val_loss': 1.0638456344604492},
  '5': {'train_loss': 1.0028953552246094, 'val_loss': 1.0499619245529175},
  '6': {'train_loss': 1.090139627456665, 'val_loss': 1.0430418252944946},
  '7': {'train_loss': 0.9486572742462158, 'val_loss': 1.050020694732666},
  '8': {'train_loss': 1.1977633237838745, 'val_loss': 1.0444697141647339},
  '9': {'train_loss': 1.0246580839157104, 'val_loss': 1.0404095649719238},
  '10': {'train_loss': 0.6556094288825989, 'val_loss': 1.0469732284545898},
  '11': {'train_loss': 1.0623605251312256, 'val_loss': 1.0581321716308594},
  '12': {'train_loss': 0.7871716618537903, 'val_loss': 1.0428305864334106},
  '13': {'train_loss': 1.1186940670013428, 'val_loss': 1.0667556524276733},
 

In [171]:
res_random['results']

{'during_training': {'1': {'train_loss': 0.9932723045349121,
   'val_loss': 1.1460973024368286},
  '2': {'train_loss': 1.8655883073806763, 'val_loss': 0.9856749773025513},
  '3': {'train_loss': 0.5568922758102417, 'val_loss': 0.9643186330795288},
  '4': {'train_loss': 0.8071258664131165, 'val_loss': 0.9565287828445435},
  '5': {'train_loss': 0.5423922538757324, 'val_loss': 0.9556002020835876},
  '6': {'train_loss': 0.9349740743637085, 'val_loss': 0.9569883942604065},
  '7': {'train_loss': 1.3021361827850342, 'val_loss': 0.9552294015884399},
  '8': {'train_loss': 0.726514458656311, 'val_loss': 0.9544289708137512},
  '9': {'train_loss': 1.0989142656326294, 'val_loss': 0.955574095249176},
  '10': {'train_loss': 0.4896788001060486, 'val_loss': 0.955385684967041},
  '11': {'train_loss': 1.331813097000122, 'val_loss': 0.955652117729187},
  '12': {'train_loss': 1.322219729423523, 'val_loss': 0.956355631351471},
  '13': {'train_loss': 0.5277446508407593, 'val_loss': 0.9499547481536865},
  '14'

In [319]:
ent.argmax()
ent[idx[100]]
ent[ent.argmax()]
unlabeled_data[idx[100]]
trainer_random.predict([unlabeled_data[idx[100]], unlabeled_data[idx[100]]])

[{'inp': ['مواطن فرنسي يواجه المحاكمة في القدس لتهريبه أسلحة ',
   'فرنسي يفلت من المحاكمة في القدس<PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>'],
  'label': [['disagree']],
  'mask': [1],
  'out': [[-0.16341207921504974, 0.45698094367980957, -0.5681448578834534]],
  'pred': [['other']],
  'meta': [[0.28354206681251526, 0.527291476726532, 0.18916653096675873]]},
 {'inp': ['مواطن فرنسي يواجه المحاكمة في القدس لتهريبه أسلحة ',
   'فرنسي يفلت من المحاكمة في القدس<PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>'],
  'label': [['disagree']],
  'mask': [1],
  'out': [[-0.16341206431388855, 0.45698079466819763, -0.5681449174880981]],
  'pred': [['other']],
  'meta': [[0.28354209661483765, 0.527291476726532, 0.18916653096675873]]}]