In [3]:
# pair stance classification
import pandas as pd
import os, sys
import importlib
import numpy as np
sys.path.append("..")
import copy


import experimenter
from experimenter import evaluation
from experimenter.utils import modeling, training, data, text, utils
# Re-import the project package and its submodules so that edits made to their
# source files take effect without restarting the kernel.
importlib.reload(experimenter)
importlib.reload(text)
importlib.reload(modeling)
importlib.reload(evaluation)
importlib.reload(training)
importlib.reload(data)
importlib.reload(utils)

import logging

# Experiment description for pair stance classification.
# Each component (processor, model, evaluator, optimizer) is given as a
# "module" / "class" / "params" triple.
# NOTE(review): entries shaped like {"eval": 1, "value": "<expression>"} are
# presumably evaluated by the trainer against the live config - confirm in the
# experimenter package.
# NOTE(review): "root_path" is an absolute, machine-specific location.
configs = {
    "epochs": 12,
    "experiment_name": "PairStanceClassification",
    "root_path": "/Users/jkhouja/workspace/experiments/arabic_media/",
    "experiment_output_file": "results.json",
    "model_path": "model.state",
    "log_interval": 1,
    "disable_gpu": False,
    "processor": {
        "module": "experimenter.utils.data",
        "class": "PairStanceProvider",
        "params": {
            "input_path": "data/batch_0_to_15000_pairs_sep__score_10_ngrams_2_3_4_5_6.csv",
            # two input sequences of length 40, one label, one mask entry
            "seq_len": {"inp": [40, 40], "label": [1], "mask": [1]},
            "batch_size": 4,
            "splits": [0.7, 0.2, 0.1],
            "drop_last": True,
            "shuffle": True,
            "vocab_path": "vocab.json",
        },
    },
    "model": {
        "module": "experimenter.utils.modeling",
        "class": "RNNPairModel",
        "params": {
            "embedding_dim": 10,
            "hidden_dim": 60,
            "num_classes": 3,
            "dropout": 0,
            "max_seq_len": {
                "eval": 1,
                "value": "config['processor']['params']['seq_len']['inp'][0]",
            },
        },
    },
    "evaluator": {
        "module": "experimenter.evaluation",
        "class": "ListEvaluator",
        "params": {
            "loss_f": [
                {
                    "module": "torch.nn",
                    "class": "CrossEntropyLoss",
                    "params": {"reduction": "none"},
                },
            ],
            "metrics_f": [
                {
                    "module": "experimenter.evaluation",
                    "class": "Accuracy",
                    "params": {},
                },
            ],
        },
    },
    "optimizer": {
        "module": "torch.optim",
        "class": "Adam",
        "params": {
            "params": {
                "eval": 1,
                "value": "config['model']['model'].parameters()",
            },
            "lr": 0.001,
        },
    },
}



# Experiment description for the language-modeling task.
# Each component (processor, model, evaluator, optimizer) is given as a
# "module" / "class" / "params" triple.
# NOTE(review): entries shaped like {"eval": 1, "value": "<expression>"} are
# presumably evaluated by the trainer against the live config - confirm in the
# experimenter package.
# NOTE(review): "root_path" and "input_path" are absolute, machine-specific
# locations.
lm_configs = {
    "epochs": 5,
    "experiment_name": "LanguageModeling",
    "root_path": "/Users/jkhouja/workspace/experiments/arabic_media/",
    "experiment_output_file": "results.json",
    "model_path": "model.state",
    "log_interval": 1,
    "disable_gpu": False,
    "processor": {
        "module": "experimenter.utils.data",
        "class": "LMProvider",
        "params": {
            "input_path": "/Users/jkhouja/workspace/repo/arabic_media/data/pairs/batch_0_to_15000_pairs_eq_weight_random.csv",
            "seq_len": {"inp": [50], "label": [50], "mask": [1]},
            "batch_size": 4,
            # four-way split, unlike the three-way split of the other configs
            "splits": [0.1, 0.2, 0.1, 0.6],
            # Real booleans, matching the other two configs. These used to be
            # the strings "False" / "True"; a non-empty string is always
            # truthy, so "False" could never switch drop_last off, and strict
            # bool checks reject strings outright.
            "drop_last": False,
            "shuffle": True,
            "vocab_path": "local/vocab.json",
        },
    },
    "model": {
        "module": "experimenter.utils.modeling",
        "class": "RNNLMModel",
        "params": {
            "embedding_dim": 8,
            "hidden_dim": 30,
            "dropout": 0,
            "max_seq_len": {
                "eval": 1,
                "value": "config['processor']['params']['seq_len']['inp'][0]",
            },
        },
    },
    "evaluator": {
        "module": "experimenter.evaluation",
        "class": "ListEvaluator",
        "params": {
            "loss_f": [
                {
                    "module": "torch.nn",
                    "class": "CrossEntropyLoss",
                    "params": {"reduction": "none"},
                },
            ],
        },
    },
    "optimizer": {
        "module": "torch.optim",
        "class": "Adam",
        "params": {
            "params": {
                "eval": 1,
                "value": "config['model']['model'].parameters()",
            },
            "lr": 0.001,
        },
    },
}

# Experiment description for the multi-task run (language modeling plus pair
# stance classification).
# The list-valued settings ("seq_len", "loss_f", "metrics_f") carry one entry
# per output: the first label has length 50, the second length 1.
# NOTE(review): entries shaped like {"eval": 1, "value": "<expression>"} are
# presumably evaluated by the trainer against the live config - confirm in the
# experimenter package.
# NOTE(review): "root_path" and "input_path" are absolute, machine-specific
# locations.
multi_configs = {
    "epochs": 25,
    "experiment_name": "MultiTaskModeling",
    "root_path": "/Users/jkhouja/workspace/experiments/arabic_media/",
    "experiment_output_file": "results.json",
    "model_path": "model.state",
    "log_interval": 1,
    "disable_gpu": False,
    "processor": {
        "module": "experimenter.utils.data",
        "class": "MultiLMPairProvider",
        "params": {
            "input_path": "/Users/jkhouja/workspace/repo/arabic_media/data/for training/batch_0_to_15000_pairs_eq_weight_random.csv",
            "seq_len": {"inp": [50, 50], "label": [50, 1], "mask": [1, 1]},
            "batch_size": 4,
            "splits": [0.7, 0.2, 0.1],
            "drop_last": True,
            "shuffle": True,
            "vocab_path": "local/vocab.json",
        },
    },
    "model": {
        "module": "experimenter.utils.modeling",
        "class": "RNNMultiLMPairModel",
        "params": {
            "embedding_dim": 20,
            "hidden_dim": 100,
            "dropout": 0,
            "lm_classes": {
                "eval": 1,
                "value": "config['processor']['params']['vocab_size']",
            },
            "num_classes": 3,
            "max_seq_len": {
                "eval": 1,
                "value": "config['processor']['params']['seq_len']['inp'][0]",
            },
        },
    },
    "evaluator": {
        "module": "experimenter.evaluation",
        "class": "ListEvaluator",
        "params": {
            "loss_f": [
                {
                    "module": "torch.nn",
                    "class": "CrossEntropyLoss",
                    "params": {"reduction": "none"},
                },
                {
                    "module": "torch.nn",
                    "class": "CrossEntropyLoss",
                    "params": {"reduction": "none"},
                },
            ],
            "metrics_f": [
                {
                    "module": "experimenter.evaluation",
                    "class": "Dummy",
                    "params": {},
                },
                {
                    "module": "experimenter.evaluation",
                    "class": "Accuracy",
                    "params": {},
                },
            ],
        },
    },
    "optimizer": {
        "module": "torch.optim",
        "class": "Adam",
        "params": {
            "params": {
                "eval": 1,
                "value": "config['model']['model'].parameters()",
            },
            "lr": 0.001,
        },
    },
}

#unlabeled_data = [{'inp':["يالت رجل", "قال رجلي"],'label':[['agree']],'mask':[1]}]*10



# Build one trainer from the multi-task config and train it.

logger = logging.getLogger()
# Deep copy so that multi_configs itself is left untouched if the trainer
# modifies the config it receives.
config =copy.deepcopy(multi_configs)
trainer_smart = training.BasicTrainer(config)

res_smart = trainer_smart.train_model()


KeyboardInterrupt: 

In [8]:
# Evaluate on train
# NOTE(review): assumes get_data() returns the splits with the training split
# first - confirm against the processor.
selected_data = trainer_smart.processor.get_data()
trainer_smart._evaluate_batches(selected_data[0])


[0.0, 0.6030372305682561]

In [14]:
# Another try with mask [1, 2] - more weight on stance classification.
# NOTE(review): the original note stopped mid-sentence ("By changing code in").
# multi_configs still carries mask [1, 1], so the change was presumably made in
# the library source - confirm where.
importlib.reload(training)
importlib.reload(data)
importlib.reload(evaluation)

import copy



config = copy.deepcopy(multi_configs)

trainer_smart = training.BasicTrainer(config)

res_smart = trainer_smart.train_model()


Will create train, dev, test(s) splits
Total params: 59104
Starting training:
Epoch: 1: Train loss (last batch): 195.17881774902344, validation metrics: 0.0,0.465675057208238
Epoch: 2: Train loss (last batch): 142.82186889648438, validation metrics: 0.0,0.5606407322654462
Epoch: 3: Train loss (last batch): 168.8651123046875, validation metrics: 0.0,0.6344393592677345
Epoch: 4: Train loss (last batch): 108.60322570800781, validation metrics: 0.0,0.6767734553775744
Epoch: 5: Train loss (last batch): 155.85281372070312, validation metrics: 0.0,0.6842105263157895
Epoch: 6: Train loss (last batch): 131.98126220703125, validation metrics: 0.0,0.6956521739130435
Epoch: 7: Train loss (last batch): 119.71443176269531, validation metrics: 0.0,0.7105263157894737
Epoch: 8: Train loss (last batch): 105.26661682128906, validation metrics: 0.0,0.7254004576659039
Epoch: 9: Train loss (last batch): 108.87906646728516, validation metrics: 0.0,0.7465675057208238
Epoch: 10: Train loss (last batch): 124.81

In [15]:
# Another try: add s2 to the language-modeling part, with a mask of 0 on stance.
# NOTE(review): s2 is presumably the second sentence of each pair - confirm.
# The modules are reloaded first so the trainer is built from their current
# source.
importlib.reload(training)
importlib.reload(data)
importlib.reload(evaluation)

import copy



config = copy.deepcopy(multi_configs)

trainer_smart = training.BasicTrainer(config)

res_smart = trainer_smart.train_model()

Will create train, dev, test(s) splits
Total params: 59104
Starting training:
Epoch: 1: Train loss (last batch): 113.81739807128906, validation metrics: 0.0,0.4582857142857143
Epoch: 2: Train loss (last batch): 137.22171020507812, validation metrics: 0.0,0.49057142857142855
Epoch: 3: Train loss (last batch): 102.43212890625, validation metrics: 0.0,0.5391428571428571
Epoch: 4: Train loss (last batch): 132.57327270507812, validation metrics: 0.0,0.5731428571428572
Epoch: 5: Train loss (last batch): 96.04551696777344, validation metrics: 0.0,0.6034285714285714
Epoch: 6: Train loss (last batch): 93.68408203125, validation metrics: 0.0,0.5945714285714285
Epoch: 7: Train loss (last batch): 83.46720886230469, validation metrics: 0.0,0.5962857142857143
Epoch: 8: Train loss (last batch): 113.39315032958984, validation metrics: 0.0,0.5988571428571429
Epoch: 9: Train loss (last batch): 80.28907775878906, validation metrics: 0.0,0.5805714285714285
Epoch: 10: Train loss (last batch): 74.1262054443

In [16]:
# Another try: add s2 to the language-modeling part, with a mask of 0 on stance.
# NOTE(review): this description is identical to the previous run's, yet the
# logged validation accuracy is much higher here (about 0.88 vs 0.58 at epoch
# 9), so something else must have changed - record what.
importlib.reload(training)
importlib.reload(data)
importlib.reload(evaluation)

import copy



config = copy.deepcopy(multi_configs)

trainer_smart = training.BasicTrainer(config)

res_smart = trainer_smart.train_model()

Will create train, dev, test(s) splits
Total params: 59104
Starting training:
Epoch: 1: Train loss (last batch): 209.2574462890625, validation metrics: 0.0,0.5828571428571429
Epoch: 2: Train loss (last batch): 144.80616760253906, validation metrics: 0.0,0.6908571428571428
Epoch: 3: Train loss (last batch): 133.73956298828125, validation metrics: 0.0,0.7285714285714285
Epoch: 4: Train loss (last batch): 143.59291076660156, validation metrics: 0.0,0.7768571428571428
Epoch: 5: Train loss (last batch): 112.75733947753906, validation metrics: 0.0,0.8194285714285714
Epoch: 6: Train loss (last batch): 103.4307632446289, validation metrics: 0.0,0.8394285714285714
Epoch: 7: Train loss (last batch): 101.30254364013672, validation metrics: 0.0,0.8665714285714285
Epoch: 8: Train loss (last batch): 100.20878601074219, validation metrics: 0.0,0.8842857142857142
Epoch: 9: Train loss (last batch): 96.01273345947266, validation metrics: 0.0,0.8848571428571429
Epoch: 10: Train loss (last batch): 90.1298

In [17]:
# Set the stance weight to zero.
# NOTE(review): multi_configs is not modified in this cell, so the weight was
# presumably changed in the library source - confirm.
importlib.reload(training)
importlib.reload(data)
importlib.reload(evaluation)

import copy



config = copy.deepcopy(multi_configs)

trainer_smart = training.BasicTrainer(config)

res_smart = trainer_smart.train_model()

Will create train, dev, test(s) splits
Total params: 59104
Starting training:
Epoch: 1: Train loss (last batch): 122.08667755126953, validation metrics: 0.0,0.3438215102974828
Epoch: 2: Train loss (last batch): 94.79762268066406, validation metrics: 0.0,0.33695652173913043
Epoch: 3: Train loss (last batch): 95.37557220458984, validation metrics: 0.0,0.33466819221967964
Epoch: 4: Train loss (last batch): 75.7430648803711, validation metrics: 0.0,0.33352402745995424
Epoch: 5: Train loss (last batch): 68.06087493896484, validation metrics: 0.0,0.3306636155606407
Epoch: 6: Train loss (last batch): 69.52253723144531, validation metrics: 0.0,0.31864988558352403
Epoch: 7: Train loss (last batch): 77.9913558959961, validation metrics: 0.0,0.3295194508009153
Epoch: 8: Train loss (last batch): 72.39769744873047, validation metrics: 0.0,0.3232265446224256
Epoch: 9: Train loss (last batch): 67.80549621582031, validation metrics: 0.0,0.3169336384439359
Epoch: 10: Train loss (last batch): 77.7780609

In [254]:
# Set the stance weight to 150 and include s2 in the LM data.
# NOTE(review): neither change is visible in this cell; presumably both were
# made in the library source - confirm.
importlib.reload(training)
importlib.reload(data)
importlib.reload(evaluation)
importlib.reload(modeling)

import copy



config = copy.deepcopy(multi_configs)

# Override the 25 epochs from multi_configs on the private copy only.
config['epochs'] = 60

trainer_smart = training.BasicTrainer(config)

res_smart = trainer_smart.train_model()

Will create train, dev, test(s) splits
Total params: 58807
Starting training:
Epoch: 1: Train loss (last batch): 167.3178253173828, validation metrics: 0.0,0.44142857142857145
Epoch: 2: Train loss (last batch): 156.54412841796875, validation metrics: 0.0,0.48028571428571426
Epoch: 3: Train loss (last batch): 101.2757568359375, validation metrics: 0.0,0.5262857142857142
Epoch: 4: Train loss (last batch): 115.44377136230469, validation metrics: 0.0,0.5597142857142857
Epoch: 5: Train loss (last batch): 104.06786346435547, validation metrics: 0.0,0.5888571428571429
Epoch: 6: Train loss (last batch): 146.06484985351562, validation metrics: 0.0,0.6022857142857143
Epoch: 7: Train loss (last batch): 67.07022094726562, validation metrics: 0.0,0.6191428571428571
Epoch: 8: Train loss (last batch): 111.83159637451172, validation metrics: 0.0,0.6354285714285715
Epoch: 9: Train loss (last batch): 85.94422912597656, validation metrics: 0.0,0.6362857142857142
Epoch: 10: Train loss (last batch): 102.94

KeyboardInterrupt: 

In [20]:
# Library code updated to split the data before building pairs.
# Recorded result: 0.0,0.6291428571428571
# NOTE(review): presumably the validation metrics of an earlier run - confirm.
importlib.reload(training)
importlib.reload(data)
importlib.reload(evaluation)
importlib.reload(modeling)

import copy



config = copy.deepcopy(multi_configs)

# Override the 25 epochs from multi_configs on the private copy only.
config['epochs'] = 20

trainer_smart = training.BasicTrainer(config)

res_smart = trainer_smart.train_model()

All loaded data size:8754
Will create train, dev, test(s) splits


AttributeError: 'encoder' object has no attribute 'freeze'

In [15]:
#unlabeled_data = trainer_smart.processor.upload_data()
# Sanity check: the same sentence is used for both members of the pair.
s1 = 'فتح تحقيق في قضية موت مريم مصطفى في مدينة باري'
s2 = s1
# Note: `* 4` repeats a reference to one dict; the four entries are the same
# object, not copies.
data2 = [{'inp': [s1, s2], 'label':[s1, [1]], 'mask':[1, 1]}] * 4
pred = trainer_smart.predict(data2, decode=False)
#data
# Decode the second 'pred' entry of every prediction with the label encoder,
# then show the matching 'meta' entry.
print([trainer_smart.processor.encoder['label'][1]._funcs[0].decode(p['pred'][1]) for p in pred])
print([p['meta'][1] for p in pred])

[['disagree'], ['disagree'], ['disagree'], ['disagree']]
[1.0, 1.0, 1.0, 1.0]


In [18]:
# Predict on the first 1000 items of data_raw[2] and tabulate, per example,
# both input texts, the gold label, the predicted label and the 'meta' value.
d = trainer_smart.processor.decode(trainer_smart.processor.data_raw[2][0:1000], list_input=True)
pred = trainer_smart.predict(d, decode=False)


def _as_result_row(item):
    """Return (input 1, input 2, gold label, predicted label, meta) for one prediction."""
    processor = trainer_smart.processor
    first_text = processor.decoder['inp'][0](item['inp'][0])
    second_text = processor.decoder['inp'][1](item['inp'][1])
    gold_label = processor.encoder['label'][1]._funcs[0].decode(item['label'][1])[0]
    predicted_label = processor.encoder['label'][1]._funcs[0].decode(item['pred'][1])[0]
    return (first_text, second_text, gold_label, predicted_label, item['meta'][1])


res = pd.DataFrame([_as_result_row(pred[idx]) for idx in range(len(d))])

# Columns keep their default integer names: 2 is the gold label, 3 the
# predicted label.
res['correct'] = res[2] == res[3]
print(res['correct'].sum()  / res.shape[0])

res



0.614


Unnamed: 0,0,1,2,3,4,correct
0,Sمصر تدعم الناقص في سد النهض,Sألمانيا تساهم في تعويض النقص بسد النهضة الأثيوب,disagree,agree,0.496307,False
1,Sرئيس الوزراء الأردنى يرفض الاستقالة و يواصل ع...,Sاستقالة رئيس الوزراء الأردنى بعد الاحتجاجات ا...,disagree,agree,0.366510,False
2,Sمن الذين يشملهم قرار مصادرة أملاك رموز حكم صدام,Sالجزائر: الصراع متواصل بين رئيس البرلمان ونوا...,other,other,-0.113336,True
3,Sمصر تتراجع في التصنيف الائتمان,Sتحقيق أفضل تصنيف ائتماني لمصر في ٧ أعوا,disagree,agree,0.409297,False
4,Sارتفاع على النفط عالميا والسبب الخام الأميرك,Sالخام الاميركي يسبب انخفاضا في اسعار النف,disagree,disagree,0.609051,True
...,...,...,...,...,...,...
995,Sتوقعات خفض أسعار الفائدة يهبط بالدولا,Sرفع أسعار الفائدة يرفع الدولار لأعلى مستو,disagree,agree,0.460125,False
996,Sأردوغان يهاجم قانون القومية اليهودية ويصف إسر...,Sدعوة للعمل 4 أيام بالأسبوع فقط.. مع راتب أعل,other,agree,0.239946,False
997,Sاستهداف مطار في طرابلس بهجوم صاورخى و اجبار ل...,Sهجوم صاروخي يستهدف مطار في طرابلس ويجبر ليبيا...,agree,agree,0.290395,True
998,Sالنفط يهوي لضعف الطل,Sطلب قوي يرفع أسعار النف,disagree,agree,0.421226,False


In [19]:
# Per-gold-label mean of the numeric columns (column 4 and 'correct').
# numeric_only=True is passed explicitly: pandas 2.x no longer drops the text
# columns (0, 1, 3) silently and raises a TypeError without it.
res.groupby(2).mean(numeric_only=True)

Unnamed: 0_level_0,4,correct
2,Unnamed: 1_level_1,Unnamed: 2_level_1
agree,0.443043,0.650568
disagree,0.538146,0.398204
other,0.078912,0.802548


In [242]:
# Print the first 'meta' entry of every prediction, one per line.
for i in pred:
    print(i['meta'][0])

[0.30543801188468933, 0.4390051066875458, 0.2555569112300873]
[0.4023689925670624, 0.19882814586162567, 0.39880287647247314]
[0.4211263954639435, 0.12805327773094177, 0.45082035660743713]
[0.40903475880622864, 0.024560557678341866, 0.5664046406745911]
[0.36775481700897217, 0.29465559124946594, 0.33758965134620667]
[0.41963261365890503, 0.13530725240707397, 0.445060133934021]
[0.34926971793174744, 0.3398396372795105, 0.31089067459106445]
[0.42179811000823975, 0.12456441670656204, 0.4536374807357788]
[0.41483062505722046, 0.15566132962703705, 0.4295080304145813]
[0.4199265241622925, 0.13392706215381622, 0.4461464583873749]
[0.3944302797317505, 0.22286103665828705, 0.38270875811576843]
[0.4183809459209442, 0.03715715929865837, 0.5444619655609131]
[0.38935214281082153, 0.23740479350090027, 0.3732430934906006]
[0.4252322316169739, 0.1023598313331604, 0.4724079370498657]
[0.23391839861869812, 0.586922824382782, 0.1791587918996811]
[0.3276577293872833, 0.3898504078388214, 0.28249186277389526]

In [203]:
# Display the first item of data_raw[0] to inspect how an example is stored.
trainer_smart.processor.data_raw[0][0]

{'inp': [[2,
   34,
   3,
   7,
   27,
   8,
   21,
   19,
   8,
   5,
   10,
   21,
   23,
   5,
   14,
   8,
   21,
   19,
   8,
   16,
   22,
   31,
   38,
   8,
   16,
   19,
   24,
   5,
   10,
   19,
   8,
   3,
   5,
   31,
   9,
   8,
   7,
   7,
   34,
   13,
   5,
   3,
   8,
   5,
   7,
   19,
   16,
   10,
   19,
   18,
   8,
   21,
   19,
   8,
   9,
   24],
  [2,
   23,
   5,
   4,
   3,
   10,
   8,
   31,
   19,
   31,
   14,
   8,
   19,
   38,
   22,
   31,
   8,
   9,
   5,
   14,
   20,
   18,
   8,
   5,
   7,
   37,
   35,
   19,
   5,
   43,
   8,
   12,
   5,
   19,
   7,
   19,
   8,
   31,
   5,
   7,
   24,
   13,
   19]],
 'label': [[34,
   3,
   7,
   27,
   8,
   21,
   19,
   8,
   5,
   10,
   21,
   23,
   5,
   14,
   8,
   21,
   19,
   8,
   16,
   22,
   31,
   38,
   8,
   16,
   19,
   24,
   5,
   10,
   19,
   8,
   3,
   5,
   31,
   9,
   8,
   7,
   7,
   34,
   13,
   5,
   3,
   8,
   5,
   7,
   19,
   16,
   10,
   19,
   18,
   8,
   21,

In [230]:
# Display the model's num_classes attribute.
trainer_smart.model.num_classes

3

[2]

In [205]:

# Show the second 'pred' entry and the second 'meta' entry of the first prediction.
print(pred[0]['pred'][1])
print(pred[0]['meta'][1])

0.7547897100448608