# **_Train Concept Classifier_**

In [1]:
from pathlib import Path
import sys 
import yaml
src_dir = Path.cwd().parent
sys.path.append(str(src_dir))
import pandas as pd
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from src.sc_classifier.config.core import config, PACKAGE_ROOT
from src.sc_classifier.trainer import Trainer
from src.sc_classifier.models import constructor
from src.utils import dict2dot

# Read params file
with open(PACKAGE_ROOT / 'params.yaml') as o:
    params = dict2dot(yaml.safe_load(o))

# Point the shared config at the concept-classification task.
config.app_config.package_name = params.concept_train.package_name
config.model_config.target = "concept_class"
config.model_config.classes = params['concept_train']['classes']

# Swap in the concept training arguments FIRST, then apply the notebook's
# overrides. Setting an attribute on `config.train_args` before this
# reassignment would modify the object that is about to be replaced, so the
# override would be silently lost.
config.train_args = params.concept_train
config.train_args.load_pretrained = True
config.train_args.metric_dir = "concept_metrics"
config.train_args.max_stratify = 500
config.train_args.prune_stratify = True

# Build the trainer from the mutated config; `load_data=False` is passed so
# the trainer is constructed without its data-loading step.
trainer = Trainer(
    loss_function=CrossEntropyLoss(),
    optimizer=AdamW,
    load_data=False,
    model_name="concept_model",
    config=config,
)

root==> /notebooks/inferess-relation-extraction


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Downloading (…)lve/main/config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

2023-10-02 08:27:51,311 — SCClassifier — INFO — loading checkpoint from `concept_model`


Downloading (…)okenizer_config.json:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/227k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

2023-10-02 08:27:56,216 — SCClassifier — INFO — inference mode...


In [37]:
def _predict_top2(trainer, text):
    scores, labels = trainer.predict(text)
    # calc concepts scores
    classes_scores  =  list(map(lambda x: {trainer.model.config.id2label[k]:v for k,v in enumerate(x)} , scores))
    soreted_scores = list(map(lambda x: sorted(x.items(), key=lambda x: x[1], reverse=True) , classes_scores))
    # top_2 cocepts
    top_2 = list(map(lambda x: (x[0][0], x[1][0]) , soreted_scores ))
    # 1st prediction
    predictions = list(map(lambda x: trainer.model.config.id2label[x], labels))
    return pd.DataFrame({"text":text,
                  "predictions":predictions,
                  "top_2": top_2,
                  "score":list(scores.max(1)),
                  "scores_dist": soreted_scores})

# **_Read text sequences_**

In [2]:
# Load the validation set and keep the `orig_sents` column, with rows that
# repeat an `orig_sents` value removed.
text = (
    pd.read_json(src_dir / "data/train/valid.json")
    .drop_duplicates(subset=['orig_sents'])['orig_sents']
)

In [33]:
# Load the negative- and positive-relation sentence reports. Rows containing
# any missing value are dropped and the index is renumbered from 0.
neg = (
    pd.read_excel(src_dir / "data/tasks/test_neg_relations_report_sentences.xlsx")
    .dropna()
    .reset_index(drop=True)
)
pos = (
    pd.read_excel(src_dir / "data/tasks/test_pos_relations_report_sentences.xlsx")
    .dropna()
    .reset_index(drop=True)
)

In [38]:
# Classify every positive-relation sentence and copy the prediction columns
# onto the report frame (assignment aligns on the row index).
pos_preds = _predict_top2(trainer, pos['sentence'].tolist())
for report_col, pred_col in {
    'concept_class': 'predictions',
    'top2concepts': 'top_2',
    'concept_score': 'score',
}.items():
    pos[report_col] = pos_preds[pred_col]

# Rows whose `old_new_match` is False, and how often each predicted concept
# occurs among them.
pos_errors = pos.loc[pos['old_new_match'].eq(False)]
pos_errors_concepts = pos_errors['concept_class'].value_counts()

100%|[32m██████████[0m| 592/592 [06:15<00:00,  1.58batch/s]


In [83]:
# Classify every negative-relation sentence and copy the prediction columns
# onto the report frame (assignment aligns on the row index).
neg_preds = _predict_top2(trainer, neg['sentence'].tolist())
for report_col, pred_col in {
    'concept_class': 'predictions',
    'top2concepts': 'top_2',
    'concept_score': 'score',
}.items():
    neg[report_col] = neg_preds[pred_col]

# Rows whose `old_new_match` is False, and how often each predicted concept
# occurs among them.
neg_errors = neg.loc[neg['old_new_match'].eq(False)]
neg_errors_concepts = neg_errors['concept_class'].value_counts()


In [104]:
# Persist the negative-relation report, now carrying the concept columns.
neg.to_excel(
    excel_writer=src_dir / "data/tasks/test_neg_relations_report_sentences_w_concepts.xlsx"
)

In [105]:
# Persist the positive-relation report, now carrying the concept columns.
pos.to_excel(
    excel_writer=src_dir / "data/tasks/test_pos_relations_report_sentences_w_concepts.xlsx"
)

In [103]:
from pprint import pprint

# For each report: map every predicted concept to its count among the error
# rows and to that count's share of all error rows (rounded to 2 decimals),
# then print the mapping ordered by count, largest first.
print("\nPOS: Distribution of Concepts over errors\n")
pos_dist = {
    concept: {"count": count, "frac": round(share, 2)}
    for concept, count, share in zip(
        pos_errors_concepts.index,
        pos_errors_concepts,
        pos_errors_concepts / len(pos_errors),
    )
}
pos_dist = dict(
    sorted(pos_dist.items(), key=lambda entry: entry[1]['count'], reverse=True)
)
pprint(pos_dist, sort_dicts=False)

print("\nNEG: Distribution of Concepts over errors\n")
neg_dist = {
    concept: {"count": count, "frac": round(share, 2)}
    for concept, count, share in zip(
        neg_errors_concepts.index,
        neg_errors_concepts,
        neg_errors_concepts / len(neg_errors),
    )
}
neg_dist = dict(
    sorted(neg_dist.items(), key=lambda entry: entry[1]['count'], reverse=True)
)
pprint(neg_dist, sort_dicts=False)


POS: Distribution of Concepts over errors

{'supply_chain': {'count': 1058, 'frac': 0.23},
 'licensing_and_ip': {'count': 871, 'frac': 0.19},
 'agreement_and_partnership': {'count': 706, 'frac': 0.16},
 'unknown': {'count': 447, 'frac': 0.1},
 'revenue': {'count': 420, 'frac': 0.09},
 'product_related': {'count': 297, 'frac': 0.07},
 'investment_related': {'count': 242, 'frac': 0.05},
 'services agreement': {'count': 173, 'frac': 0.04},
 'royalties': {'count': 97, 'frac': 0.02},
 'financial_statements': {'count': 96, 'frac': 0.02},
 'real_estate': {'count': 64, 'frac': 0.01},
 'legal_and_regulatory': {'count': 54, 'frac': 0.01}}

NEG: Distribution of Concepts over errors

{'agreement_and_partnership': {'count': 1486, 'frac': 0.24},
 'unknown': {'count': 818, 'frac': 0.13},
 'revenue': {'count': 812, 'frac': 0.13},
 'supply_chain': {'count': 740, 'frac': 0.12},
 'licensing_and_ip': {'count': 706, 'frac': 0.11},
 'investment_related': {'count': 343, 'frac': 0.05},
 'services agreement':