In [1]:
import numpy as np
import snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
import json
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
%load_ext autoreload
%autoreload 2
import label_improve as li

In [3]:
# Seed keywords for each sentiment class. The dict keys are later looked up in
# label_to_idx, so they must match the class names stored in label.json.
# check_keywords() splits an entry on "+" and requires every part to be present.
keywords = {
    "neutral": [
        "reported", "announced", "expected", "forecast",
        "published", "released", "unchanged", "anticipated",
    ],
    "positive": [
        "net+sales+increased+by", "positive+impact", "grew+by", "increased+by",
        "favourable", "up+from", "gain", "expansion",
        "positive+growth", "increased", "profit+rose",
    ],
    "negative": [
        "loss", "decreased", "declined", "fell",
        "down+from", "dropped", "negative", "losses",
        "decreased+by", "fell+by", "be+lower+than",
    ],
}

In [4]:
def check_keywords(text, keywords):
    """Return True if ``text`` matches at least one entry of ``keywords``.

    The text is lower-cased before comparison; keyword entries are compared
    exactly as given, so they should already be lowercase.

    An entry containing "+" is split on "+" and matches only when *every*
    part occurs somewhere in the text. The parts may appear in any order and
    need not be adjacent.

    Matching is plain substring containment, not whole-word matching: the
    entry "gain" also matches a text containing "against".

    Args:
        text: String to search.
        keywords: Iterable of keyword entries (strings).

    Returns:
        bool: True on the first matching entry, False if none match.
    """
    # Lower-case once; the original recomputed this for every keyword and part.
    lowered = text.lower()
    # split("+") on an entry without "+" yields a single part, so one
    # expression covers both plain and compound entries.
    return any(
        all(part in lowered for part in keyword.split("+"))
        for keyword in keywords
    )
@labeling_function()
def extend_neutral(x):
    """Fallback labeling function: vote 0 when no keyword of any class matches.

    Returns -1 as soon as ``x.text`` matches an entry from the "neutral",
    "positive" or "negative" keyword lists, and 0 otherwise.

    NOTE(review): -1 is presumably Snorkel's abstain value and 0 the index of
    the "neutral" class -- confirm against label.json.
    """
    for group in ("neutral", "positive", "negative"):
        if check_keywords(x.text, keywords[group]):
            return -1
    return 0

In [5]:
# Load the label map and the three dataset splits.
dataset_name = "finbank"


def _read_json(path):
    """Parse the JSON file at ``path`` and close the file handle afterwards."""
    with open(path, "r") as handle:
        return json.load(handle)


idx_to_label = _read_json(f"../weak_datasets/{dataset_name}/label.json")
# Inverse mapping: class name -> index, with the index kept as stored in label.json.
label_to_idx = {label: idx for idx, label in idx_to_label.items()}
valid_df = li.data_to_df(_read_json(f"../weak_datasets/{dataset_name}/valid.json"))
train_df = li.data_to_df(_read_json(f"../weak_datasets/{dataset_name}/train.json"))
test_df = li.data_to_df(_read_json(f"../weak_datasets/{dataset_name}/test.json"))

In [6]:
# Re-key the keyword lists by integer class index instead of class name.
keywords_by_idx = {
    int(label_to_idx[class_name]): entries
    for class_name, entries in keywords.items()
}
# Keyword-derived LFs, plus the hand-written extend_neutral fallback.
lfs = li.keywords_to_LFs(keywords_by_idx) + [extend_neutral]
# NOTE(review): each frame is replaced by its LF-augmented version, so
# re-running this cell feeds already-augmented frames back in -- confirm that
# li.df_with_new_lfs tolerates that.
train_df = li.df_with_new_lfs(train_df, lfs)
valid_df = li.df_with_new_lfs(valid_df, lfs)
test_df = li.df_with_new_lfs(test_df, lfs)

100%|██████████| 5434/5434 [00:00<00:00, 5644.36it/s]
100%|██████████| 2250/2250 [00:00<00:00, 5711.90it/s]
100%|██████████| 2250/2250 [00:00<00:00, 5307.07it/s]


In [8]:
# Report coverage / accuracy of the labeling functions on the test split.
# NOTE(review): the literal 3 is presumably the number of classes -- confirm
# against li.analysis_LFs.
li.analysis_LFs(lfs, test_df, 3)


 25%|██▌       | 564/2250 [00:00<00:00, 5631.87it/s]

100%|██████████| 2250/2250 [00:00<00:00, 5671.19it/s]


Test Coverage: 1.0
acuracy for the not abstains
0.741340530814215
acuracy for all
0.7324444444444445


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.,Conflict Ratio
lf_reported,0,[0],0.009778,0.002667,0.002222,15,7,0.681818,0.227273
lf_announced,1,[0],0.010667,0.001333,0.000889,15,9,0.625,0.083333
lf_expected,2,[0],0.016,0.000889,0.000889,21,15,0.583333,0.055556
lf_forecast,3,[0],0.003111,0.000889,0.000889,2,5,0.285714,0.285714
lf_published,4,[0],0.008444,0.0,0.0,17,2,0.894737,0.0
lf_released,5,[0],0.004,0.000889,0.0,9,0,1.0,0.0
lf_unchanged,6,[0],0.001778,0.0,0.0,4,0,1.0,0.0
lf_anticipated,7,[0],0.001333,0.0,0.0,2,1,0.666667,0.0
lf_net+sales+increased+by,8,[1],0.004,0.004,0.0,9,0,1.0,0.0
lf_positive+impact,9,[1],0.000889,0.0,0.0,2,0,1.0,0.0


In [184]:
# Convert the LF-augmented frames back to the dataset format and save them.
# NOTE(review): the target paths are the same files the splits were loaded
# from, so this presumably overwrites the inputs -- re-running the notebook
# afterwards would start from already-augmented data.
new_train, new_valid, new_test = (
    li.df_to_data(frame) for frame in (train_df, valid_df, test_df)
)
li.save_dataset(new_train, f'../weak_datasets/{dataset_name}/train.json')
li.save_dataset(new_valid, f'../weak_datasets/{dataset_name}/valid.json')
li.save_dataset(new_test, f'../weak_datasets/{dataset_name}/test.json')