In [171]:
import numpy as np
import snorkel
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import MajorityLabelVoter
import json
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [172]:
%load_ext autoreload
%autoreload 2
import label_improve as li

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [173]:
# Keyword lists per sentiment class; the keys are label names that get mapped
# to label indices later in the notebook. A "+" joins several fragments into a
# single entry: check_keywords treats such an entry as matched only when every
# fragment occurs somewhere in the text.
# NOTE(review): li.keywords_to_LFs presumably applies the same "+" rule — confirm.
keywords = dict(
    neutral=[
        "reported", "announced", "expected", "forecast",
        "published", "released", "unchanged", "anticipated",
    ],
    positive=[
        "net+sales+increased+by", "positive+impact", "grew+by", "increased+by",
        "favourable", "up+from", "gain", "expansion",
        "positive+growth", "increased", "profit+rose",
    ],
    negative=[
        "loss", "decreased", "declined", "fell",
        "down+from", "dropped", "negative", "losses",
        "decreased+by", "fell+by", "be+lower+than",
    ],
)

In [179]:
def check_keywords(text, keywords):
    """Return True if ``text`` matches at least one entry of ``keywords``.

    Matching is case-insensitive substring matching (so "gain" also matches
    inside "against"). An entry may join several fragments with "+"; such an
    entry matches only when every fragment occurs somewhere in the text, in
    any order and not necessarily adjacent.

    Args:
        text: The string to search.
        keywords: Iterable of keyword entries (plain words or "+"-joined
            fragments).

    Returns:
        True as soon as one entry matches, False if none do (including when
        ``keywords`` is empty).
    """
    # Lower-case the text once instead of once per keyword/fragment.
    haystack = text.lower()
    for keyword in keywords:
        # Lower-case the keyword as well: the text is already lower-cased, so
        # an entry containing an uppercase letter could otherwise never match.
        # A plain keyword splits into a single fragment, so one code path
        # covers both the "+" and the non-"+" case.
        fragments = keyword.lower().split("+")
        if all(fragment in haystack for fragment in fragments):
            return True
    return False
@labeling_function()
def extend_neutral(x):
    """Vote 0 for rows whose text hits none of the keyword lists.

    Returns -1 (abstain) when ``x.text`` matches any entry of the neutral,
    positive or negative keyword lists, and 0 otherwise.
    NOTE(review): 0 is assumed to be the neutral class index — confirm
    against label.json.
    """
    # Keys are looked up lazily and in the original order, so evaluation
    # stops at the first list that matches.
    hit = any(
        check_keywords(x.text, keywords[group])
        for group in ("neutral", "positive", "negative")
    )
    return -1 if hit else 0

In [180]:
# Loading the data
dataset_name = "finbank"


def _read_dataset_json(file_name):
    """Parse ``../weak_datasets/<dataset_name>/<file_name>`` as JSON.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left for the garbage collector.
    """
    with open(f"../weak_datasets/{dataset_name}/{file_name}", "r") as handle:
        return json.load(handle)


# label.json maps label index -> label name; build the reverse lookup too.
idx_to_label = _read_dataset_json("label.json")
label_to_idx = {label: idx for idx, label in idx_to_label.items()}

# Same load order as before: valid, train, test.
valid_df = li.data_to_df(_read_dataset_json("valid.json"))
train_df = li.data_to_df(_read_dataset_json("train.json"))
test_df = li.data_to_df(_read_dataset_json("test.json"))

In [182]:
# Re-key the keyword lists by integer label index instead of label name.
keywords_by_idx = {
    int(label_to_idx[label_name]): words
    for label_name, words in keywords.items()
}

# Keyword-derived LFs first, then the hand-written extend_neutral LF last.
keyword_lfs = li.keywords_to_LFs(keywords_by_idx)
lfs = keyword_lfs + [extend_neutral]

# Attach the new LFs to every split (rebinds the same DataFrame names).
train_df = li.df_with_new_lfs(train_df, lfs)
valid_df = li.df_with_new_lfs(valid_df, lfs)
test_df = li.df_with_new_lfs(test_df, lfs)

  0%|          | 0/5434 [00:00<?, ?it/s]

100%|██████████| 5434/5434 [00:00<00:00, 5576.19it/s]
100%|██████████| 2250/2250 [00:00<00:00, 5601.95it/s]
100%|██████████| 2250/2250 [00:00<00:00, 5593.59it/s]


In [183]:
# Report weak-label diagnostics for each split, in the order train, valid, test.
# Per the recorded output this prints a coverage figure and two accuracy figures
# (non-abstaining rows / all rows) per call.
# NOTE(review): the second argument 3 is presumably the number of classes — confirm
# against label_improve.analysis_LFs_with_weak_labels.
li.analysis_LFs_with_weak_labels(train_df,3)
li.analysis_LFs_with_weak_labels(valid_df,3)
li.analysis_LFs_with_weak_labels(test_df,3)

Test Coverage: 1.0
acuracy for the not abstains
0.7352170658682635
acuracy for all
0.7230401177769599
Test Coverage: 1.0
acuracy for the not abstains
0.7373417721518988
acuracy for all
0.7248888888888889
Test Coverage: 1.0
acuracy for the not abstains
0.741340530814215
acuracy for all
0.7324444444444445


In [184]:
# Convert the augmented DataFrames back to the dataset format, then write them out.
# NOTE(review): these are the same train/valid/test.json paths that the loading
# cell reads, so the inputs are overwritten in place; re-running the notebook
# presumably applies the LFs on top of already-augmented data — confirm that
# li.df_with_new_lfs is safe to apply twice, or write to a separate path.
new_train, new_valid, new_test = (
    li.df_to_data(split_df) for split_df in (train_df, valid_df, test_df)
)

split_outputs = (
    (new_train, f'../weak_datasets/{dataset_name}/train.json'),
    (new_valid, f'../weak_datasets/{dataset_name}/valid.json'),
    (new_test, f'../weak_datasets/{dataset_name}/test.json'),
)
for split_data, out_path in split_outputs:
    li.save_dataset(split_data, out_path)

In [170]:
# Apply the labeling functions to the training split; L_train holds one
# column per LF.
applier = PandasLFApplier(lfs)
L_train = applier.apply(train_df)

# Per-LF coverage on the training split: the share of rows where the LF did
# not abstain (-1). The label previously read "Test Coverage" although the
# matrix is computed from train_df.
print("Train coverage per LF:", (L_train != -1).mean(axis=0))

# Per-LF summary scored against the labels in train_df.label.
lf_analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary(Y=train_df.label.values)

# Share of each LF's covered rows on which it conflicts with another LF.
# An LF with zero coverage yields NaN here (0 / 0).
lf_analysis['Conflict Ratio'] = lf_analysis['Conflicts'] / lf_analysis['Coverage']
lf_analysis


100%|██████████| 5434/5434 [00:01<00:00, 3947.43it/s]


Test Coverage: [1.21457490e-02 1.30658815e-02 1.93227825e-02 5.88884799e-03
 4.96871550e-03 2.57637100e-03 5.52079499e-04 5.52079499e-04
 2.76039750e-03 7.36105999e-04 3.31247700e-03 1.10415900e-02
 1.84026500e-03 4.03018035e-02 5.52079499e-03 3.12845050e-03
 3.68053000e-04 2.48435775e-02 1.38019875e-02 4.08538830e-02
 1.85866765e-02 4.23260950e-03 1.28818550e-02 1.91387560e-02
 2.94442400e-03 4.23260950e-03 1.28818550e-03 6.80898049e-03
 3.49650350e-03 2.02429150e-03 8.33640044e-01]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.,Conflict Ratio
lf_reported,0,[0],0.012146,0.004049,0.004049,31,35,0.469697,0.333333
lf_announced,1,[0],0.013066,0.001472,0.001104,45,26,0.633803,0.084507
lf_expected,2,[0],0.019323,0.002024,0.001472,59,46,0.561905,0.07619
lf_forecast,3,[0],0.005889,0.00092,0.000736,14,18,0.4375,0.125
lf_published,4,[0],0.004969,0.000368,0.0,18,9,0.666667,0.0
lf_released,5,[0],0.002576,0.001288,0.00092,9,5,0.642857,0.357143
lf_unchanged,6,[0],0.000552,0.0,0.0,3,0,1.0,0.0
lf_anticipated,7,[0],0.000552,0.0,0.0,3,0,1.0,0.0
lf_net+sales+increased+by,8,[1],0.00276,0.00276,0.000552,15,0,1.0,0.2
lf_positive+impact,9,[1],0.000736,0.000736,0.000736,4,0,1.0,1.0
