In [42]:
import pandas as pd

In [43]:
df = pd.read_csv("../data/docee/all/beam.aug_title.tok_notitle.csv")
df.head()

Unnamed: 0,id,text,title,event_type,date,source_doc_id,tokens
0,0,Vietnam's Communist Party Wednesday re-elected...,Vietnam reelects conservative Nguyễn Phú Trọng...,Government Job change - Election,January 2016,,"['Vietnam', ""'s"", 'Communist', 'Party', 'Wedne..."
1,1,Another 43 people were injured when the bus ca...,At least 42 people are killed in a bus crash i...,Road Crash,October 2006,,"['Another', '43', 'people', 'were', 'injured',..."
2,2,At least 27 migrants have died off the Turkish...,At least 27 migrants die in a shipwreck in the...,Shipwreck,February 2016,,"['At', 'least', '27', 'migrants', 'have', 'die..."
3,3,"Colten Treu, 21, and his roommate both told au...",Colten Treu faces charges of vehicular homicid...,Road Crash,November 2018,,"['Colten', 'Treu', ',', '21', ',', 'and', 'his..."
4,4,Bolivian President Evo Morales has resigned af...,"Hours after the announcement, Morales resigns ...",Government Job change - Resignation_Dismissal,November 2019,,"['Bolivian', 'President', 'Evo', 'Morales', 'h..."


In [44]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from src.utils import identity
from sklearn.feature_extraction.text import TfidfVectorizer

pipeline = make_pipeline(
    TfidfVectorizer(
        tokenizer=identity,
        preprocessor=identity,
        min_df=3,
        max_df=0.95,
        ngram_range=(1,3),
        lowercase=True
    ),
    LinearSVC(verbose=True),
    verbose=True
)

In [45]:
# first we need to split the data according to aug/nonaug
df_noaug = df.loc[df.source_doc_id.isna(), :]
df_aug = df.loc[~df.source_doc_id.isna(), :]
assert len(df_noaug) + len(df_aug) == len(df)

In [46]:
df_aug.head()

Unnamed: 0,id,text,title,event_type,date,source_doc_id,tokens
21949,21949,The bus skidded off the road at Luham in Saly...,At least 42 people are killed in a bus crash i...,Road Crash,October 2006,1.0,"[' ', 'The', 'bus', 'skidded', 'off', 'the', '..."
21950,21950,The bus came off the road at Luham in Salyan ...,At least 42 people are killed in a bus crash i...,Road Crash,October 2006,1.0,"[' ', 'The', 'bus', 'came', 'off', 'the', 'roa..."
21951,21951,The bus skidded off the road at Luham in Saly...,At least 42 people are killed in a bus crash i...,Road Crash,October 2006,1.0,"[' ', 'The', 'bus', 'skidded', 'off', 'the', '..."
21952,21952,At least 27 migrants have died off the Turkis...,At least 27 migrants die in a shipwreck in the...,Shipwreck,February 2016,2.0,"[' ', 'At', 'least', '27', 'migrants', 'have',..."
21953,21953,At least 27 migrants have died trying to reac...,At least 27 migrants die in a shipwreck in the...,Shipwreck,February 2016,2.0,"[' ', 'At', 'least', '27', 'migrants', 'have',..."


In [47]:
# for simplicity, let's extract only one augmented example per source_doc_id
df_oneaug = df_aug.groupby("source_doc_id").sample(1)
df_oneaug.head()

Unnamed: 0,id,text,title,event_type,date,source_doc_id,tokens
21951,21951,The bus skidded off the road at Luham in Saly...,At least 42 people are killed in a bus crash i...,Road Crash,October 2006,1.0,"[' ', 'The', 'bus', 'skidded', 'off', 'the', '..."
21954,21954,At least 27 migrants died off the Turkish coa...,At least 27 migrants die in a shipwreck in the...,Shipwreck,February 2016,2.0,"[' ', 'At', 'least', '27', 'migrants', 'died',..."
21956,21956,"Colten Treu, 21, and his roommate, John Stend...",Colten Treu faces charges of vehicular homicid...,Road Crash,November 2018,3.0,"[' ', 'Colten', 'Treu', ',', '21', ',', 'and',..."
21959,21959,At least 23 Thai nationals killed when their ...,At least 26 people are killed and several othe...,Road Crash,December 2010,9.0,"[' ', 'At', 'least', '23', 'Thai', 'nationals'..."
21963,21963,"Christian Kandlbauer, 22, lost both of his ar...",A car crash in Austria leads to the death of C...,Famous Person - Death,October 2010,12.0,"[' ', 'Christian', 'Kandlbauer', ',', '22', ',..."


In [48]:
df = pd.concat((df_noaug, df_oneaug), ignore_index=True)
print(f"{len(df) = }")
df.head()

len(df) = 33319


Unnamed: 0,id,text,title,event_type,date,source_doc_id,tokens
0,0,Vietnam's Communist Party Wednesday re-elected...,Vietnam reelects conservative Nguyễn Phú Trọng...,Government Job change - Election,January 2016,,"['Vietnam', ""'s"", 'Communist', 'Party', 'Wedne..."
1,1,Another 43 people were injured when the bus ca...,At least 42 people are killed in a bus crash i...,Road Crash,October 2006,,"['Another', '43', 'people', 'were', 'injured',..."
2,2,At least 27 migrants have died off the Turkish...,At least 27 migrants die in a shipwreck in the...,Shipwreck,February 2016,,"['At', 'least', '27', 'migrants', 'have', 'die..."
3,3,"Colten Treu, 21, and his roommate both told au...",Colten Treu faces charges of vehicular homicid...,Road Crash,November 2018,,"['Colten', 'Treu', ',', '21', ',', 'and', 'his..."
4,4,Bolivian President Evo Morales has resigned af...,"Hours after the announcement, Morales resigns ...",Government Job change - Resignation_Dismissal,November 2019,,"['Bolivian', 'President', 'Evo', 'Morales', 'h..."


In [49]:
df.reset_index(drop=True, inplace=True)
df.drop(columns=["id"], inplace=True)
df.reset_index(names="id", inplace=True)
print(f"{min(df.id) = }")
print(f"{max(df.id) = }")
df.head()

min(df.id) = 0
max(df.id) = 33318


Unnamed: 0,id,text,title,event_type,date,source_doc_id,tokens
0,0,Vietnam's Communist Party Wednesday re-elected...,Vietnam reelects conservative Nguyễn Phú Trọng...,Government Job change - Election,January 2016,,"['Vietnam', ""'s"", 'Communist', 'Party', 'Wedne..."
1,1,Another 43 people were injured when the bus ca...,At least 42 people are killed in a bus crash i...,Road Crash,October 2006,,"['Another', '43', 'people', 'were', 'injured',..."
2,2,At least 27 migrants have died off the Turkish...,At least 27 migrants die in a shipwreck in the...,Shipwreck,February 2016,,"['At', 'least', '27', 'migrants', 'have', 'die..."
3,3,"Colten Treu, 21, and his roommate both told au...",Colten Treu faces charges of vehicular homicid...,Road Crash,November 2018,,"['Colten', 'Treu', ',', '21', ',', 'and', 'his..."
4,4,Bolivian President Evo Morales has resigned af...,"Hours after the announcement, Morales resigns ...",Government Job change - Resignation_Dismissal,November 2019,,"['Bolivian', 'President', 'Evo', 'Morales', 'h..."


In [51]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

def custom_kfold(n_splits: int, df):
    df_noaug = df.loc[df.source_doc_id.isna(), ["id", "tokens", "event_type"]]
    df_aug = df.loc[~df.source_doc_id.isna(), ["id", "tokens", "event_type"]]
    skf = StratifiedKFold(n_splits, shuffle=True)
    # we need to yield (train, test) indices
    for (train_idx, test_idx) in skf.split(df_noaug.tokens, df_noaug.event_type):
        # convert indices from [0, n-1] to id
        # train_ids = df_noaug.loc[train_idx, "id"]
        # test_ids = df_noaug.loc[test_idx, "id"]
        #
        # # combine train_ids with ids from df_aug
        # train_ids = pd.concat((train_ids, df_aug.id))
        yield np.concatenate((train_idx, df_aug.index.values)), test_idx

In [52]:
ckf = custom_kfold(n_splits=5, df=df)

In [53]:
train_idx, text_idx = next(ckf)

In [54]:
np.min(train_idx), np.max(train_idx)

(0, 33318)

In [55]:
np.min(text_idx), np.max(text_idx)

(11, 21941)

In [56]:
df.iloc[train_idx]

Unnamed: 0,id,text,title,event_type,date,source_doc_id,tokens
0,0,Vietnam's Communist Party Wednesday re-elected...,Vietnam reelects conservative Nguyễn Phú Trọng...,Government Job change - Election,January 2016,,"['Vietnam', ""'s"", 'Communist', 'Party', 'Wedne..."
1,1,Another 43 people were injured when the bus ca...,At least 42 people are killed in a bus crash i...,Road Crash,October 2006,,"['Another', '43', 'people', 'were', 'injured',..."
2,2,At least 27 migrants have died off the Turkish...,At least 27 migrants die in a shipwreck in the...,Shipwreck,February 2016,,"['At', 'least', '27', 'migrants', 'have', 'die..."
3,3,"Colten Treu, 21, and his roommate both told au...",Colten Treu faces charges of vehicular homicid...,Road Crash,November 2018,,"['Colten', 'Treu', ',', '21', ',', 'and', 'his..."
4,4,Bolivian President Evo Morales has resigned af...,"Hours after the announcement, Morales resigns ...",Government Job change - Resignation_Dismissal,November 2019,,"['Bolivian', 'President', 'Evo', 'Morales', 'h..."
...,...,...,...,...,...,...,...
33314,33314,Signs along the banks of the Swan River warni...,Swan River health hangs in balance as climate ...,Environment Pollution,,21943.0,"[' ', 'Signs', 'along', 'the', 'banks', 'of', ..."
33315,33315,Winter storms on England's Suffolk coast have...,Storms Reveal Two Historic Shipwrecks on Engla...,Shipwreck,,21944.0,"[' ', 'Winter', 'storms', 'on', 'England', ""'s..."
33316,33316,US and Macedonian troops have started a 14-da...,Macedonia Hosts Joint Military Exercise With U...,Military Exercise,,21945.0,"[' ', 'US', 'and', 'Macedonian', 'troops', 'ha..."
33317,33317,Mark Labbett is favourite among the British p...,ITV Beat The Chasers: Mark Labbett's marriage ...,Famous Person - Marriage,,21946.0,"[' ', 'Mark', 'Labbett', 'is', 'favourite', 'a..."


In [57]:
df.iloc[text_idx]

Unnamed: 0,id,text,title,event_type,date,source_doc_id,tokens
11,11,Police have opened three probes involving L'Or...,"Four people, including celebrity photographer ...",Famous Person - Commit Crime - Accuse,July 2010,,"['Police', 'have', 'opened', 'three', 'probes'..."
14,14,"MOSCOW, March 31. /TASS/. The Russian Ministry...",The Russian Ministry of Internal Affairs detai...,Famous Person - Commit Crime - Arrest,March 2018,,"['MOSCOW', ',', 'March', '31', '.', '/TASS/.',..."
19,19,They say women and children were among the dea...,Witnesses and hospital officials say that 22 I...,Armed Conflict,June 2004,,"['They', 'say', 'women', 'and', 'children', 'w..."
23,23,"PC Ben Hannam, 22, becomes the first serving B...",22-year-old Metropolitan Police Constable Ben ...,Famous Person - Commit Crime - Sentence,April 2021,,"['PC', 'Ben', 'Hannam', ',', '22', ',', 'becom..."
25,25,The casualties were travelling on two buses ne...,At least three people have been killed in bomb...,Riot,February 2007,,"['The', 'casualties', 'were', 'travelling', 'o..."
...,...,...,...,...,...,...,...
21921,21921,"Fort McMurray's Welcome Centre, which helps co...",Fort McMurray Welcome Centre carries on despit...,Organization Closed,,,"['Fort', 'McMurray', ""'s"", 'Welcome', 'Centre'..."
21925,21925,The bond between a human being and a dog is un...,Loyal Dog Still Waiting Outside Mexican Coal M...,Mine Collapses,,,"['The', 'bond', 'between', 'a', 'human', 'bein..."
21928,21928,The 1983 IIHF European U18 Championship was th...,1983 IIHF European U18 Championship,Sports Competition,,,"['The', '1983', 'IIHF', 'European', 'U18', 'Ch..."
21938,21938,Ute driver survives crash that turned his Ford...,Ute driver survives crash that turned his Ford...,Road Crash,,,"['Ute', 'driver', 'survives', 'crash', 'that',..."


In [60]:
from ast import literal_eval
from sklearn.model_selection import cross_validate

# df.loc[:, "tokens"] = df.tokens.apply(literal_eval)

scores = cross_validate(
    pipeline,
    df.tokens.values, df.event_type.values,
    scoring="classification_report",
    cv=custom_kfold(n_splits=2, df=df)
)
print(f"{np.mean(scores) = }")
print(f"{np.var(scores) = }")
scores

ValueError: 'classification_report' is not a valid scoring value. Use sklearn.metrics.get_scorer_names() to get valid options.

In [27]:
scores

{'fit_time': array([134.6400187 , 127.5815897 , 124.69714284, 124.12445068,
        124.87236166]),
 'score_time': array([25.40451574, 23.05297184, 23.08283091, 23.11950517, 22.78396249]),
 'test_score': array([0.84827675, 0.84831069, 0.84607286, 0.83410964, 0.84018611])}

In [28]:
from typing import Callable


# okay now we can try filtering only the duplicate examples
def subsample_aug(
        df: pd.DataFrame,
        subsampler: Callable[[pd.DataFrame], pd.DataFrame]
):
    # split into aug and non-aug
    df_aug = df.loc[~df.source_doc_id.isna(), :]
    df_source = df.loc[df.source_doc_id.isna(), :]

    # subsample the df_aug
    df_aug = subsampler(df_aug)

    # concatenate, ignoring the index
    df = pd.concat((df_source, df_aug), ignore_index=True)

    # normalize id column

    df.drop(columns=["id"], inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.reset_index(names="id", inplace=True)

    return df

In [32]:
def subsample_one_per_source(df: pd.DataFrame):
    return df.groupby("source_doc_id").sample(1)

def subsample_unique_text(df: pd.DataFrame):
    return df.groupby("text").sample(1)

In [39]:
df_ops = subsample_aug(df, subsample_one_per_source)
df_ops.head()
print(f"{len(df_ops) = }")

len(df_ops) = 33319


In [40]:
df_unique_text = subsample_aug(df, subsample_unique_text)
df_unique_text.head()
print(f"{len(df_unique_text) = }")

len(df_unique_text) = 51803


In [41]:
df_all = subsample_aug(df, identity)
df_all.head()
print(f"{len(df_all) = }")

len(df_all) = 56059
