In [23]:
import pandas as pd
import numpy as np

In [11]:
# Load the full dataset. Later cells treat rows with a NaN `source_doc_id` as
# non-augmented and the rest as augmented, and parse `tokens` with literal_eval,
# i.e. the column is stored in the CSV as the string repr of a list.
df = pd.read_csv("../data/docee/all/beam.aug_title.tok_notitle.csv")
# Last expression of the cell: renders a preview of the first rows.
df.head()

Unnamed: 0,id,text,title,event_type,date,source_doc_id,tokens
0,0,Vietnam's Communist Party Wednesday re-elected...,Vietnam reelects conservative Nguyễn Phú Trọng...,Government Job change - Election,January 2016,,"['Vietnam', ""'s"", 'Communist', 'Party', 'Wedne..."
1,1,Another 43 people were injured when the bus ca...,At least 42 people are killed in a bus crash i...,Road Crash,October 2006,,"['Another', '43', 'people', 'were', 'injured',..."
2,2,At least 27 migrants have died off the Turkish...,At least 27 migrants die in a shipwreck in the...,Shipwreck,February 2016,,"['At', 'least', '27', 'migrants', 'have', 'die..."
3,3,"Colten Treu, 21, and his roommate both told au...",Colten Treu faces charges of vehicular homicid...,Road Crash,November 2018,,"['Colten', 'Treu', ',', '21', ',', 'and', 'his..."
4,4,Bolivian President Evo Morales has resigned af...,"Hours after the announcement, Morales resigns ...",Government Job change - Resignation_Dismissal,November 2019,,"['Bolivian', 'President', 'Evo', 'Morales', 'h..."


In [3]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from src.utils import identity
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over pre-tokenized documents feeding a linear SVM.
# `identity` is plugged in as both tokenizer and preprocessor so that each
# document is handed to the n-gram stage as-is (presumably `identity` returns
# its argument unchanged -- it lives in src.utils, TODO confirm).
pipeline = make_pipeline(
    TfidfVectorizer(
        tokenizer=identity,
        preprocessor=identity,
        # With a custom tokenizer the default `token_pattern` is never used;
        # setting it to None states that explicitly and avoids scikit-learn's
        # "token_pattern will not be used" UserWarning.
        token_pattern=None,
        min_df=3,
        max_df=0.95,
        ngram_range=(1, 3),
        # scikit-learn skips its built-in lowercasing whenever a custom
        # `preprocessor` is supplied, so the former `lowercase=True` was a
        # silent no-op. Keep the flag honest; lowercase the tokens upstream if
        # case-folding is actually wanted.
        lowercase=False,
    ),
    LinearSVC(verbose=True),
    verbose=True,
)

In [12]:
# Partition the frame by provenance: rows without a source_doc_id are the
# non-augmented ones, rows carrying a source_doc_id are the augmented ones.
df_noaug = df[df["source_doc_id"].isna()]
df_aug = df[df["source_doc_id"].notna()]

print(f"{len(df_noaug) = }")
print(f"{len(df_aug) = }")
# The two partitions together must account for every row.
assert len(df) == len(df_noaug) + len(df_aug)

len(df_noaug) = 21949
len(df_aug) = 34110


In [5]:
# Total row count of the combined frame (non-augmented + augmented rows).
len(df)

56059

In [13]:
from src.data import subsample_unique_text
# For simplicity, keep a single augmented example per distinct `text`.
# NOTE(review): the helper is named subsample_unique_text, so this de-duplicates
# by text rather than keeping one example per source_doc_id -- confirm against
# its definition in src.data.
df_aug = subsample_unique_text(df_aug)

# Stack non-augmented rows first, then the retained augmented rows, and
# regenerate `id` as 0..n-1 in the first column. `ignore_index=True` already
# yields a fresh RangeIndex, so no additional index reset is needed.
# `source_doc_id` values are left untouched.
df = (
    pd.concat((df_noaug, df_aug), ignore_index=True)
    .drop(columns="id")
    .reset_index(names="id")
)
df.head()


Unnamed: 0,id,text,title,event_type,date,source_doc_id,tokens
0,0,Vietnam's Communist Party Wednesday re-elected...,Vietnam reelects conservative Nguyễn Phú Trọng...,Government Job change - Election,January 2016,,"['Vietnam', ""'s"", 'Communist', 'Party', 'Wedne..."
1,1,Another 43 people were injured when the bus ca...,At least 42 people are killed in a bus crash i...,Road Crash,October 2006,,"['Another', '43', 'people', 'were', 'injured',..."
2,2,At least 27 migrants have died off the Turkish...,At least 27 migrants die in a shipwreck in the...,Shipwreck,February 2016,,"['At', 'least', '27', 'migrants', 'have', 'die..."
3,3,"Colten Treu, 21, and his roommate both told au...",Colten Treu faces charges of vehicular homicid...,Road Crash,November 2018,,"['Colten', 'Treu', ',', '21', ',', 'and', 'his..."
4,4,Bolivian President Evo Morales has resigned af...,"Hours after the announcement, Morales resigns ...",Government Job change - Resignation_Dismissal,November 2019,,"['Bolivian', 'President', 'Evo', 'Morales', 'h..."


In [14]:
# Sanity check on the regenerated `id` column: the number of distinct ids
# should equal the number of rows.
print(f"{len(df) = }")
print(f"{df.id.nunique() = }")

len(df) = 51803
df.id.nunique() = 51803


In [15]:
# Both the frame index and the `id` column must increase by exactly one from
# row to row; fail on the first adjacent pair that breaks this.
for sequence in (df.index.values, df.id.values):
    for i, j in zip(sequence[:-1], sequence[1:]):
        if j != i + 1:
            raise ValueError(f"{i = }; {j = }")


In [16]:
from src.data import custom_kfold

# Build the project's custom 5-fold splitter over the combined frame.
# It is consumed with next() in the following cell, so it acts as an iterator of
# (train_idx, test_idx) pairs; the split policy itself lives in src.data and is
# not visible in this notebook.
ckf = custom_kfold(n_splits=5, df=df)

In [17]:
# Pull only the first fold to inspect what the splitter yields.
# (The diagnostic lines printed by this cell come from custom_kfold itself.)
train_idx, test_idx = next(ckf)

len(df_noaug) = 21949
len(df_aug) = 29854
test ids range from 2 to 21941
Source doc ids in aug range from 1.0 to 21948.0
From 29854 examples in df_aug, retained only 23856 for which source is not in test set.


In [18]:
# Inspect the value range of the train and test indices of the first fold.
# NOTE(review): in the recorded run the test indices stay below the number of
# non-augmented rows, which suggests test folds are drawn from non-augmented
# rows only -- confirm in custom_kfold.
print(f"{min(train_idx) = }")
print(f"{max(train_idx) = }")
print(f"{min(test_idx) = }")
print(f"{max(test_idx) = }")

min(train_idx) = 0
max(train_idx) = 51802
min(test_idx) = 2
max(test_idx) = 21941


In [19]:
# Check that the frame index spans 0..len(df)-1 without duplicates, so the fold
# indices above can be used positionally with .iloc.
print(f"{min(df.index) = }")
print(f"{max(df.index) = }")
print(f"{df.index.nunique() = }")

min(df.index) = 0
max(df.index) = 51802
df.index.nunique() = 51803


In [20]:
# Sizes of the first fold's train and test subsets.
# These need not add up to len(df): per the splitter's own log output, augmented
# rows whose source document is in the test set are left out of training.
print(f"{len(df.iloc[train_idx]) =}")
print(f"{len(df.iloc[test_idx]) =}")


len(df.iloc[train_idx]) =41415
len(df.iloc[test_idx]) =4390


In [21]:
# Re-create the splitter (the previous one was partially consumed by next()).
# NOTE(review): no later cell visible in this notebook reads `ckf` -- the
# cross_validate call builds its own splitter -- so this may be dead code.
ckf = custom_kfold(n_splits=5, df=df)

In [22]:
from ast import literal_eval
from sklearn.model_selection import cross_validate

# `tokens` is read from the CSV as the string repr of a list; parse it back into
# a real list. The isinstance guard keeps this cell re-runnable: once the column
# holds lists, calling literal_eval on them again would raise ValueError.
df["tokens"] = df["tokens"].map(
    lambda toks: literal_eval(toks) if isinstance(toks, str) else toks
)

# Macro-F1 of the TF-IDF + LinearSVC pipeline under the project's custom
# splitter. A fresh splitter is created inline because it is passed to
# cross_validate as an iterable of (train, test) index pairs.
scores = cross_validate(
    pipeline,
    df.tokens.values, df.event_type.values,
    scoring="f1_macro",
    cv=custom_kfold(n_splits=2, df=df),
)
scores

len(df_noaug) = 21949
len(df_aug) = 29854
test ids range from 0 to 21945
Source doc ids in aug range from 1.0 to 21948.0
From 29854 examples in df_aug, retained only 14885 for which source is not in test set.
test ids range from 3 to 21948
Source doc ids in aug range from 1.0 to 21948.0
From 29854 examples in df_aug, retained only 14969 for which source is not in test set.
[Pipeline] ... (step 1 of 2) Processing tfidfvectorizer, total=  24.4s
[LibLinear]...*
optimization finished, #iter = 36
Objective value = -132.664163
nSV = 1876
...*
optimization finished, #iter = 35
Objective value = -577.371007
nSV = 4172
...*
optimization finished, #iter = 36
Objective value = -204.493715
nSV = 7159
...*
optimization finished, #iter = 35
Objective value = -132.198742
nSV = 5069
...*
optimization finished, #iter = 35
Objective value = -144.648382
nSV = 4861
...*
optimization finished, #iter = 36
Objective value = -422.538625
nSV = 7239
...*
optimization finished, #iter = 35
Objective value = -229.

{'fit_time': array([75.60204053, 73.72294497]),
 'score_time': array([23.10430622, 13.39590716]),
 'test_score': array([0.78684909, 0.78793486])}

In [24]:
# Summary of the per-fold macro-F1 values as a (mean, variance) pair;
# the variance uses NumPy's default ddof=0.
scores["test_score"].mean(), scores["test_score"].var()

(0.7873919746221218, 2.947251547254414e-07)

In [28]:
from typing import Callable


# okay now we can try filtering only the duplicate examples
def subsample_aug(
        df: pd.DataFrame,
        subsampler: Callable[[pd.DataFrame], pd.DataFrame]
) -> pd.DataFrame:
    """Subsample only the augmented rows of ``df`` and renumber ``id``.

    Rows whose ``source_doc_id`` is NaN are treated as source (non-augmented)
    rows and are always kept; every other row is handed to ``subsampler``.

    Args:
        df: Frame with a ``source_doc_id`` column. It is not modified.
        subsampler: Callable that receives the augmented rows and returns the
            subset of them to keep.

    Returns:
        A new frame containing the source rows followed by the retained
        augmented rows, with a fresh 0..n-1 index and a matching ``id`` column
        in first position. ``source_doc_id`` values are copied through
        unchanged, i.e. they are not remapped to the regenerated ids.
    """
    # split into aug and non-aug
    is_source = df["source_doc_id"].isna()
    df_source = df.loc[is_source]

    # subsample the augmented part only
    df_aug = subsampler(df.loc[~is_source])

    # ignore_index=True already yields a clean RangeIndex, so a separate
    # reset_index(drop=True) is unnecessary.
    combined = pd.concat((df_source, df_aug), ignore_index=True)

    # Normalize the id column. errors="ignore" lets frames that carry no `id`
    # column through instead of raising KeyError.
    return combined.drop(columns=["id"], errors="ignore").reset_index(names="id")

In [32]:
def subsample_one_per_source(df: pd.DataFrame, random_state=None) -> pd.DataFrame:
    """Keep one randomly chosen row for each distinct ``source_doc_id``.

    Args:
        df: Frame with a ``source_doc_id`` column.
        random_state: Optional seed (or NumPy random generator) forwarded to
            pandas so the draw can be reproduced. The default ``None`` keeps
            the previous unseeded behavior.

    Returns:
        A frame with exactly one row per ``source_doc_id`` group, retaining the
        original index labels of the selected rows. With pandas' default
        groupby settings, rows whose key is NaN form no group and are therefore
        not returned.
    """
    return df.groupby("source_doc_id").sample(n=1, random_state=random_state)

def subsample_unique_text(df: pd.DataFrame, random_state=None) -> pd.DataFrame:
    """Keep one randomly chosen row for each distinct ``text`` value.

    NOTE(review): a function of the same name is imported from ``src.data``
    earlier in this notebook; from this cell on, this definition shadows it.

    Args:
        df: Frame with a ``text`` column.
        random_state: Optional seed (or NumPy random generator) forwarded to
            pandas so the draw can be reproduced. The default ``None`` keeps
            the previous unseeded behavior.

    Returns:
        A frame with exactly one row per distinct ``text``, retaining the
        original index labels of the selected rows.
    """
    return df.groupby("text").sample(n=1, random_state=random_state)

In [39]:
# Variant: keep one augmented example per source document.
df_ops = subsample_aug(df, subsample_one_per_source)
print(f"{len(df_ops) = }")
# Preview goes last: a bare expression in the middle of a cell is evaluated and
# discarded, so only the final expression is rendered.
df_ops.head()

len(df_ops) = 33319


In [40]:
# Variant: keep one augmented example per distinct text.
df_unique_text = subsample_aug(df, subsample_unique_text)
print(f"{len(df_unique_text) = }")
# Preview goes last: a bare expression in the middle of a cell is evaluated and
# discarded, so only the final expression is rendered.
df_unique_text.head()

len(df_unique_text) = 51803


In [41]:
# Variant: keep every augmented example (`identity` is used as the subsampler;
# presumably it returns the augmented rows unchanged).
# NOTE(review): the recorded row count for this cell equals the size of the raw
# CSV, while `df` as rebuilt by the earlier cells of this notebook is smaller --
# the recorded run apparently used a re-loaded `df`. Confirm which frame is the
# intended input before relying on these numbers.
df_all = subsample_aug(df, identity)
print(f"{len(df_all) = }")
# Preview goes last: a bare expression in the middle of a cell is evaluated and
# discarded, so only the final expression is rendered.
df_all.head()

len(df_all) = 56059
