In [31]:
import pickle
import numpy as np
from tqdm import tqdm
from collections import Counter

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from MKLpy.algorithms import GRAM, MEMO, RMKL
from MKLpy.scheduler  import ReduceOnWorsening
from MKLpy.callbacks  import EarlyStopping, Monitor
from MKLpy import generators
from MKLpy.preprocessing import kernel_normalization, normalization, rescale_01, rescale, centering
from MKLpy.model_selection import train_test_split as mkl_train_test_split, cross_val_score

from imblearn.under_sampling import RandomUnderSampler

In [51]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.dummy import DummyClassifier

In [2]:
with open('data/features.json.pickle', 'rb') as f:
    d = pickle.load(f)

In [3]:
len(d)

44279

# Format data

## Features

In [12]:
d[0]

{'premises': [{'text': 'Why is it that so-called christians, Because there is no such a thing as a christian, Have serious trouble as READING and COMPREHENDING? Its not that difficult, Nor is it that hard. It was stated unto you a very simple "* "You are asking why God would forgive the murderer. " OK we"re done. You paid absolutely no attention whatsoever to the verses presented and instead went off into your own la la land. " But nah, All you did was babble on and on and on. So in this sense, It was YOU that forfeited. Sheesh! Bye.',
   'stance': 'PRO',
   'sentences': ['Why is it that so-called christians, Because there is no such a thing as a christian, Have serious trouble as READING and COMPREHENDING?',
    'Its not that difficult, Nor is it that hard.',
    'It was stated unto you a very simple "* "You are asking why God would forgive the murderer. "',
    'OK we"re done.',
    'You paid absolutely no attention whatsoever to the verses presented and instead went off into your ow

In [4]:
def blow_up_references(ref_arr, length):
    arr = np.full(length, -1)
    for i in ref_arr:
        arr[i] = 1
    return arr

In [5]:
X = list()
y = list()

for argument in tqdm(d):
    premise = argument['premises'][0]
    n = len(premise['sentences'])
    labels = blow_up_references(argument['reference'], n)
    for i in range(n):
        x = np.full(5, 0)
        x[0] = premise['position'][i]
        x[1] = premise['word_counts'][i]
        x[2] = premise['noun_counts'][i]
        x[4] = premise['tfisf'][i]
        x[5] = premise['lr'][i]
        X.append(x)
        y.append(labels[i])
        
X = np.array(X)
y = np.array(y)

100%|██████████████████████████████████████████████████████████████████████████| 44279/44279 [00:06<00:00, 6458.39it/s]


In [6]:
X.shape, y.shape

((1063129, 5), (1063129,))

scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)

X

## Balancing
We have high imbalance, and too much data to fit in memory, thus undersampling fits neatly in the setting.

In [7]:
Counter(y)

Counter({-1: 976553, 1: 86576})

In [8]:
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)

In [9]:
X_res.shape, y_res.shape

((173152, 5), (173152,))

In [10]:
Counter(y_res)

Counter({-1: 86576, 1: 86576})

In [12]:
orig_size = len(X)
res_size = len(X_res)
print(f'Reduction from {orig_size} to {res_size} ({(orig_size-res_size)/orig_size})')

Reduction from 1063129 to 173152 (0.8371298309048102)


In [13]:
indices = list(range(res_size))
selected_indices = np.random.choice(indices, 1000)

In [14]:
X_res = np.take(X_res, selected_indices, axis=0)
y_res = np.take(y_res, selected_indices, axis=0)
X_res.shape, y_res.shape

((1000, 5), (1000,))

In [15]:
X_res = rescale_01(X_res)

In [16]:
X_res

tensor([[1.0000, 0.0522, 0.1129, 0.0000, 0.0000],
        [1.0000, 0.0672, 0.0806, 0.0000, 0.0000],
        [1.0000, 0.0522, 0.0968, 0.0000, 0.0000],
        ...,
        [1.0000, 0.0858, 0.0806, 0.0000, 0.5000],
        [1.0000, 0.1455, 0.1774, 0.0000, 0.0000],
        [1.0000, 0.0149, 0.0323, 0.0000, 0.0000]], dtype=torch.float64)

In [65]:
Counter(y_res)

Counter({1: 499, -1: 501})

# Training

## Kernels

In [18]:
KL = generators.RBF_generator(X_res, gamma = [.001, .01, .1], cache=False)

In [19]:
KL = [kernel_normalization(K) for K in KL]

In [24]:
KLtr, KLte, ytr, yte = mkl_train_test_split(KL, y_res, test_size=.3, random_state=42)

In [25]:
monitor = Monitor()

In [26]:
earlystop = EarlyStopping(
    KLte, yte,      #validation data, KL is a validation kernels list
    patience=5,     #max number of acceptable negative steps
    cooldown=1,     #how ofter we run a measurement, 1 means every optimization step
    metric='roc_auc',#the metric we monitor, roc_auc or accuracy
)

In [27]:
#ReduceOnWorsening automatically reduces the 
#learning rate when a worsening solution occurs
scheduler = ReduceOnWorsening()

In [55]:
def scr(estimator, X):
    y_pred = estimator.predict(KLte)
    print(f'f1_score\t: {f1_score(yte, y_pred)}')
    print(f'accuracy\t: {accuracy_score(yte, y_pred)}')
    print(f'recall\t\t: {recall_score(yte, y_pred)}')
    print(f'precision\t: {precision_score(yte, y_pred)}')

In [None]:
mkl = GRAM(
    max_iter=1000,          
    learning_rate=.01,      
    callbacks=[earlystop, monitor],
    scheduler=scheduler
).fit(KLtr, ytr)

In [59]:
memo = MEMO(
    theta = 10.0,
    min_margin = 1e-4,
    solver = 'auto',
    callbacks=[earlystop, monitor],
    scheduler = scheduler
).fit(KLtr, ytr)

scr(memo, KLte)

torch.Size([3, 3])
torch.Size([3]) torch.Size([700]) torch.Size([3]) torch.Size([700]) torch.Size([700, 700]) torch.Size([3, 3]) 3
ok
f1_score	: 0.6894409937888198
accuracy	: 0.6666666666666666
recall		: 0.6727272727272727
precision	: 0.7070063694267515


In [64]:
rmkl = RMKL(
    C = 1.0,
    callbacks=[earlystop, monitor],
    scheduler = scheduler
)
rmkl.fit(KLtr, ytr)

scr(rmkl, KLte)

f1_score	: 0.5322580645161291
accuracy	: 0.6133333333333333
recall		: 0.4
precision	: 0.7951807228915663


In [36]:
scores = cross_val_score(KLtr, ytr, memo, n_folds=3, scoring='accuracy')

torch.Size([3, 3])
torch.Size([3]) torch.Size([466]) torch.Size([3]) torch.Size([466]) torch.Size([466, 466]) torch.Size([3, 3]) 3
ok


RuntimeError: size mismatch, get 300, 300x700,466