In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell runs, so edits to the helper modules take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [4]:
from utills import get_files_in_dir, get_file_nums, load_preprocessed_paras, paragraph_indecies, load_ground_truth_from_nums, order_files_multi_years
from sklearn.preprocessing import StandardScaler
from features import get_transformer
from tqdm.auto import tqdm
import numpy as np
from sklearn.linear_model import SGDClassifier, LogisticRegression
import pickle
from utills import cartesian_product
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import RandomizedSearchCV

In [5]:
# Prefix of the preprocessed input files; a per-corpus sub-path is appended to it below.
PREPROCESSED_DIR = 'preprocessed_data/'
# Prefix of the ground-truth directories.
# NOTE(review): the ground-truth loading cell appends 'pan2021/train/' to this value, which
# yields 'data/pan2021/train/pan2021/train/'. The disabled loaders append 'pan2019/...' and
# 'pan2020/...', which would only make sense for a base of 'data/' — confirm the intended layout.
GROUND_TRUTH_BASE_DIR = 'data/pan2021/train/'
# File the fitted (transformer, scalers, classifier) bundle is pickled to at the end of the notebook.
MODEL_PATH = 'temp_data/model.p'

Load Data
===

In [6]:
# List the preprocessed 2021 training files and derive their ordered, corpus-tagged ids.
# presumably each id carries a '_2021' suffix (it is stripped again when the ground truth
# is loaded) — confirm against utills.order_files_multi_years.
file_names_2021 = get_files_in_dir(PREPROCESSED_DIR + '2021/train/')
file_nums_2021 = order_files_multi_years(file_names_2021, '2021')

# Disabled: PAN 2020 "narrow" corpus.
# file_names_2020_narrow = get_files_in_dir(PREPROCESSED_DIR + '2020/train/dataset-narrow/')
# file_nums_2020_narrow = order_files_multi_years(file_names_2020_narrow, '2020_narrow')

# Disabled: PAN 2020 "wide" corpus.
# file_names_2020_wide = get_files_in_dir(PREPROCESSED_DIR + '2020/train/dataset-wide/')
# file_nums_2020_wide = order_files_multi_years(file_names_2020_wide, '2020_wide')

# Disabled: PAN 2019 corpus.
# file_names_2019 = get_files_in_dir(PREPROCESSED_DIR + '2019/train/')
# file_nums_2019 = order_files_multi_years(file_names_2019, '2019')

In [7]:
# Only the 2021 corpus is active; the other corpora are commented out above.
all_file_names = file_names_2021 
all_preprocessed_paras = load_preprocessed_paras(all_file_names)
# Per-file paragraph index ranges. Downstream code reads each value as (start, end) with
# `end` exclusive; presumably these index into all_preprocessed_paras — confirm in utills.
paragraph_indecies_dict = paragraph_indecies(all_file_names)

In [7]:
# Load the 2021 ground truth for the same files, in the same order as file_nums_2021;
# the '_2021' tag is removed from each id before lookup.
# NOTE(review): with GROUND_TRUTH_BASE_DIR = 'data/pan2021/train/' the directory passed here
# is 'data/pan2021/train/pan2021/train/' — confirm this doubled path is intended.
ground_truth_2021 = load_ground_truth_from_nums(
    GROUND_TRUTH_BASE_DIR + 'pan2021/train/', 
    [fn.replace('_2021', '') for fn in file_nums_2021]
)

# Disabled: PAN 2019 ground truth.
# ground_truth_2019 = load_ground_truth_from_nums(
#     GROUND_TRUTH_BASE_DIR + 'pan2019/train/', 
#     [fn.replace('_2019', '') for fn in file_nums_2019]
# )

# Disabled: PAN 2020 "wide" ground truth.
# ground_truth_2020_wide = load_ground_truth_from_nums(
#     GROUND_TRUTH_BASE_DIR + 'pan2020/train/dataset-wide/', 
#     [fn.replace('_2020_wide', '') for fn in file_nums_2020_wide]
# )

# Disabled: PAN 2020 "narrow" ground truth.
# ground_truth_2020_narrow = load_ground_truth_from_nums(
#     GROUND_TRUTH_BASE_DIR + 'pan2020/train/dataset-narrow/', 
#     [fn.replace('_2020_narrow', '') for fn in file_nums_2020_narrow]
# )

In [8]:
# Notebook-level ground truth. generate_training_pairs reads this global and indexes it by
# position, so its order has to match the iteration order of paragraph_indecies_dict.
ground_truth = ground_truth_2021 

Fit Transformers
===

In [9]:
def fit_transformers(docs):
    """
        Fit the feature transformer and a standard scaler on the given documents.

        Parameters:
            docs: the documents passed to the transformer's `fit_transform`
                (presumably the preprocessed paragraphs — confirm against get_transformer).

        Returns:
            (X, transformer, scaler): the scaled dense feature matrix, the fitted
            transformer and the fitted StandardScaler.
    """
    transformer = get_transformer()
    scaler = StandardScaler()

    # Densify with toarray() so a plain ndarray is produced. todense() yields the
    # deprecated np.matrix type, which recent scikit-learn releases refuse as input.
    X = transformer.fit_transform(docs).toarray()
    X = scaler.fit_transform(X)
    return X, transformer, scaler


def diff_vectors(file_paragraph_indecies, feature_matrix):
    """
        Build the absolute difference vectors of consecutive paragraphs.

        Each value of `file_paragraph_indecies` holds, at positions 0 and 1, the first
        row and the end row (exclusive) of one file inside `feature_matrix`. Within
        that range every row is subtracted from its successor and the absolute value
        is kept. Returns a flat list with one entry per adjacent pair, following the
        iteration order of the dictionary.
    """
    adjacent_diffs = []

    for span in tqdm(file_paragraph_indecies.values()):
        start, stop = span[0], span[1]
        # `row` runs up to stop - 2 so that `row + 1` never leaves the file's range.
        adjacent_diffs.extend(
            np.abs(feature_matrix[row + 1] - feature_matrix[row])
            for row in range(start, stop - 1)
        )

    return adjacent_diffs

def comparison_idxs(n):
    """
        Enumerate every unordered pair of positions among `n` items.

        Parameters:
            n: number of items.

        Returns:
            Integer array of shape (n * (n - 1) / 2, 2) whose rows are [i, j] with
            j < i, ordered by i and then by j. For n < 2 the result has shape (0, 2),
            so column indexing such as `idxs[:, 0]` remains valid.
    """
    # The strictly-lower-triangle indices are produced row by row, which is the same
    # order as the nested loops `for i in range(n): for j in range(i)`.
    rows, cols = np.tril_indices(max(n, 0), k=-1)
    return np.column_stack((rows, cols))


def generate_training_pairs(paragraph_indecies_dict, X, ground_truth_docs=None):
    """
        Build training samples from all paragraph pairs within each document.

        Parameters:
            paragraph_indecies_dict: mapping whose values are (start, end) row ranges
                into `X` (`end` exclusive), one entry per document.
            X: feature matrix with one row per paragraph.
            ground_truth_docs: per-document ground truth, aligned by position with
                `paragraph_indecies_dict`; every entry must provide a
                'paragraph-authors' sequence. Defaults to the notebook-level
                `ground_truth`.

        Returns:
            (X_diff, Y): the absolute feature difference of every pair and, per pair,
            a boolean that is True when the two paragraphs have different authors.
            Both are empty arrays when no document has at least two paragraphs.
    """
    if ground_truth_docs is None:
        # Backward-compatible default: fall back to the notebook global.
        ground_truth_docs = ground_truth

    diff_chunks = []
    label_chunks = []
    for i, (s, e) in enumerate(paragraph_indecies_dict.values()):
        num_paras = e - s
        if num_paras < 2:
            # Fewer than two paragraphs means there is no pair to compare; skip the
            # document instead of column-indexing an empty index array.
            continue

        para_authors = np.array(ground_truth_docs[i]['paragraph-authors'])
        para_idxs = np.arange(s, e)
        idxs = comparison_idxs(num_paras)
        left, right = idxs[:, 0], idxs[:, 1]

        label_chunks.append(para_authors[left] != para_authors[right])
        diff_chunks.append(np.abs(X[para_idxs[left]] - X[para_idxs[right]]))

    if not diff_chunks:
        return np.array([]), np.array([])

    # Stack whole per-document blocks once instead of collecting single rows.
    X_diff = np.concatenate(diff_chunks)
    Y = np.concatenate(label_chunks)
    return X_diff, Y

In [None]:
# Fit the feature pipeline on all paragraphs, then build the adjacent-paragraph differences.
X, transformer, primary_scaler = fit_transformers(all_preprocessed_paras)
X_diff = diff_vectors(paragraph_indecies_dict, X)
# Labels for the adjacent pairs: the per-document 'changes' lists, concatenated in
# ground_truth order.
Y = []
for gt in ground_truth:
    Y.extend(gt['changes'])
    
# Sanity check: exactly one label per adjacent-paragraph difference.
assert len(Y) == len(X_diff)

# Additional samples: every paragraph pair within a document, labelled True when the
# two paragraphs have different authors.
X_diff_2, Y_2 = generate_training_pairs(paragraph_indecies_dict, X)


In [13]:
# Training set = adjacent-pair samples followed by the all-pairs samples.
# NOTE(review): the all-pairs set also contains every adjacent pair (i, i-1), so adjacent
# pairs occur twice in XX/YY — confirm this extra weighting is intended.
XX = np.concatenate([X_diff, X_diff_2])
YY = np.concatenate([Y, Y_2])

# Alternative: train on the adjacent pairs only.
# XX = X_diff
# YY = Y

In [14]:
# Fit secondary scaler
secondary_scaler = StandardScaler()
XX = secondary_scaler.fit_transform(XX)

Train the classifier
===

In [None]:
# Randomized search over the regularisation parameter C, sampled log-uniformly
# between 1e-4 and 1.
param_dist = {'C': loguniform(1e-4, 1e0)}

clf = LogisticRegression(class_weight='balanced', max_iter=1000)
n_iter_search = 20
# Fixed seed: without it the sampled C candidates, and therefore best_params_, differ
# from run to run and the notebook is not reproducible.
search_seed = 0
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    random_state=search_seed,
    verbose=2,
)
random_search.fit(XX, YY)

In [23]:
# Replace the unfitted template classifier with the best estimator found by the search;
# this is the model that gets inspected and pickled below.
clf = random_search.best_estimator_
# Display the winning hyper-parameters as the cell output.
random_search.best_params_

{'C': 0.005960555748827062}

In [16]:
# Show the 20 features with the largest absolute coefficient in the fitted classifier.
# NOTE(review): scikit-learn removed get_feature_names() in favour of
# get_feature_names_out(); if `transformer` is a scikit-learn estimator this call needs
# updating when the library is upgraded — confirm against features.get_transformer.
feature_names = np.array(transformer.get_feature_names())
# argsort on the negated magnitudes orders the coefficients from largest to smallest.
feature_names[np.argsort(-np.abs(clf.coef_[0]))][:20]

array(['char_distr__you', 'special_char_distr__?', 'char_distr__ yo',
       'char_distr__ i ', 'char_distr__ur ', "special_char_distr__'",
       'freq_func_words__your', 'masked_stop_words_distr__i',
       'pos_tag_distr__PRP$', 'char_distr__yo',
       'pos_tag_stats__tag_word_length_PRP$',
       'masked_stop_words_distr__would', 'masked_stop_words_distr__my',
       'freq_func_words__in', 'char_distr__?',
       'masked_stop_words_distr__your', 'char_distr__d. ',
       'special_char_distr__"', 'special_char_distr__.',
       'freq_func_words__to'], dtype='<U43')

In [17]:
# Persist everything needed at prediction time as a single tuple. Code that loads the
# file must unpack it in this exact order:
# (transformer, primary_scaler, secondary_scaler, clf).
with open(MODEL_PATH, 'wb') as f:
    pickle.dump((
        transformer,
        primary_scaler,
        secondary_scaler,
        clf
    ), f)