In [1]:
from collections import defaultdict, Counter
import itertools
import re
import subprocess
import pickle
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn import linear_model, preprocessing, utils, datasets
from sklearn.metrics import accuracy_score, matthews_corrcoef
from matplotlib.pyplot import scatter
import random
from tqdm.notebook import tqdm
from stats_count import *

In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this presumably also hides solver convergence warnings raised for the
# very small max_iter values tried in the grid search — confirm that this is intended.
warnings.filterwarnings('ignore')

## Parameters

In [3]:
# Fix the seeds of NumPy's global RNG and of the stdlib `random` module so that
# reruns of the notebook draw the same random numbers.
np.random.seed(42)
random.seed(42)

In [4]:
# Upper bound on the number of examples kept from each feature file; the default is so
# large that, in practice, nothing is cut off.
max_examples_to_train = 10 ** 10

# The number of tokens to which the tokenized text is truncated / padded.
max_tokens_amount = 128

# Layers for which attention matrices and features on them are calculated.
# For calculating features on all layers, leave it equal to list(range(12)).
layers_of_interest = list(range(12))

In [5]:
# Dataset splits: dev/valid - for hyperparameters tuning;
# test - for final testing after tuning hyperparameters on the dev set.
# NOTE(review): both splits are currently "valid_5k", so the classifier is scored on the
# same subset it is trained on — confirm that this is intended.
train_subset = "valid_5k"
test_subset = "valid_5k"

# Name of the directory with the .csv file.
input_dir = "small_gpt_web/"

# You can use either standard or fine-tuned BERT. If you want to use BERT fine-tuned to your
# current task, save the model and the tokenizer into the same directory with
# tokenizer.save_pretrained(output_dir); bert_classifier.save_pretrained(output_dir)
# and insert the path to that directory here.
model_path = "bert-base-uncased"


def _features_file(subset, suffix, prefix=""):
    """Return input_dir + "features/" + prefix + subset + suffix."""
    return input_dir + "features/" + prefix + subset + suffix


# File-name endings of the pre-computed feature arrays.
# NOTE(review): they hard-code "bert-base-uncased" and "MAX_LEN_128" instead of being
# derived from model_path / max_tokens_amount.
_OLD_SUFFIX = "_all_heads_12_layers_s_e_v_c_b0b1_lists_array_6_thrs_MAX_LEN_128_bert-base-uncased.npy"
_RIPSER_SUFFIX = "_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_ripser.npy"
_TEMPL_SUFFIX = "_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_template.npy"

old_f_train_file = _features_file(train_subset, _OLD_SUFFIX)
old_f_test_file = _features_file(test_subset, _OLD_SUFFIX)
ripser_train_file = _features_file(train_subset, _RIPSER_SUFFIX)
ripser_test_file = _features_file(test_subset, _RIPSER_SUFFIX)
templ_train_file = _features_file(train_subset, _TEMPL_SUFFIX)
templ_test_file = _features_file(test_subset, _TEMPL_SUFFIX)

# Higher homology features of embeddings.
ripser_euc_final_train_file = _features_file(train_subset, _RIPSER_SUFFIX, prefix="euc_final_")
ripser_euc_final_test_file = _features_file(test_subset, _RIPSER_SUFFIX, prefix="euc_final_")
ripser_sph_final_train_file = _features_file(train_subset, _RIPSER_SUFFIX, prefix="sph_final_")
ripser_sph_final_test_file = _features_file(test_subset, _RIPSER_SUFFIX, prefix="sph_final_")
ripser_euc_start_train_file = _features_file(train_subset, _RIPSER_SUFFIX, prefix="euc_start_")
ripser_euc_start_test_file = _features_file(test_subset, _RIPSER_SUFFIX, prefix="euc_start_")
ripser_sph_start_train_file = _features_file(train_subset, _RIPSER_SUFFIX, prefix="sph_start_")
ripser_sph_start_test_file = _features_file(test_subset, _RIPSER_SUFFIX, prefix="sph_start_")

In [6]:
# Optimization algorithm handed to LogisticRegression as its `solver` argument.
solver  = "lbfgs"
# Handed to LogisticRegression as its `dual` argument in the grid search below.
is_dual = False

## Loading data and features

In [7]:
# Load the train / test tables. "<subset>.csv" files are preferred; only when a .csv file
# is missing do we fall back to tab-separated "<subset>.tsv" files. Any other failure
# (e.g. a malformed .csv) is raised instead of being silently masked by the fallback.
try:
    train_data = pd.read_csv(input_dir + train_subset + ".csv")
    test_data = pd.read_csv(input_dir + test_subset + ".csv")
except FileNotFoundError:
    # The training .tsv is read without a header row, so its column names are assigned by
    # hand; the test .tsv is read with its first row as the header.
    train_data = pd.read_csv(input_dir + train_subset + ".tsv", delimiter="\t", header=None)
    train_data.columns = ["0", "labels", "2", "sentence"]
    test_data = pd.read_csv(input_dir + test_subset + ".tsv", delimiter="\t")

In [8]:
# Derive the binary target column "labels": 1 where "label" equals "generated", 0 otherwise.
# A frame without a "label" column is left untouched.
for _frame in (train_data, test_data):
    if "label" in _frame.columns:
        _frame["labels"] = (_frame["label"] == "generated").astype(int)

In [9]:
# Test labels as a plain list of ints.
y_test = list(map(int, test_data["labels"]))
# Preview only the first rows rather than rendering the whole table.
test_data.head()

Unnamed: 0.1,Unnamed: 0,id,ended,length,sentence,label,labels
0,582,250590,False,1024,"2. Enbridge's XL Pipeline to the I-93, Cut Wes...",generated,1
1,3466,253542,True,829,comic book: George Linder was barely able to f...,natural,0
2,3568,253645,True,872,it's really wonky.\n\nThe first thing I asked...,generated,1


Requirements for .csv files:

* The .csv file loaded as **train_data** must contain a column named **labels** (or a column named **label**, from which **labels** is derived above); otherwise LogReg accuracy will not be estimated.

In [10]:
# Load the pre-computed feature arrays and keep at most max_examples_to_train entries
# along axis 3 (the axis that is later indexed once per example).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects — only use it
# with feature files you trust.
_example_slice = np.s_[:, :, :, :max_examples_to_train, :]
old_features_train = np.load(old_f_train_file, allow_pickle=True)[_example_slice]
old_features_test = np.load(old_f_test_file, allow_pickle=True)[_example_slice]
old_features_train.shape

(12, 12, 6, 3, 6)

In [11]:
# Load the ripser feature arrays and keep at most max_examples_to_train entries
# along axis 2 (the axis that is later indexed once per example).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects — only use it
# with feature files you trust.
_example_slice = np.s_[:, :, :max_examples_to_train, :]
ripser_train = np.load(ripser_train_file, allow_pickle=True)[_example_slice]
ripser_test = np.load(ripser_test_file, allow_pickle=True)[_example_slice]
ripser_train.shape

(12, 12, 3, 14)

In [12]:
# Load the "euc_final" ripser feature arrays and keep at most max_examples_to_train
# entries along axis 2 (the axis that is later indexed once per example).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects — only use it
# with feature files you trust.
_example_slice = np.s_[:, :, :max_examples_to_train, :]
ripser_euc_final_train = np.load(ripser_euc_final_train_file, allow_pickle=True)[_example_slice]
ripser_euc_final_test = np.load(ripser_euc_final_test_file, allow_pickle=True)[_example_slice]
ripser_euc_final_train.shape

(1, 1, 3, 42)

In [13]:
# Load the "sph_final" ripser feature arrays and keep at most max_examples_to_train
# entries along axis 2 (the axis that is later indexed once per example).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects — only use it
# with feature files you trust.
_example_slice = np.s_[:, :, :max_examples_to_train, :]
ripser_sph_final_train = np.load(ripser_sph_final_train_file, allow_pickle=True)[_example_slice]
ripser_sph_final_test = np.load(ripser_sph_final_test_file, allow_pickle=True)[_example_slice]
ripser_sph_final_train.shape

(1, 1, 3, 42)

In [14]:
# Load the "euc_start" ripser feature arrays and keep at most max_examples_to_train
# entries along axis 2 (the axis that is later indexed once per example).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects — only use it
# with feature files you trust.
_example_slice = np.s_[:, :, :max_examples_to_train, :]
ripser_euc_start_train = np.load(ripser_euc_start_train_file, allow_pickle=True)[_example_slice]
ripser_euc_start_test = np.load(ripser_euc_start_test_file, allow_pickle=True)[_example_slice]
ripser_euc_start_train.shape

(1, 1, 3, 42)

In [15]:
# Load the "sph_start" ripser feature arrays and keep at most max_examples_to_train
# entries along axis 2 (the axis that is later indexed once per example).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects — only use it
# with feature files you trust.
_example_slice = np.s_[:, :, :max_examples_to_train, :]
ripser_sph_start_train = np.load(ripser_sph_start_train_file, allow_pickle=True)[_example_slice]
ripser_sph_start_test = np.load(ripser_sph_start_test_file, allow_pickle=True)[_example_slice]
ripser_sph_start_train.shape

(1, 1, 3, 42)

In [16]:
# Load the template feature arrays and keep at most max_examples_to_train entries
# along the last axis, axis 3 (the axis that is later indexed once per example).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects — only use it
# with feature files you trust.
_example_slice = np.s_[:, :, :, :max_examples_to_train]
templ_train = np.load(templ_train_file, allow_pickle=True)[_example_slice]
templ_test = np.load(templ_test_file, allow_pickle=True)[_example_slice]
templ_train.shape

(12, 12, 6, 3)

In [17]:
def _collect_feature_vectors(n_examples, old_features, ripser, euc_final, euc_start,
                             sph_final, sph_start, templ):
    """Build one flat feature vector per example.

    For every example index i in range(n_examples), example i is sliced out of each
    feature array (axis 3 of `old_features` and `templ`, axis 2 of the ripser arrays),
    each slice is flattened, and the pieces are concatenated in the fixed order
    old_features, ripser, euc_final, euc_start, sph_final, sph_start, templ.

    Returns:
        A list with n_examples one-dimensional numpy arrays.
    """
    return [
        np.concatenate((old_features[:, :, :, i, :].flatten(),
                        ripser[:, :, i, :].flatten(),
                        euc_final[:, :, i, :].flatten(),
                        euc_start[:, :, i, :].flatten(),
                        sph_final[:, :, i, :].flatten(),
                        sph_start[:, :, i, :].flatten(),
                        templ[:, :, :, i].flatten()))
        for i in range(n_examples)
    ]


# Train and test matrices are assembled by the same helper so that their column layout
# cannot drift apart.
X_train = _collect_feature_vectors(len(train_data),
                                   old_features_train,
                                   ripser_train,
                                   ripser_euc_final_train,
                                   ripser_euc_start_train,
                                   ripser_sph_final_train,
                                   ripser_sph_start_train,
                                   templ_train)
y_train = train_data["labels"]

X_test = _collect_feature_vectors(len(test_data),
                                  old_features_test,
                                  ripser_test,
                                  ripser_euc_final_test,
                                  ripser_euc_start_test,
                                  ripser_sph_final_test,
                                  ripser_sph_start_test,
                                  templ_test)
y_test = test_data["labels"]

In [18]:
# Cap the training set at max_examples_to_train examples. Feature vectors, table rows and
# labels are cut together so that they stay aligned with each other.
X_train = X_train[:max_examples_to_train]
train_data = train_data[:max_examples_to_train]
y_train = y_train[:max_examples_to_train]

In [19]:
# Sanity check: there must be exactly one feature vector per table row. A mismatch is
# reported (with the offending lengths) without stopping the notebook; only the assertion
# failure itself is caught, so unrelated errors are no longer hidden.
try:
    assert len(train_data) == len(X_train), \
        "train: {} rows vs {} feature vectors".format(len(train_data), len(X_train))
    assert len(test_data) == len(X_test), \
        "test: {} rows vs {} feature vectors".format(len(test_data), len(X_test))
except AssertionError as mismatch:
    print("ASSERTION ERROR!!!", mismatch)

## LogReg on the features from all heads

In [20]:
def pred_by_Xy(X_train, y_train, X_test, classifier, verbose=False, scale=True):
    """Fit `classifier` on the training data and predict labels for the test data.

    Args:
        X_train: training feature vectors, passed to `classifier.fit`.
        y_train: training labels.
        X_test: feature vectors to predict labels for.
        classifier: estimator object providing `fit` and `predict`.
        verbose: if True, print the train Matthews correlation coefficient and accuracy.
        scale: if True, fit a StandardScaler on X_train and apply it to both
            X_train and X_test before fitting / predicting.

    Returns:
        A tuple (test predictions, train Matthews correlation coefficient, train accuracy).
    """
    if scale:
        scaler  = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

    classifier.fit(X_train, y_train)

    # Predict on the training set once and reuse the result for both metrics.
    train_pred = classifier.predict(X_train)
    train_matt = matthews_corrcoef(y_train, train_pred)
    train_acc  = accuracy_score(y_train, train_pred)

    if verbose:
        print("train matt:", train_matt)
        print("train acc: ", train_acc)

    if scale:
        # The scaler was fitted on the training data only.
        X_test = scaler.transform(X_test)

    return classifier.predict(X_test), train_matt, train_acc

In [21]:
# Default classifier: logistic regression using the configured `solver`; no other
# hyperparameter is passed explicitly.
classifier = linear_model.LogisticRegression(solver=solver)

# The classifier with concrete hyperparameter values, which you should insert here.
# For grid search of hyperparameters - see below.

## Grid search of hyperparameters. Use it on the dev/valid set!

(**Reminder**: Don't tune hyperparameters on the test set, otherwise they will overfit to it. Tune hyperparameters on the dev/valid set, and then use the best ones on the test set.)

In [22]:
# Candidate values for the `C` argument of LogisticRegression in the grid search.
C_range = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
# Candidate values for the `max_iter` argument of LogisticRegression in the grid search.
max_iter_range = [1, 2, 3, 5, 10, 25, 50, 100, 500, 1000, 2000]

# Echo the grid so that it is recorded in the cell output.
print(C_range, max_iter_range)

[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2] [1, 2, 3, 5, 10, 25, 50, 100, 500, 1000, 2000]


In [23]:
# All dictionaries below are keyed by the hyperparameter pair (C, max_iter).
matt_scores = dict()        # test-set Matthews correlation coefficient
acc_scores  = dict()        # test-set accuracy
matt_scores_train = dict()  # train-set Matthews correlation coefficient
acc_scores_train  = dict()  # train-set accuracy
results     = dict()        # test-set predictions

for C in tqdm(C_range):
    for max_iter in max_iter_range:
        classifier = linear_model.LogisticRegression(penalty='l2', C=C, max_iter=max_iter, dual=is_dual,
                                                     solver=solver)

        result, train_matt, train_acc = pred_by_Xy(X_train, y_train, X_test, classifier)
        results[(C, max_iter)] = result

        # The train-set scores are the ones returned by pred_by_Xy; they must not be
        # recomputed from the test-set predictions.
        matt_scores_train[(C, max_iter)] = train_matt
        acc_scores_train[(C, max_iter)]  = train_acc

        try:
            matt_scores[(C, max_iter)] = matthews_corrcoef(y_test, result)
            acc_scores[(C, max_iter)]  = accuracy_score(y_test, result)
        except Exception:
            # Best effort: when the test labels cannot be scored (e.g. the test set is not
            # labeled), keep the predictions and leave the test metrics unset.
            # KeyboardInterrupt is deliberately not caught, so the search can be stopped.
            pass

  0%|          | 0/10 [00:00<?, ?it/s]

### Print each hyperparameter combination with the corresponding Matthews correlation coefficient / accuracy of the LogReg trained with it

In [24]:
# Print the test-set scores of every (C, max_iter) combination, followed by the best
# accuracy and the best Matthews correlation coefficient over the whole grid.
try:
    for C in tqdm(C_range):
        for max_iter in max_iter_range:
            print(C, max_iter, ", matt:", matt_scores[(C, max_iter)])
            print(C, max_iter, ", acc :", acc_scores[(C, max_iter)])
            print()
    print("---")
    print()
    print("The best Acc score:")
    print()
    print(max(acc_scores.values()))
    print()
    print("The best Matthew score:")
    print()
    print(max(matt_scores.values()))

except (KeyError, ValueError):
    # KeyError: no score was stored for a combination; ValueError: max() of an empty
    # score dictionary. Both mean that no test metrics are available.
    print("Data is not labeled")

  0%|          | 0/10 [00:00<?, ?it/s]

0.0001 1 , matt: 1.0
0.0001 1 , acc : 1.0

0.0001 2 , matt: 1.0
0.0001 2 , acc : 1.0

0.0001 3 , matt: 1.0
0.0001 3 , acc : 1.0

0.0001 5 , matt: 1.0
0.0001 5 , acc : 1.0

0.0001 10 , matt: 0.0
0.0001 10 , acc : 0.6666666666666666

0.0001 25 , matt: 0.0
0.0001 25 , acc : 0.6666666666666666

0.0001 50 , matt: 0.0
0.0001 50 , acc : 0.6666666666666666

0.0001 100 , matt: 0.0
0.0001 100 , acc : 0.6666666666666666

0.0001 500 , matt: 0.0
0.0001 500 , acc : 0.6666666666666666

0.0001 1000 , matt: 0.0
0.0001 1000 , acc : 0.6666666666666666

0.0001 2000 , matt: 0.0
0.0001 2000 , acc : 0.6666666666666666

0.0005 1 , matt: 1.0
0.0005 1 , acc : 1.0

0.0005 2 , matt: 1.0
0.0005 2 , acc : 1.0

0.0005 3 , matt: 1.0
0.0005 3 , acc : 1.0

0.0005 5 , matt: 1.0
0.0005 5 , acc : 1.0

0.0005 10 , matt: 1.0
0.0005 10 , acc : 1.0

0.0005 25 , matt: 1.0
0.0005 25 , acc : 1.0

0.0005 50 , matt: 1.0
0.0005 50 , acc : 1.0

0.0005 100 , matt: 1.0
0.0005 100 , acc : 1.0

0.0005 500 , matt: 1.0
0.0005 500 , acc : 