# Initial Setup

In [1]:
%%capture
!pip install pylangacq
!pip install sentence_transformers
!pip install Levenshtein

In [2]:
from __future__ import print_function
import time
import numpy as np
import pandas as pd

import pylangacq as pla

from collections import Counter
from sentence_transformers import SentenceTransformer,util
import Levenshtein


In [3]:
PATH_PREFIX = "/content/drive/MyDrive/PHD/AuthorshipObfuscation"
#DIR = f"/content/gdrive/MyDrive/Path/to/Dementiabank/folder"

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load data

In [16]:
def load_dementia(DIR, state = "_clean", augmentation = None):
  train_path = f'{DIR}/adress_train_full{state}.csv'
  test_path = f'{DIR}/adress_test_full{state}.csv'
  #val_path = f'{DIR}/adress_val_sent.csv'

  train_data = pd.read_csv(train_path)
  test_data = pd.read_csv(test_path)
  #val_data = pd.read_csv(val_path)

  train_data.columns = ['Text', 'Intent']
  test_data.columns = ['Text', 'Intent']
  #val_data.columns = ['Text', 'Intent']

  #intent2idx = {'cc':0, 'cd':1}
  #idx2intent = {0:'cc', 1:'cd'}
  train_data['Text'] = train_data['Text'].apply(lambda x : x.replace('[SEP]',''))

  if augmentation:
      if augmentation in ['synth_aug','paraphrase']:
        data_aug = pd.read_csv(f'{DIR}/augmentations_text/{augmentation}_dataset.csv')      
        data_aug = data_aug.rename(columns={"paraphrase": "Text", 'Intent':'Intent'})

      else:
        print(f'Augmenting dataset with {augmentation}')
        train_aug_path_ad = f'{DIR}/augmentations_text/{augmentation}_dataset_ad.txt'
        train_aug_path_hc = f'{DIR}/augmentations_text/{augmentation}_dataset_hc.txt'

        file_order_ad = f'{DIR}/augmentations_text/_Filenames_list_dementia.txt'
        file_order_hc = f'{DIR}/augmentations_text/_Filenames_list_control.txt'

        data_ad = pd.read_csv(train_aug_path_ad, sep='\t', header=None)
        s_ad = pd.read_csv(file_order_ad, sep='\t', header=None)
        data_ad.columns = ["Text"]
        data_ad['Intent'] = 'cd'
        data_ad['Sample'] = s_ad[0]

        data_cc = pd.read_csv(train_aug_path_hc, sep='\t', header=None)
        s_cc = pd.read_csv(file_order_hc, sep='\t', header=None)
        data_cc.columns = ["Text"]
        data_cc['Intent'] = 'cc'
        data_cc['Sample'] = s_cc[0]

        data_aug = data_cc.append(data_ad)
        data_aug = data_aug.sort_values(by='Sample')

        data_aug['Intent'] = data_aug['Intent'].apply(lambda x : 0 if x == "cc" else 1)
        data_aug['Text'] = data_aug['Text'].apply(lambda x : x.replace('[SEP]',''))
        
      return train_data, data_aug
    
  return train_data


# Analysis

## Helper functions

In [17]:
def get_ttr(text):  
    numtokens = len(text)
    freq_token_type = Counter(text)
    v = len(freq_token_type)
    ttr = float(v)/numtokens
    return ttr

def get_levensthein(a,b): 
  distance = Levenshtein.distance(a.lower(), b.lower())
  return distance

def get_ttr_mean(aug_data):
  ttr_list= np.vectorize(get_ttr)(aug_data['Text'])
  return np.mean(ttr_list),np.std(ttr_list)

def get_leventshein_mean(train_data, aug_data):
  lv_list= np.vectorize(get_levensthein)(train_data['Text'],aug_data['Text'])
  return np.mean(lv_list),np.std(lv_list)

def get_similarity_mean(train_data, aug_data, model):
  sentences1 = train_data['Text'].values
  sentences2 = aug_data['Text'].values

  embeddings1 = model.encode(sentences1, convert_to_tensor=True)
  embeddings2 = model.encode(sentences2, convert_to_tensor=True)

  cosine_scores = util.cos_sim(embeddings1, embeddings2)
  cos_list=[]

  #Output the pairs with their score
  for i in range(len(sentences1)):
      #print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))
      cos_list.append(cosine_scores[i][i])

  return np.mean(cos_list),np.std(cos_list)
  

## Compute stats for all augmentations

In [22]:
PATH_PREFIX_i = "/content/drive/MyDrive/PHD/AuthorshipObfuscation"
DIR = f'{PATH_PREFIX_i}/Adress-2020'
model = SentenceTransformer('all-mpnet-base-v2')

augmentation = ['Base','BT_EN_RU','BT_EN_DE','Mixup_aug_half1', 'SD_aug','paraphrase','synth_aug','EDA_aug','Context_aug']
rows = []

## get original texts to compare to
args = {'DIR':f'{DIR}', 'augmentation':'Base'}
train_data, aug_data_base = load_dementia(**args)
aug_data_base['Intent'] = aug_data_base['Intent'].apply(lambda x: "cc" if x == 0 else "cd" )

for aug in augmentation:
  args = {'DIR':f'{DIR}', 'augmentation':aug}
  if aug:
    train_data, aug_data  = load_dementia(**args)
    if aug == "paraphrase" or aug == "synth_aug":
      aug_data['Intent'] = aug_data['Intent'].apply(lambda x: 0 if x == "cc" else 1 )

    lev, sd = get_leventshein_mean(aug_data_base, aug_data)
    simi, simi_sd = get_similarity_mean(aug_data_base, aug_data, model)
    ttr, ttr_sd = get_ttr_mean(aug_data)

  else:
    train_data  = load_dementia(**args)
    aug_data = train_data
    lev=0
    sd=0
    simi=0
    simi_sd=0
    ttr, ttr_sd = get_ttr_mean(train_data)

  rows.append([aug,lev,sd, simi,simi_sd,ttr,ttr_sd])

df_aug_mean=pd.DataFrame(rows, columns=['Augmentation','Levenstein','SD', "Similarity", "Simi SD","TTR","TTR SD"])

Augmenting dataset with Base
Augmenting dataset with Base
Augmenting dataset with BT_EN_RU
Augmenting dataset with BT_EN_DE
Augmenting dataset with Mixup_aug_half1
Augmenting dataset with SD_aug
Augmenting dataset with EDA_aug
Augmenting dataset with Context_aug


In [23]:
df_aug_mean

Unnamed: 0,Augmentation,Levenstein,SD,Similarity,Simi SD,TTR,TTR SD
0,Base,0.0,0.0,0.911511,0.065318,0.064495,0.030387
1,BT_EN_RU,227.731481,229.072349,0.911511,0.065318,0.08199,0.038232
2,BT_EN_DE,177.648148,161.689593,0.911511,0.065318,0.079888,0.033957
3,Mixup_aug_half1,258.240741,145.895756,0.911511,0.065318,0.120407,0.059026
4,SD_aug,70.75,67.373493,0.911511,0.065318,0.080456,0.049468
5,paraphrase,267.194444,177.722081,0.911511,0.065318,0.105233,0.034935
6,synth_aug,411.231481,201.081272,0.911511,0.065318,0.079246,0.027454
7,EDA_aug,39.527778,27.307392,0.911511,0.065318,0.064379,0.029991
8,Context_aug,47.555556,11.009536,0.911511,0.065318,0.0725,0.03625


In [None]:
df_aug_mean.to_csv("PATH/TO/SAVE/aug_stats.csv")