**Imports**

In [None]:
import nltk
import pandas as pd

from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset
nltk.download('wordnet')

In [None]:
# Show every column when a DataFrame is displayed (None removes pandas' column limit).
pd.set_option('display.max_columns', None)

**Preparation of the train set**

In [None]:
# Load 'all-gendered.csv' into `trans`; the next two cells use it for quick inspection.
trans = pd.read_csv('all-gendered.csv')

In [None]:
# Inspect the column names of the loaded frame.
trans.columns

In [None]:
# Number of rows whose 'Physiological' column equals 1.
len(trans[trans['Physiological']==1])

In [None]:
# Load the same CSV again as `train`; this is the frame the augmented rows are
# concatenated onto at the end of the notebook.
train = pd.read_csv('all-gendered.csv')

In [None]:
# Build the pool of sentences to augment: rows that have both a Label and a
# Sentence, with repeated sentences collapsed to their first occurrence.
# `train` itself is left untouched, so its duplicates stay in the final train set.
eda_train = (
    train
    .dropna(subset=['Label', 'Sentence'])
    .drop_duplicates(subset=['Sentence'])
)

In [None]:
# Row count of the full (non-deduplicated) train frame.
len(train)

In [None]:
# Rows with Label == 0 (presumably the negative / control class — confirm against the dataset docs).
control = eda_train[eda_train['Label']==0].copy()

In [None]:
def _flagged_rows(column):
    """Return an independent copy of the eda_train rows where `column` equals 1."""
    return eda_train[eda_train[column] == 1].copy()


# One frame per flag column.
energyloss = _flagged_rows('Loss_of_energy')
agitation = _flagged_rows('Agitation')
sadness = _flagged_rows('Sadness')
irritability = _flagged_rows('Irritability')
socialwithdr = _flagged_rows('Social_withdrawal')
failsense = _flagged_rows('Sense_of_failure')

In [None]:
# Columns at positions 26..31 — presumably the six category columns used below
# (Affective ... Physiological); TODO confirm, the positions are hard-coded.
train.columns[26:32]

In [None]:
# One frame per category column: the deduplicated rows where that column is 1.
(affective, motivational, cognitive,
 cog_distortions, behavioral, physiological) = (
    eda_train[eda_train[column] == 1].copy()
    for column in ('Affective', 'Motivational', 'Cognitive',
                   'Cog_distortions', 'Behavioral', 'Physiological')
)

In [None]:
# For each of the columns at positions 26..31, print how the rows flagged with 1
# are distributed over the 'Gender' values.
for column in train.columns[26:32]:
    is_flagged = train[column] == 1
    print(column, train.loc[is_flagged, 'Gender'].value_counts())

In [None]:
# Control rows with Gender == 0 (the `male_` prefix suggests 0 encodes male — confirm).
male_control = control[control['Gender']==0].copy()

In [None]:
# Per-category subset that will be augmented. Every category takes the
# Gender == 0 rows except 'Cognitive', which takes the Gender == 1 rows.
male_affective = affective.loc[affective['Gender'].eq(0)].copy()
male_motivational = motivational.loc[motivational['Gender'].eq(0)].copy()
fem_cognitive = cognitive.loc[cognitive['Gender'].eq(1)].copy()
male_cog_distortions = cog_distortions.loc[cog_distortions['Gender'].eq(0)].copy()
male_behavioral = behavioral.loc[behavioral['Gender'].eq(0)].copy()
male_physiological = physiological.loc[physiological['Gender'].eq(0)].copy()

In [None]:
# gen_eda walks rows by position 0..n-1, so give the filtered frame a fresh RangeIndex.
male_control = male_control.reset_index(drop=True)

In [None]:
# Give each per-flag frame a fresh 0..n-1 index.
# The original cell reset `fem_socialwithdr` and `male_failsense`, which are never
# defined in this notebook and raised NameError on a fresh kernel; `failsense`
# (the one frame created above that was not reset) is reset instead.
energyloss = energyloss.reset_index(drop=True)
agitation = agitation.reset_index(drop=True)
sadness = sadness.reset_index(drop=True)
irritability = irritability.reset_index(drop=True)
socialwithdr = socialwithdr.reset_index(drop=True)
failsense = failsense.reset_index(drop=True)

In [None]:
# gen_eda walks rows by position 0..n-1, so every frame passed to it gets a
# fresh RangeIndex here.
(male_affective, male_motivational, fem_cognitive,
 male_cog_distortions, male_behavioral, male_physiological) = (
    frame.reset_index(drop=True)
    for frame in (male_affective, male_motivational, fem_cognitive,
                  male_cog_distortions, male_behavioral, male_physiological)
)

**EDA implementation taken from Jason Wei and Kai Zou**

In [None]:
# Easy data augmentation techniques for text classification
# Jason Wei and Kai Zou
import random
from random import shuffle
# Seed Python's `random` module, which all augmentation helpers below draw from.
random.seed(1)

# Words that synonym_replacement never picks as a replacement target.
# Order and content are unchanged from the reference EDA implementation.
stop_words = [
    # pronouns and possessives
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # articles and conjunctions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    # adverbs and quantifiers
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # contraction fragments, modals, and the empty token
    's', 't', 'can', 'will', 'just', 'don', 'should', 'now', '',
]

In [None]:
#cleaning up text
import re
def get_only_chars(line):
    """Reduce a sentence to lowercase ASCII letters separated by single spaces.

    Apostrophes (straight and curly) are removed outright, so "don't" becomes
    "dont". Hyphens, tabs, newlines and every other non-letter character become
    a space. Runs of spaces are collapsed and one leading space is removed; a
    trailing space, if any, is kept.

    Args:
        line: the raw sentence (str).

    Returns:
        The cleaned string. An empty or letter-free input yields "" instead of
        raising IndexError as the previous `clean_line[0]` check did.
    """
    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ")  # replace hyphens with spaces
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    # Keep letters and spaces; everything else turns into a space.
    clean_line = "".join(
        char if char in 'qwertyuiopasdfghjklzxcvbnm ' else ' '
        for char in line
    )

    clean_line = re.sub(' +', ' ', clean_line)  # delete extra spaces
    # startswith is safe on "", unlike indexing position 0.
    if clean_line.startswith(' '):
        clean_line = clean_line[1:]
    return clean_line

In [None]:
########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from wordnet
########################################################################

#for the first time you use wordnet
#import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet 

def synonym_replacement(words, n):
    """Replace up to n distinct non-stop-words with a random WordNet synonym.

    Every occurrence of a chosen word is replaced by the same synonym. Words
    without synonyms are skipped and do not count towards n.

    Args:
        words: list of tokens; not modified.
        n: maximum number of distinct words to replace.

    Returns:
        A new token list. Multi-word synonyms are split into separate tokens.
    """
    result = words.copy()

    # Distinct candidate words, visited in random order.
    candidates = list({token for token in words if token not in stop_words})
    random.shuffle(candidates)

    replaced_count = 0
    for candidate in candidates:
        options = get_synonyms(candidate)
        if len(options) >= 1:
            chosen = random.choice(list(options))
            result = [chosen if token == candidate else token for token in result]
            replaced_count += 1
        if replaced_count >= n:  # only replace up to n words
            break

    # Re-split so that a synonym such as "big house" becomes two tokens.
    return ' '.join(result).split(' ')

def get_synonyms(word):
    """Return the WordNet synonyms of `word` as a sorted list of cleaned strings.

    Lemma names from every synset of `word` are lowercased, underscores and
    hyphens become spaces, and any character other than a-z or space is removed.
    `word` itself is excluded.

    Changes from the previous version:
      * the result is sorted, so it no longer depends on set iteration order,
        which varies between Python processes because of string-hash
        randomisation and defeated `random.seed`;
      * lemmas that clean down to nothing (e.g. purely numeric names) are
        dropped instead of being returned as an empty "synonym".

    Args:
        word: the token to look up.

    Returns:
        list[str], alphabetically sorted; empty when no synonym is found.
    """
    allowed = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            candidate = "".join(char for char in candidate if char in allowed)
            if candidate.strip():  # skip names with no letters left
                synonyms.add(candidate)
    synonyms.discard(word)
    return sorted(synonyms)

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):
    """Drop each word independently with probability p.

    Args:
        words: list of tokens; not modified.
        p: deletion probability per word.

    Returns:
        The surviving tokens in their original order. Lists with zero or one
        token are returned as-is (an empty list previously reached
        `random.randint(0, -1)` and raised ValueError). If every word is
        deleted, a single randomly chosen original word is returned instead.
    """
    # nothing sensible to delete from an empty or one-word sentence
    if len(words) <= 1:
        return words

    # keep a word when its draw exceeds p
    new_words = [word for word in words if random.uniform(0, 1) > p]

    # if you end up deleting all words, just return a random word
    if len(new_words) == 0:
        rand_int = random.randint(0, len(words)-1)
        return [words[rand_int]]

    return new_words

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
    """Return a copy of `words` after n random pairwise swaps.

    Args:
        words: list of tokens; not modified.
        n: number of swap attempts, each delegated to swap_word.

    Returns:
        A new list containing the same tokens, possibly reordered.
    """
    swapped = words.copy()
    for _attempt in range(n):
        swapped = swap_word(swapped)
    return swapped

def swap_word(new_words):
    """Swap two randomly chosen positions of `new_words` in place.

    A second index different from the first is drawn with up to four attempts;
    if all attempts collide, the list is returned unchanged.

    Args:
        new_words: list of tokens; modified in place.

    Returns:
        The same list object. Lists with fewer than two tokens are returned
        untouched (an empty list previously raised ValueError from
        `random.randint(0, -1)`).
    """
    # a swap needs two distinct positions
    if len(new_words) < 2:
        return new_words

    last = len(new_words) - 1
    random_idx_1 = random.randint(0, last)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, last)
        counter += 1
        if counter > 3:  # give up rather than loop on repeated collisions
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
    """Return a copy of `words` with up to n synonyms inserted at random positions.

    Args:
        words: list of tokens; not modified.
        n: number of insertion attempts, each delegated to add_word (an attempt
           may insert nothing when no synonym is found).

    Returns:
        A new token list.
    """
    expanded = words.copy()
    for _attempt in range(n):
        add_word(expanded)  # inserts in place
    return expanded

def add_word(new_words):
    """Insert a synonym of one randomly chosen word into `new_words`, in place.

    Random words are tried until one has synonyms; the search is abandoned once
    the attempt counter reaches 10, in which case nothing is inserted. The first
    synonym returned by get_synonyms is inserted at a random position in
    0..len-1 (so never after the last word).

    Args:
        new_words: list of tokens; modified in place.

    Returns:
        None. An empty list is left untouched (it previously raised ValueError
        from `random.randint(0, -1)`).
    """
    # no word to draw a synonym from
    if not new_words:
        return

    synonyms = []
    counter = 0
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
    random_synonym = synonyms[0]
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

In [None]:
########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Generate augmented variants of one sentence with the four EDA operations.

    The sentence is cleaned with get_only_chars and split on spaces. Each enabled
    technique (alpha/p > 0) contributes int(num_aug/4)+1 candidates:
    synonym replacement, random insertion, random swap, random deletion. The
    candidates are cleaned, shuffled and trimmed, and the cleaned original
    sentence is appended last.

    Args:
        sentence: raw input sentence (str).
        alpha_sr: fraction of words to replace by synonyms (at least one word).
        alpha_ri: fraction of the word count to insert as synonyms (at least one).
        alpha_rs: fraction of the word count used as number of swaps (at least one).
        p_rd: per-word deletion probability.
        num_aug: number of augmented sentences to keep; a value below 1 is
            treated as an expected total, each candidate being kept with
            probability num_aug / number_of_candidates.

    Returns:
        list[str]: the kept augmented sentences followed by the cleaned original.
        A sentence without any words yields just `[cleaned_sentence]`; before,
        such input crashed inside the augmentation helpers.
    """
    sentence = get_only_chars(sentence)
    words = [word for word in sentence.split(' ') if word != '']
    num_words = len(words)

    # Nothing to augment: the helpers cannot operate on an empty token list.
    if num_words == 0:
        return [sentence]

    augmented_sentences = []
    num_new_per_technique = int(num_aug/4)+1

    #sr
    if (alpha_sr > 0):
        n_sr = max(1, int(alpha_sr*num_words))
        for _ in range(num_new_per_technique):
            a_words = synonym_replacement(words, n_sr)
            augmented_sentences.append(' '.join(a_words))

    #ri
    if (alpha_ri > 0):
        n_ri = max(1, int(alpha_ri*num_words))
        for _ in range(num_new_per_technique):
            a_words = random_insertion(words, n_ri)
            augmented_sentences.append(' '.join(a_words))

    #rs
    if (alpha_rs > 0):
        n_rs = max(1, int(alpha_rs*num_words))
        for _ in range(num_new_per_technique):
            a_words = random_swap(words, n_rs)
            augmented_sentences.append(' '.join(a_words))

    #rd
    if (p_rd > 0):
        for _ in range(num_new_per_technique):
            a_words = random_deletion(words, p_rd)
            augmented_sentences.append(' '.join(a_words))

    # Use a distinct loop name so the cleaned original `sentence` is not shadowed.
    augmented_sentences = [get_only_chars(candidate) for candidate in augmented_sentences]
    shuffle(augmented_sentences)

    #trim so that we have the desired number of augmented sentences
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    elif augmented_sentences:
        # guarded: with every technique disabled there are no candidates and
        # the division below would raise ZeroDivisionError
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    #append the original sentence
    augmented_sentences.append(sentence)

    return augmented_sentences

In [None]:
# Easy data augmentation techniques for text classification
# Jason Wei and Kai Zou

#generate more data with standard augmentation
def gen_eda(train_orig, alpha_sr, alpha_ri, alpha_rs, alpha_rd, num_aug=9):
    """Augment every row of a DataFrame with EDA and return the expanded frame.

    For each row, eda() is applied to the 'Sentence' value and one output row is
    produced per returned sentence (the augmented variants plus the cleaned
    original); all other columns are copied from the source row.

    Rows are read by position throughout. The previous version mixed
    `.loc[i, ...]` with `.iloc[i]`, so it only worked on frames whose index was
    exactly 0..n-1 and otherwise raised KeyError or paired a sentence with the
    wrong row.

    Args:
        train_orig: DataFrame with a 'Sentence' column.
        alpha_sr, alpha_ri, alpha_rs: strengths forwarded to eda().
        alpha_rd: deletion probability, forwarded to eda() as p_rd.
        num_aug: number of augmented sentences per original.

    Returns:
        DataFrame with the columns of `train_orig`; each output row keeps the
        index label of its source row. Empty input gives an empty frame with
        the same columns.
    """
    augmented_rows = []

    for position in range(len(train_orig)):
        row = train_orig.iloc[position]
        aug_sentences = eda(row['Sentence'], alpha_sr=alpha_sr, alpha_ri=alpha_ri,
                            alpha_rs=alpha_rs, p_rd=alpha_rd, num_aug=num_aug)
        for aug_sentence in aug_sentences:
            new_inst = row.copy()
            new_inst['Sentence'] = aug_sentence
            augmented_rows.append(new_inst)

    # Passing columns keeps the schema even when no rows were produced.
    return pd.DataFrame(augmented_rows, columns=train_orig.columns)

In [None]:
#number of augmented sentences to generate per original sentence
aff_num_aug = 1
mot_num_aug = 4 #for cv model 2
cog_num_aug = 1
cog_dist_num_aug = 4 #for cv model 2
beh_num_aug = 2
control_num_aug = 1
#phys_num_aug = 1


#how much to replace each word by synonyms
alpha_sr = 0.05

#how much to insert new words that are synonyms
alpha_ri = 0.05

#how much to swap words
alpha_rs = 0

#how much to delete words
alpha_rd = 0

if alpha_sr == alpha_ri == alpha_rs == alpha_rd == 0:
     print('At least one alpha should be greater than zero')

In [None]:
#augmentation of the negative (control) sentences for the GABDI-CV model:
#each Gender == 0 control sentence yields up to control_num_aug augmented
#variants plus its cleaned original.
male_control_aug_data = gen_eda(male_control, alpha_sr, alpha_ri, alpha_rs, alpha_rd, control_num_aug)

In [None]:
# Augment each per-category subset with its own number of variants per sentence
# (the *_num_aug values from the configuration cell); the alpha settings are shared.
male_aff_aug_data = gen_eda(male_affective, alpha_sr, alpha_ri, alpha_rs, alpha_rd, aff_num_aug)
male_mot_aug_data = gen_eda(male_motivational, alpha_sr, alpha_ri, alpha_rs, alpha_rd, mot_num_aug)
fem_cognitive_aug_data = gen_eda(fem_cognitive, alpha_sr, alpha_ri, alpha_rs, alpha_rd, cog_num_aug)
male_cog_distortions_aug_data = gen_eda(male_cog_distortions, alpha_sr, alpha_ri, alpha_rs, alpha_rd, cog_dist_num_aug)
male_beh_aug_data = gen_eda(male_behavioral, alpha_sr, alpha_ri, alpha_rs, alpha_rd, beh_num_aug)
#if GABDI CV model: the physiological subset is only concatenated in the GABDI-CV cell below
male_phys_aug_data = gen_eda(male_physiological, alpha_sr, alpha_ri, alpha_rs, alpha_rd, phys_num_aug)

In [None]:
# Keep 125 of the augmented control rows. random_state pins the draw: pandas'
# sampling is not affected by random.seed, so without it the subset changed on
# every run.
male_control_aug_data = male_control_aug_data.sample(125, random_state=1)

In [None]:
#for category model
# Subsample each augmented frame to its target size. random_state pins the draw:
# pandas' sampling is not affected by random.seed, so without it the subsets
# changed on every run.
male_aff_aug_data = male_aff_aug_data.sample(113, random_state=1)
male_mot_aug_data = male_mot_aug_data.sample(69, random_state=1)
fem_cognitive_aug_data = fem_cognitive_aug_data.sample(12, random_state=1)
male_cog_distortions_aug_data = male_cog_distortions_aug_data.sample(111, random_state=1)
male_beh_aug_data = male_beh_aug_data.sample(63, random_state=1)

In [None]:
#for GABDI CV model
# Subsample each augmented frame to its target size. random_state pins the draw:
# pandas' sampling is not affected by random.seed, so without it the subsets
# changed on every run.
# NOTE(review): these names are also subsampled by the category-model cell; when
# both cells run, this one draws from the already reduced frames.
male_aff_aug_data = male_aff_aug_data.sample(41, random_state=1)
male_mot_aug_data = male_mot_aug_data.sample(54, random_state=1)
fem_cognitive_aug_data = fem_cognitive_aug_data.sample(5, random_state=1)
male_cog_distortions_aug_data = male_cog_distortions_aug_data.sample(48, random_state=1)
male_beh_aug_data = male_beh_aug_data.sample(35, random_state=1)
male_phys_aug_data = male_phys_aug_data.sample(1, random_state=1)

In [None]:
#category model
# Stack the five sampled category frames (no physiological rows).
# NOTE(review): the GABDI-CV cell below assigns `aug_data` again, so on a
# top-to-bottom run this value is overwritten — run only the variant you need.
aug_data = pd.concat([male_aff_aug_data, male_mot_aug_data, fem_cognitive_aug_data, 
                      male_cog_distortions_aug_data, male_beh_aug_data])

In [None]:
#GABDI-CV model
# Same five category frames plus the sampled physiological rows; replaces any
# `aug_data` built by the category-model cell above.
aug_data = pd.concat([male_aff_aug_data, male_mot_aug_data, fem_cognitive_aug_data, 
                      male_cog_distortions_aug_data, male_beh_aug_data, male_phys_aug_data])

In [None]:
#fem_aug_data = fem_cognitive_aug_data.sample(12)
#male_aug_data = male_cog_distortions_aug_data.sample(111, replace=True)

In [None]:
#aug_data = pd.concat([fem_aug_data, male_aug_data], ignore_index=True)

In [None]:
# Full train set plus the sampled augmented control rows, re-indexed 0..n-1.
# NOTE(review): the next cell rebuilds `train_all` from `train` again, so on a
# top-to-bottom run these control rows do not reach the final frame — confirm intent.
train_all = pd.concat([train, male_control_aug_data], ignore_index=True)

In [None]:
# Full train set (duplicates included) plus the augmented category rows, re-indexed 0..n-1.
# NOTE(review): this starts from `train`, not from the `train_all` built in the
# previous cell, so the augmented control rows are dropped here — confirm whether
# male_control_aug_data should be included for the GABDI-CV variant.
train_all = pd.concat([train, aug_data], ignore_index=True)
#train_cognitive = pd.concat([train, fem_aug_data], ignore_index=True)
#train_cog_distortions = pd.concat([train, male_aug_data], ignore_index=True)

In [None]:
#train_all.to_csv("cog_cogdist_syn_eda_train.csv", index=False)
#train_cognitive.to_csv('cog_syn_eda_train.csv', index=False)
#train_cog_distortions.to_csv('cogdist_syn_eda_train.csv', index=False)

In [None]:
#train_all.to_csv("fail_social_back_trans_train.csv", index=False)
#train_socialwithdr.to_csv('socialwithdr_back_trans_train.csv', index=False)
#train_failsense.to_csv('failsense_back_trans_train.csv', index=False)