**Imports**

In [None]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

**Preparation of the train set**

In [None]:
# Load the source dataset. The cells below read its 'Sentence', 'Label' and
# 'Gender' columns plus one 0/1 flag column per symptom and per category.
# The path is relative to the notebook's working directory.
train = pd.read_csv('all-gendered.csv')

In [None]:
# Pool of rows eligible for augmentation: both 'Label' and 'Sentence' present,
# one row per distinct sentence (the first occurrence is kept).
# Only this pool is de-duplicated. `train` itself is not modified, so the
# duplicates of the original set are still there when `train` is concatenated
# into the final training sets further down.
trans_train = (
    train
    .dropna(subset=['Label', 'Sentence'])
    .drop_duplicates(subset=['Sentence'])
)

In [None]:
# Rows of the augmentation pool whose 'Label' is 0 (named "control" here).
control = trans_train.loc[trans_train['Label'].eq(0)].copy()

In [None]:
# Number of control rows available for augmentation.
control.shape[0]

In [None]:
# Gender distribution among Label == 0 rows, computed on the full `train`
# frame (not on the de-duplicated pool).
train.loc[train['Label'] == 0, 'Gender'].value_counts()

In [None]:
# One independent frame per symptom flag: the rows of the augmentation pool
# where that flag column equals 1.
(energyloss, agitation, sadness,
 irritability, socialwithdr, failsense) = (
    trans_train.loc[trans_train[flag] == 1].copy()
    for flag in ('Loss_of_energy', 'Agitation', 'Sadness',
                 'Irritability', 'Social_withdrawal', 'Sense_of_failure')
)

In [None]:
# One independent frame per category flag: the rows of the augmentation pool
# where that flag column equals 1.
(affective, motivational, cognitive,
 cog_distortions, behavioral, physiological) = (
    trans_train.loc[trans_train[flag] == 1].copy()
    for flag in ('Affective', 'Motivational', 'Cognitive',
                 'Cog_distortions', 'Behavioral', 'Physiological')
)

In [None]:
# NOTE(review): this cell repeats the 'Cognitive' / 'Cog_distortions'
# selections already made in the category-subset cell. It is kept so the cell
# sequence is unchanged, but it now takes explicit copies like every other
# selection so the rebound frames are independent of `trans_train`.
cognitive = trans_train[trans_train['Cognitive']==1].copy()
cog_distortions = trans_train[trans_train['Cog_distortions']==1].copy()

In [None]:
# Gender distribution among rows flagged 'Cog_distortions', computed on the
# full `train` frame (not on the de-duplicated pool).
train.loc[train['Cog_distortions'] == 1, 'Gender'].value_counts()

In [None]:
# Control rows with Gender == 0 (stored under the `male_` prefix).
male_control = control.loc[control['Gender'].eq(0)].copy()

In [None]:
# Restrict each category frame to a single gender code. Gender == 0 frames are
# stored under `male_*` names and Gender == 1 frames under `fem_*` names.
# Only 'Cognitive' is restricted to Gender == 1; every other category uses 0.
male_affective = affective.loc[affective['Gender'].eq(0)].copy()
male_motivational = motivational.loc[motivational['Gender'].eq(0)].copy()
fem_cognitive = cognitive.loc[cognitive['Gender'].eq(1)].copy()
male_cog_distortions = cog_distortions.loc[cog_distortions['Gender'].eq(0)].copy()
male_behavioral = behavioral.loc[behavioral['Gender'].eq(0)].copy()
male_physiological = physiological.loc[physiological['Gender'].eq(0)].copy()

In [None]:
# Gender-specific symptom frames: Gender == 1 for social withdrawal,
# Gender == 0 for sense of failure.
fem_socialwithdr = socialwithdr.loc[socialwithdr['Gender'].eq(1)].copy()
male_failsense = failsense.loc[failsense['Gender'].eq(0)].copy()

In [None]:
# Renumber the rows 0..n-1 (the old index is discarded) so that label-based
# and positional row access refer to the same rows downstream.
male_control = male_control.reset_index(drop=True)

In [None]:
# Renumber every symptom frame 0..n-1, discarding the index inherited from
# `trans_train`, so label-based and positional row access coincide.
(energyloss, agitation, sadness, irritability,
 socialwithdr, fem_socialwithdr, male_failsense) = (
    frame.reset_index(drop=True)
    for frame in (energyloss, agitation, sadness, irritability,
                  socialwithdr, fem_socialwithdr, male_failsense)
)

In [None]:
# Renumber every gender-specific category frame 0..n-1, discarding the index
# inherited from `trans_train`, so label-based and positional row access coincide.
(male_affective, male_motivational, fem_cognitive,
 male_cog_distortions, male_behavioral, male_physiological) = (
    frame.reset_index(drop=True)
    for frame in (male_affective, male_motivational, fem_cognitive,
                  male_cog_distortions, male_behavioral, male_physiological)
)

**Translation model**

In [None]:
# Every tokenizer and the model are loaded from the same NLLB-200 checkpoint;
# naming it once keeps them from drifting apart if the checkpoint is changed.
NLLB_CHECKPOINT = "facebook/nllb-200-distilled-600M"

# One tokenizer per source language. They share a vocabulary and differ only in
# `src_lang`, the language tag attached to the text they encode.
# English is spelled out explicitly here; it is also the NLLB tokenizer default.
# NOTE(review): `token=True` asks for the locally stored Hugging Face login
# token. It is passed to every call except the English tokenizer; confirm
# whether it is needed at all for this checkpoint.
eng_tokenizer = AutoTokenizer.from_pretrained(NLLB_CHECKPOINT, src_lang="eng_Latn")
ger_tokenizer = AutoTokenizer.from_pretrained(NLLB_CHECKPOINT, token=True, src_lang="deu_Latn")
chin_tokenizer = AutoTokenizer.from_pretrained(NLLB_CHECKPOINT, token=True, src_lang="zho_Hans")
arab_tokenizer = AutoTokenizer.from_pretrained(NLLB_CHECKPOINT, token=True, src_lang="arb_Arab")
span_tokenizer = AutoTokenizer.from_pretrained(NLLB_CHECKPOINT, token=True, src_lang="spa_Latn")

# Single translation model used for both directions of every round trip.
model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_CHECKPOINT, token=True)

**Back Translation**

In [None]:
def back_translation(sen, max_length=50):
    """Paraphrase an English sentence by round-tripping it through pivot languages.

    The sentence is translated from English into German, Chinese, Arabic and
    Spanish with the module-level ``model``, and each translation is translated
    back into English.

    Parameters
    ----------
    sen : str
        English sentence to paraphrase.
    max_length : int, optional
        ``max_length`` passed to both ``model.generate`` calls of every round
        trip. Defaults to 50, the value previously hard-coded.

    Returns
    -------
    list of str
        The distinct back-translations that differ from ``sen``, in pivot
        order (German, Chinese, Arabic, Spanish). May be empty.
    """
    # (NLLB language code, tokenizer configured with that source language)
    pivots = (
        ('deu_Latn', ger_tokenizer),
        ('zho_Hans', chin_tokenizer),
        ('arb_Arab', arab_tokenizer),
        ('spa_Latn', span_tokenizer),
    )

    # Language codes are ordinary tokens in the shared vocabulary, so
    # convert_tokens_to_ids resolves them; unlike the `lang_code_to_id`
    # mapping it does not depend on the installed transformers version.
    english_id = eng_tokenizer.convert_tokens_to_ids('eng_Latn')
    inputs = eng_tokenizer(sen, return_tensors='pt')

    # A dict is used as an insertion-ordered set: duplicates collapse, and the
    # result order is the same in every kernel session (set order is not).
    paraphrases = {}
    for lang_code, pivot_tokenizer in pivots:
        # English -> pivot language
        pivot_ids = model.generate(
            **inputs,
            forced_bos_token_id=eng_tokenizer.convert_tokens_to_ids(lang_code),
            max_length=max_length,
        )
        pivot_text = eng_tokenizer.batch_decode(pivot_ids, skip_special_tokens=True)[0]

        # pivot language -> English; the pivot tokenizer tags the source language
        pivot_inputs = pivot_tokenizer(pivot_text, return_tensors='pt')
        english_ids = model.generate(
            **pivot_inputs,
            forced_bos_token_id=english_id,
            max_length=max_length,
        )
        round_trip = pivot_tokenizer.batch_decode(english_ids, skip_special_tokens=True)[0]

        # A round trip that reproduces the input adds nothing as augmentation.
        if round_trip != sen:
            paraphrases.setdefault(round_trip, None)

    return list(paraphrases)

In [None]:
def gen_back_translation(df, augment_fn=None):
    """Expand every row of ``df`` into one row per augmented sentence.

    For each row, ``augment_fn`` is called with the row's 'Sentence' value and
    must return an iterable of replacement sentences. Every returned sentence
    produces a copy of the row in which only 'Sentence' is replaced, so all
    other columns are carried over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Sentence' column. Rows are visited by position, so the
        index does not have to be 0..n-1.
    augment_fn : callable, optional
        Function mapping one sentence to an iterable of sentences. Defaults to
        ``back_translation``.

    Returns
    -------
    pandas.DataFrame
        The augmented rows with the columns of ``df``. Each row keeps the index
        label of the row it was derived from, so labels repeat when a sentence
        yields several augmentations. If nothing is produced, an empty frame
        with the columns of ``df`` is returned.
    """
    if augment_fn is None:
        augment_fn = back_translation

    augmented_rows = []
    # iterrows() walks the rows positionally, so the sentence and the copied
    # row always come from the same row, whatever the index looks like.
    for _, row in df.iterrows():
        for aug_sentence in augment_fn(row['Sentence']):
            new_inst = row.copy()
            new_inst['Sentence'] = aug_sentence
            augmented_rows.append(new_inst)

    # Passing the columns keeps the schema even when no row was generated.
    return pd.DataFrame(augmented_rows, columns=df.columns)

In [None]:
# Augment the negative (Label == 0) sentences for the GABDI-CV model:
# back-translate every male control sentence.
# This runs the translation model for each sentence, so the cell can be slow.
male_control_aug_data = gen_back_translation(male_control)

In [None]:
# Back-translate each gender-specific category subset.
# Every call runs the translation model for each sentence of its frame, so this
# cell can be slow. The calls are kept as separate statements so that results
# already computed stay bound if a later call fails.
male_aff_aug_data = gen_back_translation(male_affective)
male_mot_aug_data = gen_back_translation(male_motivational)
fem_cognitive_aug_data = gen_back_translation(fem_cognitive)
male_cog_distortions_aug_data = gen_back_translation(male_cog_distortions)
male_beh_aug_data = gen_back_translation(male_behavioral)
male_phys_aug_data = gen_back_translation(male_physiological)

In [None]:
# Back-translate the two gender-specific symptom subsets.
# NOTE(review): neither result is referenced again in the visible part of this
# notebook. Confirm whether this cell is still needed before paying for the run.
fem_socialwithdr_aug_data = gen_back_translation(fem_socialwithdr)
male_failsense_aug_data = gen_back_translation(male_failsense)

In [None]:
# Keep 125 of the back-translated male control sentences.
# random_state pins the draw so that re-running the cell on the same frame
# selects the same rows. Sampling is without replacement, so the frame must
# hold at least 125 rows or .sample raises ValueError.
male_control_aug_data2 = male_control_aug_data.sample(125, random_state=42)

In [None]:
#for the category model
# Draw a fixed number of back-translated rows from each subset and stack them.
# random_state pins every draw so that re-running the cell on the same frames
# selects the same rows. Sampling is without replacement, so each frame must
# hold at least the requested number of rows or .sample raises ValueError.
# NOTE(review): the GABDI-CV cell that follows rebinds every name assigned
# here, including `aug_data`, so on a top-to-bottom run its values are the ones
# that reach `train_all`.
male_aff_aug_data2 = male_aff_aug_data.sample(113, random_state=42)
male_mot_aug_data2 = male_mot_aug_data.sample(69, random_state=42)
fem_cognitive_aug_data2 = fem_cognitive_aug_data.sample(12, random_state=42)
male_cog_distortions_aug_data2 = male_cog_distortions_aug_data.sample(111, random_state=42)
male_beh_aug_data2 = male_beh_aug_data.sample(63, random_state=42)

aug_data = pd.concat([male_aff_aug_data2, male_mot_aug_data2, fem_cognitive_aug_data2, 
                      male_cog_distortions_aug_data2, male_beh_aug_data2])

In [None]:
#for the GABDI-CV model
# Draw a fixed number of back-translated rows from each subset and stack them.
# random_state pins every draw so that re-running the cell on the same frames
# selects the same rows. Sampling is without replacement, so each frame must
# hold at least the requested number of rows or .sample raises ValueError.
# NOTE(review): this cell rebinds the names assigned by the category-model cell
# before it, including `aug_data`.
male_aff_aug_data2 = male_aff_aug_data.sample(41, random_state=42)
male_mot_aug_data2 = male_mot_aug_data.sample(54, random_state=42)
fem_cognitive_aug_data2 = fem_cognitive_aug_data.sample(5, random_state=42)
male_cog_distortions_aug_data2 = male_cog_distortions_aug_data.sample(48, random_state=42)
male_beh_aug_data2 = male_beh_aug_data.sample(35, random_state=42)
male_phys_aug_data2 = male_phys_aug_data.sample(1, random_state=42)

aug_data = pd.concat([male_aff_aug_data2, male_mot_aug_data2, fem_cognitive_aug_data2, 
                      male_cog_distortions_aug_data2, male_beh_aug_data2, male_phys_aug_data2])

In [None]:
#aug_data = pd.concat([fem_aug_data, male_aug_data], ignore_index=True)

In [None]:
# Full training set (duplicates included) plus the sampled back-translated
# male control sentences, renumbered 0..n-1.
# NOTE(review): the next cell assigns `train_all` again from `train` and
# `aug_data`, so on a top-to-bottom run this value is replaced by that one.
train_all = pd.concat([train, male_control_aug_data2], ignore_index=True)

In [None]:
# Full training set (duplicates included) plus the sampled back-translated
# category sentences in `aug_data`, renumbered 0..n-1.
# NOTE(review): this starts again from `train`, so the control augmentation
# concatenated in the previous cell is not part of the result. Confirm whether
# the two augmentations are meant to be combined.
train_all = pd.concat([train, aug_data], ignore_index=True)

In [None]:
#train_all.to_csv("all_CV_back_trans.csv", index=False)
#train_all.to_csv("all_back_trans_train.csv", index=False)

**Oversampling**

In [None]:
# Augment the negative (Label == 0) sentences for the GABDI-CV model by
# re-using 125 existing male control rows, each at most once.
# random_state pins the draw so that re-running the cell on the same frame
# selects the same rows. With replace=False the frame must hold at least 125
# rows or .sample raises ValueError.
sample_male_control = male_control.sample(125, replace=False, random_state=42)

In [None]:
#sample_fem_socialwithdr = fem_socialwithdr.sample(9, replace=True)
#sample_male_failsense = male_failsense.sample(30, replace=True)

In [None]:
#for the category model
# Oversample each subset with replacement, so a row can be drawn more than once.
# random_state pins every draw so that re-running the cell on the same frames
# selects the same rows.
# NOTE(review): the cell that follows rebinds all five names assigned here.
sample_male_aff = male_affective.sample(113, replace=True, random_state=42)
sample_male_mot = male_motivational.sample(69, replace=True, random_state=42)
sample_fem_cognitive = fem_cognitive.sample(12, replace=True, random_state=42)
sample_male_cog_dist = male_cog_distortions.sample(111, replace=True, random_state=42)
sample_male_beh = male_behavioral.sample(63, replace=True, random_state=42)

In [None]:
#for the GABDI-Sen model
# NOTE(review): the sample sizes here equal those of the back-translation cell
# labelled "GABDI-CV". Confirm which model name is the intended one.
# Draws with replace=False use each row at most once and need a frame with at
# least that many rows; draws with replace=True may repeat rows.
# random_state pins every draw so that re-running the cell on the same frames
# selects the same rows.
# This cell rebinds the names assigned by the category-model cell before it.
sample_male_aff = male_affective.sample(41, replace=False, random_state=42)
sample_male_mot = male_motivational.sample(54, replace=True, random_state=42)
sample_fem_cognitive = fem_cognitive.sample(5, replace=False, random_state=42)
sample_male_cog_dist = male_cog_distortions.sample(48, replace=True, random_state=42)
sample_male_beh = male_behavioral.sample(35, replace=True, random_state=42)
sample_male_phy = male_physiological.sample(1, replace=False, random_state=42)

In [None]:
#sample_fem_cognitive = fem_cognitive.sample(12, replace=True)
#sample_male_cog_dist = male_cog_distortions.sample(111, replace=True)

In [None]:
#category model
# Stack the oversampled category subsets (no physiological subset here).
# NOTE(review): the `sample_*` names are assigned twice above, first for the
# category model and then for the GABDI-Sen model. On a top-to-bottom run this
# concat therefore receives the GABDI-Sen samples, not the category-model ones.
sample_aug = pd.concat([sample_male_aff, sample_male_mot, sample_fem_cognitive, 
                      sample_male_cog_dist, sample_male_beh])

In [None]:
#GABDI-Sen model
# Stack the oversampled subsets, including the physiological one.
# NOTE(review): this rebinds `sample_aug`, replacing the category-model
# concatenation from the previous cell.
sample_aug = pd.concat([sample_male_aff, sample_male_mot, sample_fem_cognitive, 
                      sample_male_cog_dist, sample_male_beh, sample_male_phy])

In [None]:
#sample_aug = pd.concat([sample_fem_cognitive, sample_male_cog_dist], ignore_index=True)

In [None]:
# Full training set (duplicates included) plus the oversampled male control
# rows, renumbered 0..n-1.
# NOTE(review): the next cell assigns `sample_train_all` again from `train` and
# `sample_aug`, so on a top-to-bottom run this value is replaced by that one.
sample_train_all = pd.concat([train, sample_male_control], ignore_index=True)

In [None]:
# Full training set (duplicates included) plus the oversampled category rows
# in `sample_aug`, renumbered 0..n-1.
# NOTE(review): this starts again from `train`, so the oversampled control rows
# concatenated in the previous cell are not part of the result.
sample_train_all = pd.concat([train, sample_aug], ignore_index=True)
# Disabled variants that add only one oversampled subset to the training set:
#sample_train_cognitive = pd.concat([train, sample_fem_cognitive], ignore_index=True)
#sample_train_cog_dist = pd.concat([train, sample_male_cog_dist], ignore_index=True)

In [None]:
#sample_train_all.to_csv("all_CV_sample.csv", index=False)

In [None]:
#sample_train_all.to_csv("cog_cogdist_sample_train.csv", index=False)
#sample_train_cognitive.to_csv('cognitive_sample_train.csv', index=False)
#sample_train_cog_dist.to_csv('cogdist_sample_train.csv', index=False)