In [1]:
import pandas as pd

**Load datasets**

In [2]:
depressed = pd.read_csv("datasets/Depressed-Found-BDI-Sen-Users-Genders.csv")
control = pd.read_csv("datasets/Gendered_Control_Sen.csv")

In [3]:
train = pd.read_csv("datasets/train-with-severities-and-multilabels.csv")
val = pd.read_csv("datasets/val-with-severities-and-multilabels.csv")
test = pd.read_csv("datasets/test-with-severities-and-multilabels.csv")

**Analyze datasplits**

In [4]:
#function to attach genders to the original depressed dataset
def gender_attach(dataset):
    for i in range(len(dataset)):
        sentence = dataset.loc[i, "Sentence"]
        for j in range(len(depressed)):
            if sentence in depressed.loc[j, "Sentence"]:
                dataset.loc[i, "Subject"] = depressed.loc[j, "Subject"]
                dataset.loc[i, "Gender"] = depressed.loc[j, "Gender"]
    return dataset

In [5]:
train = gender_attach(train)
val = gender_attach(val)
test = gender_attach(test)

In [6]:
#train.to_csv("gendered_og_train.csv", index=False)
#val.to_csv("gendered_og_val.csv", index=False)
#test.to_csv("gendered_og_test.csv", index=False)

In [7]:
train["Subject"].unique()

array(['Subject 2341', 'Subject 7039', 'Subject 1272', 'Subject 3707',
       'Subject 2961', 'Subject 5791', 'Subject 2903', 'Subject 9694',
       'Subject 3993', 'Subject 2827', 'Subject 6635', 'Subject 9798',
       'Subject 2432', 'Subject 9218', 'Subject 6619', 'Subject 6900',
       nan, 'Subject 9454', 'Subject 436.', 'Subject 4058',
       'Subject 5897'], dtype=object)

In [8]:
val["Subject"].unique()

array(['Subject 2827', 'Subject 3707', 'Subject 2341', 'Subject 2961',
       'Subject 9798', 'Subject 7039', 'Subject 9454', 'Subject 3993',
       'Subject 6635', 'Subject 6619', 'Subject 6900', 'Subject 2432',
       'Subject 4058', 'Subject 5791', nan], dtype=object)

In [9]:
test["Subject"].unique()

array(['Subject 2432', 'Subject 2827', 'Subject 9694', 'Subject 6635',
       'Subject 9454', 'Subject 3707', 'Subject 9798', 'Subject 5791',
       'Subject 3993', 'Subject 2341', 'Subject 6619', 'Subject 2903',
       'Subject 7039', 'Subject 2961', 'Subject 9218', 'Subject 4058',
       'Subject 1272', 'Subject 6900', nan], dtype=object)

In [10]:
train = train.drop_duplicates()
test = test.drop_duplicates()
val = val.drop_duplicates()

In [11]:
#number of sentences per gender - train

unique_genders = train['Gender'].unique()
counts = {}

for value in unique_genders:
    counts[value] = len(train[train['Gender'] == value])

for value, count in counts.items():
    print(f"Number of sentences in train where gender is {value}: {count}")

Number of sentences in train where gender is 0.0: 156
Number of sentences in train where gender is 1.0: 270
Number of sentences in train where gender is nan: 0


In [12]:
#number of sentences per gender - val

unique_genders = val['Gender'].unique()
counts = {}

for value in unique_genders:
    counts[value] = len(val[val['Gender'] == value])

for value, count in counts.items():
    print(f"Number of sentences in val where gender is {value}: {count}")

Number of sentences in val where gender is 1.0: 46
Number of sentences in val where gender is 0.0: 32
Number of sentences in val where gender is nan: 0


In [13]:
#number of sentences per gender - test

unique_genders = test['Gender'].unique()
counts = {}

for value in unique_genders:
    counts[value] = len(test[test['Gender'] == value])

for value, count in counts.items():
    print(f"Number of sentences in test where gender is {value}: {count}")

Number of sentences in test where gender is 0.0: 59
Number of sentences in test where gender is 1.0: 100
Number of sentences in test where gender is nan: 0


**Anonymizing control set**

In [14]:
#anonymizing the control set

unique_subjects = control["Subject"].unique()

anon_usernames = ['Subject ' + str(i) for i in range(100, len(unique_subjects) + 101)]

user_mapping = dict(zip(unique_subjects, anon_usernames))

control['Subject'] = control['Subject'].map(user_mapping)

In [15]:
gendered = pd.concat([depressed, control], ignore_index=True)

In [16]:
#Beck's symptom categories

affective = [3, 6, 12, 15]
motivational = [4, 11]
cognitive = [14, 21]
cog_distortions = [5, 7, 8, 9, 10, 16]
behavioral = [13, 17, 19, 22]
physiological = [18, 20, 23]

symptom_cat = [affective, motivational, cognitive, cog_distortions, behavioral, physiological]
symptom_cat_names = ['Affective', 'Motivational', 'Cognitive', 'Cog_distortions', 'Behavioral', 'Physiological']

In [17]:
gendered['Affective'] = (gendered[gendered.columns[affective]]==1).any(axis=1).astype(int)
gendered['Motivational'] = (gendered[gendered.columns[motivational]]==1).any(axis=1).astype(int)
gendered['Cognitive'] = (gendered[gendered.columns[cognitive]]==1).any(axis=1).astype(int)
gendered['Cog_distortions'] = (gendered[gendered.columns[cog_distortions]]==1).any(axis=1).astype(int)
gendered['Behavioral'] = (gendered[gendered.columns[behavioral]]==1).any(axis=1).astype(int)
gendered['Physiological'] = (gendered[gendered.columns[physiological]]==1).any(axis=1).astype(int)

In [18]:
#gendered.to_csv('all-gendered.csv', index=False)

**New datasplit to account for genders and users**

In [19]:
#go for 7:1:2 -> 385 (8m: 148 sen, 6f: 236) , 55 (1m: 21 sen, 1f: 34), 110 (2m: 42 sen, 2f: 67)
#Number of depr sentences where gender is 0: 212
#Number of depr sentences where gender is 1: 337

#dropped aim: 354 train (7male, 6fem), 66 val (1male, 1fem), 130(3male, 2fem) test for depressed 

In [20]:
fem_control = control[control["Gender"]==1]
male_control = control[control["Gender"]==0]

fem_depressed = depressed[depressed["Gender"]==1]
male_depressed = depressed[depressed["Gender"]==0]

In [21]:
unique_subjects_fem = fem_depressed['Subject'].unique()
counts = []
names = []

for value in unique_subjects_fem:
    names.append(value)
    counts.append(len(depressed[depressed['Subject'] == value]))

for i in range(len(counts)):
    print(f"Number of sentences from {names[i]}: {counts[i]}")

Number of sentences from Subject 2827: 133
Number of sentences from Subject 3707: 93
Number of sentences from Subject 2903: 10
Number of sentences from Subject 436.: 2
Number of sentences from Subject 5897: 2
Number of sentences from Subject 6635: 33
Number of sentences from Subject 6900: 24
Number of sentences from Subject 7039: 26
Number of sentences from Subject 9454: 14


In [22]:
unique_subjects_male = male_depressed['Subject'].unique()
counts = []
names = []

for value in unique_subjects_male:
    names.append(value)
    counts.append(len(depressed[depressed['Subject'] == value]))

for i in range(len(counts)):
    print(f"Number of sentences from {names[i]}: {counts[i]}")

Number of sentences from Subject 1272: 5
Number of sentences from Subject 2341: 19
Number of sentences from Subject 2432: 15
Number of sentences from Subject 9218: 17
Number of sentences from Subject 2961: 37
Number of sentences from Subject 9798: 18
Number of sentences from Subject 3993: 44
Number of sentences from Subject 4058: 9
Number of sentences from Subject 5791: 16
Number of sentences from Subject 6619: 17
Number of sentences from Subject 9694: 15


In [23]:
#val: Subject 7039 with 26, Subject 2341 with 19

#test: Subject 5535 and Subject 6900 with 57, Subject 1272 and Subject 2961 with 42

#train: Subject 2827, Subject 3707, Subject 2903, Subject 436., Subject 5897, Subject 9454 with 254
#Subject 2432, Subject 9218, Subject 9798, Subject 3993, Subject 4058, Subject 5791, Subject 6619, Subject 9694 with 151

In [24]:
#split for depressed users

depr_vals = [depressed[depressed["Subject"]=="Subject 7039"], depressed[depressed["Subject"]=="Subject 2341"]]
depr_tests = [depressed[depressed["Subject"]=="Subject 5535"], depressed[depressed["Subject"]=="Subject 6900"],
                 depressed[depressed["Subject"]=="Subject 1272"], depressed[depressed["Subject"]=="Subject 2961"]]
depr_vals_tests = ["Subject 7039", "Subject 2341", "Subject 5535", "Subject 6900", "Subject 1272", "Subject 2961"]

depr_val = pd.concat(depr_vals, ignore_index=True)
depr_test = pd.concat(depr_tests, ignore_index=True)
depr_train = depressed[~depressed['Subject'].isin(depr_vals_tests)]
depr_train = depr_train.reset_index(drop=True)

In [25]:
unique_subjects_fem = fem_control['Subject'].unique()
counts = []
names = []

for value in unique_subjects_fem:
    names.append(value)
    counts.append(len(control[control['Subject'] == value]))

for i in range(len(counts)):
    print(f"Number of sentences from {names[i]}: {counts[i]}")

Number of sentences from Subject 111: 44
Number of sentences from Subject 112: 39
Number of sentences from Subject 113: 36
Number of sentences from Subject 114: 30
Number of sentences from Subject 115: 35
Number of sentences from Subject 116: 44
Number of sentences from Subject 117: 41
Number of sentences from Subject 118: 31
Number of sentences from Subject 119: 37


In [26]:
unique_subjects_male = male_control['Subject'].unique()
counts = []
names = []

for value in unique_subjects_male:
    names.append(value)
    counts.append(len(control[control['Subject'] == value]))

for i in range(len(counts)):
    print(f"Number of sentences from {names[i]}: {counts[i]}")

Number of sentences from Subject 100: 26
Number of sentences from Subject 101: 19
Number of sentences from Subject 102: 18
Number of sentences from Subject 103: 23
Number of sentences from Subject 104: 25
Number of sentences from Subject 105: 25
Number of sentences from Subject 106: 19
Number of sentences from Subject 107: 23
Number of sentences from Subject 108: 7
Number of sentences from Subject 109: 17
Number of sentences from Subject 110: 10


In [27]:
#split for control users

control_vals = [control[control["Subject"]=="Subject 115"], control[control["Subject"]=="Subject 101"]]
control_tests = [control[control["Subject"]=="Subject 118"], control[control["Subject"]=="Subject 113"],
                 control[control["Subject"]=="Subject 103"], control[control["Subject"]=="Subject 106"]]
control_vals_tests = ["Subject 115", "Subject 101", "Subject 118", "Subject 113", "Subject 103", "Subject 106"]

control_val = pd.concat(control_vals, ignore_index=True)
control_test = pd.concat(control_tests, ignore_index=True)
control_train = control[~control['Subject'].isin(control_vals_tests)]
control_train = control_train.reset_index(drop=True)

In [28]:
#creating the final datasplit

gendered_val = pd.concat([depr_val, control_val], ignore_index=True)
gendered_test = pd.concat([depr_test, control_test], ignore_index=True)
gendered_train = pd.concat([depr_train, control_train], ignore_index=True)

In [29]:
len(gendered_train)

824

In [30]:
len(gendered_val)

99

In [31]:
len(gendered_test)

175

In [33]:
gendered_test.columns[3:24]

Index(['Sadness', 'Pessimism', 'Sense_of_failure', 'Loss_of_Pleasure',
       'Guilty_feelings', 'Sense_of_punishment', 'Self-dislike',
       'Self-incrimination', 'Suicidal_ideas', 'Crying', 'Agitation',
       'Social_withdrawal', 'Indecision', 'Feelings_of_worthlessness',
       'Loss_of_energy', 'Change_of_sleep', 'Irritability',
       'Changes_in_appetite', 'Concentration_difficulty',
       'Tiredness_or_fatigue', 'Loss_of_interest_in_sex'],
      dtype='object')

In [34]:
counts = []
names = []

for name in gendered_test.columns[3:24]:
    names.append(name)
    counts.append(len(gendered_test[gendered_test[name] == 1]))

for i in range(len(counts)):
    print(f"Number of sentences for symptom {names[i]}: {counts[i]}")

Number of sentences for symptom Sadness: 15
Number of sentences for symptom Pessimism: 5
Number of sentences for symptom Sense_of_failure: 5
Number of sentences for symptom Loss_of_Pleasure: 17
Number of sentences for symptom Guilty_feelings: 0
Number of sentences for symptom Sense_of_punishment: 0
Number of sentences for symptom Self-dislike: 7
Number of sentences for symptom Self-incrimination: 0
Number of sentences for symptom Suicidal_ideas: 8
Number of sentences for symptom Crying: 6
Number of sentences for symptom Agitation: 4
Number of sentences for symptom Social_withdrawal: 6
Number of sentences for symptom Indecision: 0
Number of sentences for symptom Feelings_of_worthlessness: 4
Number of sentences for symptom Loss_of_energy: 2
Number of sentences for symptom Change_of_sleep: 1
Number of sentences for symptom Irritability: 8
Number of sentences for symptom Changes_in_appetite: 2
Number of sentences for symptom Concentration_difficulty: 1
Number of sentences for symptom Tired

In [156]:
#gendered_train.to_csv("gendered_train.csv", index=False)

In [157]:
#gendered_val.to_csv("gendered_val.csv", index=False)

In [158]:
#gendered_test.to_csv("gendered_test.csv", index=False)