# Combining sub-cohorts into A and D cohorts, without comorbids

In this notebook, I will combine all subcohorts with the general A and D cohorts

Then, I will filter out comorbid users and save the DataFrames for preprocessing

In [1]:
# Import libraries
import pandas as pd

## 1. Load in mapping dataset

The mapping dataset maps the usernames with their group membership to depression, anxiety or comorbid. For the sake of the RQ1 and RQ2, I only want to retain the users who belong to the depression and axiety groups.

Some users have within-group comorbidity, in which case I want to retain only the first username entry, because otherwise there's duplicates.

#### Within-cohort comorbidity

With within-cohort comorbidity I mean users who belong to more than one disorder group within the anxiety or depression cohort (e.g. anxiety and panic disorder). For the purposes of RQ01, only one of these users' data entries should be retained.

Comorbid cases in the depression_true index list are characterised by having a 'USER = USER' entry under user_id. By using an if-else statement in a for-loop, we can find these users and split these indexes, while stripping what comes after the '=' sign.

In [2]:
mapping = pd.read_csv('../Data/comorbid_with_text/cohort_mapping.tsv', sep='\t', index_col="comorbidities")

#### 1.1 Get the user_id's that belong to the depression group

In [3]:
# Get all rows for which group_depression is True
depression_true = mapping[mapping['group_depression'] == True].index

# Check how many there are
print(len(depression_true))

# Get a list of depressed users, discarding the second user-id after the '=' sign
depression_users_list = []
for user_id in depression_true:
    # Split by '=' and take the first part
    if '=' in user_id:
        depression_users_list.append(user_id.split('=')[0].strip())
    else:
        depression_users_list.append(user_id.strip())

# Check if the usernames are correct and if this number is the same as the number printed above
print(depression_users_list[:500])
len(depression_users_list)

930
['uDEP000', 'uDEP001', 'uDEP002', 'uDYS000', 'uDEP004', 'uDEP005', 'uDEP006', 'uDEP007', 'uDEP008', 'uDEP009', 'uDEP010', 'uDEP011', 'uDEP012', 'uDEP013', 'uDEP014', 'uDEP015', 'uDEP016', 'uDEP018', 'uDEP019', 'uDEP020', 'uDEP021', 'uSAD002', 'uDEP023', 'uDEP024', 'uDEP025', 'uDEP028', 'uDEP029', 'uDEP030', 'uDEP031', 'uDEP033', 'uDEP038', 'uDEP040', 'uDEP041', 'uDEP045', 'uDEP046', 'uDEP048', 'uDEP049', 'uDEP050', 'uDEP051', 'uDEP052', 'uDEP053', 'uDEP054', 'uDEP055', 'uDEP056', 'uDEP059', 'uDEP063', 'uDEP066', 'uDEP067', 'uDEP068', 'uDEP069', 'uDEP070', 'uDEP071', 'uDEP072', 'uDEP073', 'uDEP075', 'uDEP078', 'uDEP079', 'uSAD005', 'uDEP081', 'uDEP082', 'uDEP083', 'uDEP084', 'uDEP085', 'uDEP089', 'uDEP090', 'uDEP091', 'uDEP092', 'uDEP093', 'uDEP095', 'uDEP097', 'uDEP099', 'uDEP100', 'uDEP101', 'uDEP102', 'uDEP104', 'uDEP105', 'uDEP107', 'uDEP108', 'uDEP110', 'uDEP111', 'uDEP112', 'uDEP113', 'uDEP115', 'uDEP116', 'uDEP118', 'uDEP119', 'uDEP120', 'uDEP122', 'uDEP124', 'uDEP126', 'uDEP

930

Check how many comorbid individuals there are for the 'cohort construction' subsection

In [5]:
# Get all rows for which group_depression is True
comorbid_true = mapping[mapping['comorbid'] == True].index

# Check how many there are
print(len(comorbid_true))


520


#### 1.2 Get the user-id's that belong to the anxiety group

In [6]:
# Get all rows fro which group_anxiety is True
anxiety_true = mapping[mapping['group_anxiety'] == True].index

# Check how many there are
print(len(anxiety_true))

# Get a list of depressed users, discarding the second user-id after the '=' sign
anxiety_users_list = []
for user_id in anxiety_true:
    # Split by '=' and take the first part
    if '=' in user_id:
        anxiety_users_list.append(user_id.split('=')[0].strip())
    else:
        anxiety_users_list.append(user_id.strip())

# Check if the usernames are correct and if this number is the same as the number printed above
print(anxiety_users_list[:20])
print(len(anxiety_users_list))

869
['uANX000', 'uOCD000', 'uGAD073', 'uGAD162', 'uANX002', 'uANX003', 'uGAD032', 'uOCD001', 'uOCD002', 'uANX004', 'uGAD080', 'uGAD045', 'uGAD026', 'uOCD003', 'uGAD125', 'uGAD094', 'uANX009', 'uANX010', 'uANX011', 'uOCD004']
869


## 2. The demographics datasets

### 2.1. Load in the demographic datasets

In [7]:
# The depression datasets:
dep_demo = pd.read_csv('../Data/comorbid_with_text/depression/demographics.tsv', header=[0,1], index_col=[0], sep='\t')
dys_demo = pd.read_csv('../Data/comorbid_with_text/dysthymia/demographics.tsv', header=[0,1], index_col=[0], sep='\t')
sad_demo = pd.read_csv('../Data/comorbid_with_text/SAD/demographics.tsv', header=[0,1], index_col=[0], sep='\t')
pdd_demo = pd.read_csv('../Data/comorbid_with_text/PDD/demographics.tsv', header=[0,1], index_col=[0], sep='\t')

# The anxiety datasets:
anx_demo = pd.read_csv('../Data/comorbid_with_text/anxiety/demographics.tsv', header=[0,1], index_col=[0], sep='\t')
ago_demo = pd.read_csv('../Data/comorbid_with_text/agoraphobia/demographics.tsv', header=[0,1], index_col=[0], sep='\t')
gad_demo = pd.read_csv('../Data/comorbid_with_text/GAD/demographics.tsv', header=[0,1], index_col=[0], sep='\t')
ocd_demo = pd.read_csv('../Data/comorbid_with_text/OCD/demographics.tsv', header=[0,1], index_col=[0], sep='\t')
panic_demo = pd.read_csv('../Data/comorbid_with_text/panic/demographics.tsv', header=[0,1], index_col=[0], sep='\t')
phobia_demo = pd.read_csv('../Data/comorbid_with_text/phobia/demographics.tsv', header=[0,1], index_col=[0], sep='\t')


In [20]:
# Get the lengths of each disorder for Table 1
# Depression
print("The number of individuals in depression group:", len(dep_demo))
print("The number of individuals in dysthymia group:", len(dys_demo))
print("The number of individuals in SAD group:", len(sad_demo))
print("The number of individuals in PDD group:", len(pdd_demo))

# Anxiety
print("The number of individuals in anxiety group:", len(anx_demo))
print("The number of individuals in agoraphobia group:", len(ago_demo))
print("The number of individuals in GAD group:", len(gad_demo))
print("The number of individuals in OCD group:", len(ocd_demo))
print("The number of individuals in panic group:", len(panic_demo))
print("The number of individuals in phobia group:", len(phobia_demo))



The number of individuals in depression group: 1434
The number of individuals in dysthymia group: 14
The number of individuals in SAD group: 7
The number of individuals in PDD group: 2
The number of individuals in anxiety group: 1065
The number of individuals in agoraphobia group: 15
The number of individuals in GAD group: 163
The number of individuals in OCD group: 234
The number of individuals in panic group: 68
The number of individuals in phobia group: 10


### 2.2. Combine subdisorder datasets into general depression(D) and anxiety (A) cohorts

In [8]:
# Depression datasets:
datasets = [dep_demo, dys_demo, sad_demo, pdd_demo]
depression_demo = pd.concat(datasets)

# Anxiety datasets:
datasets = [anx_demo, gad_demo, ocd_demo, panic_demo, phobia_demo, ago_demo]
anxiety_demo = pd.concat(datasets)

In [9]:
# Get dataset lengths
print("Number of datapoints in depression demographic cohort:", len(depression_demo))
print("Number of datapoints in anxiety demographic cohort:", len(anxiety_demo))

Number of datapoints in depression demographic cohort: 1457
Number of datapoints in anxiety demographic cohort: 1555


#### 2.3 Filter the depression demographics dataset

In [10]:
# Filter the depression DataFrame to keep only users in depression_users_list
depression_demographics = depression_demo[depression_demo.index.isin(depression_users_list)]

# Check the number of rows in the filtered DataFrame
print(f"Number of rows in the filtered DataFrame: {len(depression_demographics)}")

Number of rows in the filtered DataFrame: 930


Okay so this works.

#### 2.4 Filter the anxiety demographics dataset

In [11]:
# Filter the anxiety DataFrame to keep only users in anxiety_users_list
anxiety_demographics = anxiety_demo[anxiety_demo.index.isin(anxiety_users_list)]

# Check the number of rows in the filtered DataFrame
print(f"Number of rows in the filtered DataFrame: {len(anxiety_demographics)}")

Number of rows in the filtered DataFrame: 869


This number is the same as the filtered list, so the filtering was successful

#### 2.5 Save the filtered demographics datasets

In [14]:
pd.DataFrame(depression_demographics).to_csv('../Data/Combined_sets/depression_demographics.tsv', sep="\t")
pd.DataFrame(anxiety_demographics).to_csv('../Data/Combined_sets/anxiety_demographics.tsv', sep="\t")

## 3. Cohort-level datasets

### 3.1 Load in the cohort-level datasets

In [13]:
# The depression datasets:
dep_cohort = pd.read_csv('../Data/comorbid_with_text/depression/cohort.tsv', sep='\t', index_col='tweet_id')
dys_cohort = pd.read_csv('../Data/comorbid_with_text/dysthymia/cohort.tsv', sep='\t', index_col='tweet_id')
sad_cohort = pd.read_csv('../Data/comorbid_with_text/SAD/cohort.tsv', sep='\t', index_col='tweet_id')
pdd_cohort = pd.read_csv('../Data/comorbid_with_text/PDD/cohort.tsv', sep='\t', index_col='tweet_id')


# The anxiety datasets:
anx_cohort = pd.read_csv('../Data/comorbid_with_text/anxiety/cohort.tsv', sep='\t', index_col='tweet_id')
gad_cohort = pd.read_csv('../Data/comorbid_with_text/GAD/cohort.tsv', sep='\t', index_col='tweet_id')
ocd_cohort = pd.read_csv('../Data/comorbid_with_text/OCD/cohort.tsv', sep='\t', index_col='tweet_id')
panic_cohort = pd.read_csv('../Data/comorbid_with_text/panic/cohort.tsv', sep='\t', index_col='tweet_id')
phobia_cohort = pd.read_csv('../Data/comorbid_with_text/phobia/cohort.tsv', sep='\t', index_col='tweet_id')
ago_cohort = pd.read_csv('../Data/comorbid_with_text/agoraphobia/cohort.tsv', sep='\t', index_col='tweet_id')

#### 3.2 Combine subdisorder datasets into general depression(D) and anxiety (A) cohorts

In [14]:
# Depression datasets:
datasets = [dep_cohort, dys_cohort, sad_cohort, pdd_cohort]
depression_cohort = pd.concat(datasets)

# Anxiety datasets:
datasets = [anx_cohort, gad_cohort, ocd_cohort, panic_cohort, phobia_cohort, ago_cohort]
anxiety_cohort = pd.concat(datasets)

# Test if it worked:
print("The number of tweets in the depression set:", len(depression_cohort))
print("The number of tweets in the anxiety set:", len(anxiety_cohort))


The number of tweets in the depression set: 1921632
The number of tweets in the anxiety set: 2158123


#### 3.3 Depression cohort

In [15]:
# Filter the depression DataFrame to keep only users in depression_users_list
filtered_depression_cohort = depression_cohort[depression_cohort['user_id'].isin(depression_users_list)]

# Check the number of rows in the filtered DataFrame
print(f"Number of rows in the filtered DataFrame: {len(filtered_depression_cohort)}")

Number of rows in the filtered DataFrame: 1214351


In [16]:
# Number of tweets lost:
len(depression_cohort) - len(filtered_depression_cohort)

707281

#### 3.4 Anxiety cohort

In [17]:
# Filter the anxiety DataFrame to keep only users in anxiety_users_list
filtered_anxiety_cohort = anxiety_cohort[anxiety_cohort['user_id'].isin(anxiety_users_list)]

# Check the number of rows in the filtered DataFrame
print(f"Number of rows in the filtered DataFrame: {len(filtered_anxiety_cohort)}")

Number of rows in the filtered DataFrame: 1223491


In [18]:
# Number of tweets lost:
len(anxiety_cohort) - len(filtered_anxiety_cohort)

934632

#### 3.5 Save the cohort datasets

In [19]:
pd.DataFrame(filtered_depression_cohort).to_csv('../Data/Combined_sets/depression_cohort.tsv', sep="\t")
pd.DataFrame(filtered_anxiety_cohort).to_csv('../Data/Combined_sets/anxiety_cohort.tsv', sep="\t")

## 4. Tweet-level datasets

### 4.1. Load in the full_tweets datasets

In [22]:
# The depression datasets:
dep_tweet = pd.read_csv('../Data/comorbid_with_text/depression/full_text.tsv', sep='\t', index_col='tweet_id')
dys_tweet = pd.read_csv('../Data/comorbid_with_text/dysthymia/full_text.tsv', sep='\t', index_col='tweet_id')
sad_tweet = pd.read_csv('../Data/comorbid_with_text/SAD/full_text.tsv', sep='\t', index_col='tweet_id')
pdd_tweet = pd.read_csv('../Data/comorbid_with_text/PDD/full_text.tsv', sep='\t', index_col='tweet_id')

# The anxiety datasets:
anx_tweet = pd.read_csv('../Data/comorbid_with_text/anxiety/full_text.tsv', sep='\t', index_col='tweet_id')
gad_tweet = pd.read_csv('../Data/comorbid_with_text/GAD/full_text.tsv', sep='\t', index_col='tweet_id')
ocd_tweet = pd.read_csv('../Data/comorbid_with_text/OCD/full_text.tsv', sep='\t', index_col='tweet_id')
panic_tweet = pd.read_csv('../Data/comorbid_with_text/panic/full_text.tsv', sep='\t', index_col='tweet_id')
phobia_tweet = pd.read_csv('../Data/comorbid_with_text/phobia/full_text.tsv', sep='\t', index_col='tweet_id')
ago_tweet = pd.read_csv('../Data/comorbid_with_text/agoraphobia/full_text.tsv', sep='\t', index_col='tweet_id')

#### 4.2 Combine subdisorder datasets into general depression(D) and anxiety (A) cohorts

In [23]:
# Depression datasets:
datasets = [dep_tweet, dys_tweet, sad_tweet, pdd_tweet]
depression_tweets = pd.concat(datasets)

# Anxiety datasets:
datasets = [anx_tweet, gad_tweet, ocd_tweet, panic_tweet, phobia_tweet, ago_tweet]
anxiety_tweets = pd.concat(datasets)

# Test if it worked
print("The number of tweets in the depression set:",len(depression_tweets))
print("The number of tweets in the anxiety set:", len(anxiety_tweets))


The number of tweets in the depression set: 1921632
The number of tweets in the anxiety set: 2158123


#### 4.3 Get the depression_true tweet_ids

##### 4.3.1 Depression tweet-id's

In [24]:
# Get tweet ids
dep_tweet_ids = filtered_depression_cohort.index.tolist()
print(dep_tweet_ids[:10])

['tD0732736', 'tD0732737', 'tD0732738', 'tD0732739', 'tD0732740', 'tD0732741', 'tD0732742', 'tD0732743', 'tD0732744', 'tD0732745']


##### 4.3.2 Anxiety tweet-id's

In [25]:
# Get tweet ids
anx_tweet_ids = filtered_anxiety_cohort.index.tolist()
print(anx_tweet_ids[:10])

['tD2619064', 'tD2619065', 'tD2619066', 'tD2619067', 'tD2619068', 'tD2619069', 'tD2619070', 'tD2619071', 'tD2619072', 'tD2619073']


#### 4.4 Filter depression only tweets

In [26]:
# Filter the depression DataFrame to keep only users in depression_users_list
filtered_depression_tweets = depression_tweets[depression_tweets.index.isin(dep_tweet_ids)]

# Check the number of rows in the filtered DataFrame
print(f"Number of rows in the filtered DataFrame: {len(filtered_depression_tweets)}")

Number of rows in the filtered DataFrame: 1214351


#### 4.5 Filter anxiety only tweets

In [27]:
# Filter the anxiety DataFrame to keep only users in anxiety_users_list
filtered_anxiety_tweets = anxiety_tweets[anxiety_tweets.index.isin(anx_tweet_ids)]

# Check the number of rows in the filtered DataFrame
print(f"Number of rows in the filtered DataFrame: {len(filtered_anxiety_tweets)}")

Number of rows in the filtered DataFrame: 1223491


#### 4.6 Save tweet datasets

In [26]:
pd.DataFrame(filtered_depression_tweets).to_csv('../Data/Combined_sets/depression_tweets.tsv', sep="\t")
pd.DataFrame(filtered_anxiety_tweets).to_csv('../Data/Combined_sets/anxiety_tweets.tsv', sep="\t")