# Demographics preprocessing

In [1]:
# Import libraries
import pandas as pd

## 1. Load in the multi index demographics files

Set preliminaries

In [3]:
# Thresholds used later with `>=` to turn the gender / age columns into
# boolean flags: _GEN applies to the gender columns, _AGE to the age columns.
_GEN, _AGE = 0.8, 0.6

# Never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [4]:
def _read_demographics(path):
    """Read one cohort's demographics TSV.

    The file is tab-separated, its first two rows are read as a two-level
    column header, and its first column is used as the row index.
    """
    return pd.read_csv(path, header=[0, 1], index_col=[0], sep='\t')


d_demo = _read_demographics('../Data/Combined_sets/depression_demographics.tsv')
a_demo = _read_demographics('../Data/Combined_sets/anxiety_demographics.tsv')

## 2. Preprocessing steps

### 2.1 Drop unnecessary columns

The following columns are dropped:

* diagnosis

* botometer

* org

In [5]:
# The frames were read with a two-row header, so their columns are a
# MultiIndex and the labels below are matched against its top level.
# Passing level=0 makes that explicit and avoids the PerformanceWarning that
# pandas emits when dropping from a non-lexsorted MultiIndex without a level.
_unused_columns = ['diagnosis', 'botometer', 'org']

# Depression
d_demo = d_demo.drop(columns=_unused_columns, level=0)

# Anxiety
a_demo = a_demo.drop(columns=_unused_columns, level=0)

  d_demo = d_demo.drop(columns=['diagnosis', 'botometer', 'org'])
  a_demo = a_demo.drop(columns=['diagnosis', 'botometer', 'org'])


### 2.2 Get boolean values for age and gender using parameter thresholds

#### 2.2.1 Age

In [None]:
# Build one boolean DataFrame per cohort: True wherever a value in the
# "age" column group reaches the _AGE threshold.
age_columns = pd.IndexSlice["age", :]

# depression
d_age = d_demo.loc[:, age_columns].ge(_AGE)

# anxiety
a_age = a_demo.loc[:, age_columns].ge(_AGE)

In [8]:
# Per cohort, count the True flags in each age column.
for title, flags in (
    ("Depression ages distribution:", d_age),
    ("Anxiety ages distribution:", a_age),
):
    print(title)
    print(flags.sum())

Depression ages distribution:
age  19-29    226
     30-39     88
     <=18     218
     >=40      70
dtype: int64
Anxiety ages distribution:
age  19-29    240
     30-39     96
     <=18     198
     >=40      55
dtype: int64


#### 2.2.2 Gender

In [9]:
# Build one boolean DataFrame per cohort: True wherever a value in the
# "gender" column group reaches the _GEN threshold.
gender_columns = pd.IndexSlice["gender", :]

# Depression cohort
d_gender = d_demo.loc[:, gender_columns].ge(_GEN)

# Anxiety cohort
a_gender = a_demo.loc[:, gender_columns].ge(_GEN)

In [10]:
# Per cohort, count the True flags in each gender column.
# The headings name gender (they previously said "ages", copied over from
# the age cell, which mislabelled the printed counts).
print("Depression gender distribution:")
print(d_gender.sum())

print("Anxiety gender distribution:")
print(a_gender.sum())

Depression ages distribution:
gender  male      271
        female    490
dtype: int64
Anxiety ages distribution:
gender  male      209
        female    527
dtype: int64


In [11]:
# Totals across both cohorts, computed from the boolean gender frames rather
# than retyped from printed output, so they stay correct if the data or the
# threshold changes.
gender_totals = d_gender.sum() + a_gender.sum()
print("male:", int(gender_totals[("gender", "male")]))
print("female:", int(gender_totals[("gender", "female")]))

male: 480
female: 1017


#### 2.2.3 How many unknowns?

## 3. Create new DataFrame with boolean age and gender values

In [13]:
# Overwrite every gender / age column with a boolean flag, in place, first
# for the depression frame and then for the anxiety frame:
#   gender columns -> value >= _GEN
#   age columns    -> value >= _AGE
gender_labels = ('male', 'female')
age_labels = ('19-29', '30-39', '<=18', '>=40')

for cohort in (d_demo, a_demo):
    # Create boolean columns for gender based on the threshold
    for label in gender_labels:
        key = ('gender', label)
        cohort[key] = cohort.loc[:, key] >= _GEN

    # Create boolean columns for age based on the threshold
    for label in age_labels:
        key = ('age', label)
        cohort[key] = cohort.loc[:, key] >= _AGE

    # Check it
    print(cohort.head())

        gender           age                     
          male female  19-29  30-39   <=18   >=40
user_id                                          
uDEP000   True  False  False  False  False  False
uDEP001   True  False  False  False  False   True
uDEP002   True  False  False  False  False   True
uDEP004  False  False  False  False   True  False
uDEP005  False   True  False  False  False   True
        gender           age                     
          male female  19-29  30-39   <=18   >=40
user_id                                          
uANX000   True  False  False  False  False   True
uANX002  False   True  False  False  False  False
uANX003   True  False  False   True  False  False
uANX004  False   True   True  False  False  False
uANX009   True  False  False  False  False  False


## 4. Save boolean DataFrames to new files

In [16]:
# Write the boolean frames next to the input files. The directory name is
# spelled exactly as in the load cell ('Combined_sets'); the previous
# lower-case spelling pointed at a different directory on case-sensitive
# file systems.
d_demo.to_csv('../Data/Combined_sets/d_processed_demographics.tsv', sep='\t')
a_demo.to_csv('../Data/Combined_sets/a_processed_demographics.tsv', sep='\t')