# Data Filtering and Preprocessing

This file takes the raw data for DASS and adds 5 new columns categorizing the data. Then, the data is filtered by certain conditions, and certain regions and unnecessary columns are dropped. 

# 1. Data Filtering

Import the necessary libraries and define helper functions to categorize country, age, and DASS-42 anxiety score into either string or numeric values. 

In [40]:
# Import libraries
import os
import json
import pandas as pd
import pycountry_convert as pc

In [41]:
# Define data folder
data_folder = "./data"

In [42]:
def encode_country(row):
    # Encode country into three major regions (east, west, other)
    country_code = row["country"]
    try:
        if country_code and country_code != "NONE":
            continent_name = pc.country_alpha2_to_continent_code(country_code)
            if continent_name == "AS":
                region_name = "east"
            elif continent_name in ["NA", "EU", "OC"]:
                region_name = "west"
            else:
                region_name = "other"
        else:
            region_name = ""
    except:
        region_name = ""
    return region_name

In [43]:
def encode_continent(row):
    # Encode country into three major regions (east, west, other)
    country_code = row["country"]
    try:
        if country_code and country_code != "NONE":
            continent_name = pc.country_alpha2_to_continent_code(country_code)
        else:
            continent_name = ""
    except:
        continent_name = ""
    return continent_name

In [44]:
def encode_age(row):
    # Encode age into groups
    age = int(row['age'])
    if age < 18:
        agegroup = 0
    elif age < 28:
        agegroup = 1
    elif age < 38:
        agegroup = 2
    elif age < 48:
        agegroup = 3
    elif age < 58:
        agegroup = 4
    elif age < 68:
        agegroup = 5
    else:
        agegroup = 6
    return agegroup

In [45]:
def calc_anx(row):
    # Calculate DASS-42 anxiety score
    with open(os.path.join(data_folder, "dass42_qcategories.json"), "r") as f:
        categories = json.load(f)
    anxiety_questions = [key for key in categories if categories[key] == "anxiety"]

    score = 0
    for qnum in anxiety_questions:
        score += int(row["Q{}A".format(qnum)])
    return score - len(anxiety_questions)

In [46]:
def categorize(row):
    # Classify as positive or negative (high or low) status based on threshold
    with open(os.path.join(data_folder, "dass42_scoring.json"), "r") as f:
        scoring = json.load(f)
    threshold = scoring["anxiety_score"]["severe"]["min"]  # moderate
    return (1 if row["anxiety_score"] >= threshold else 0)

Load the data and the helper functions will be called on the data to categorise the data into new columns.

In [47]:
# Load data
dataset = pd.read_csv("./DASS_data/data.csv", delimiter='\t', on_bad_lines='skip')

In [48]:
# Preprocess data
dataset["agegroup"] = dataset.apply(lambda row: encode_age(row), axis=1)
dataset["continent"] = dataset.apply(lambda row: encode_continent(row), axis=1)
dataset["region"] = dataset.apply(lambda row: encode_country(row), axis=1)
dataset["anxiety_score"] = dataset.apply(lambda row: calc_anx(row), axis=1)
dataset["anxiety_status"] = dataset.apply(lambda row: categorize(row), axis=1)

Run the cells below to see an overview of the data before filtering. 

In [49]:
# Data summary (before filtering)
print("Before filtering:")
print(dataset['gender'].value_counts())
print(dataset['age'].value_counts())
print(dataset['age'].mean(), dataset['age'].std())
print(dataset['continent'].value_counts())
print(dataset['region'].value_counts())
print(dataset['agegroup'].value_counts())
print(dataset['anxiety_status'].value_counts())

Before filtering:
gender
2    30367
1     8789
3      552
0       67
Name: count, dtype: int64
age
20      3789
21      3535
19      3510
18      3046
22      3009
        ... 
89         1
1996       1
223        1
78         1
99         1
Name: count, Length: 79, dtype: int64
23.612168447517284 21.581722299859113
continent
AS    24878
NA     9472
EU     3464
OC      828
        541
SA      351
AF      241
Name: count, dtype: int64
region
east     24878
west     13764
other      592
           541
Name: count, dtype: int64
agegroup
1    25285
0     7269
2     4421
3     1537
4      869
5      304
6       90
Name: count, dtype: int64
anxiety_status
1    20235
0    19540
Name: count, dtype: int64


In [50]:
# Data summary by continent (before filtering)
print("\nBreakdown by continent:")

print("\nContinent: Asia")
df1 = dataset[dataset['continent'] == 'AS']
print(df1['age'].mean(), df1['age'].std())
print(df1['gender'].value_counts())
print(df1['anxiety_status'].value_counts())

print("\nContinent: North America")
df2 = dataset[dataset['continent'] == 'NA']
print(df2['age'].mean(), df2['age'].std())
print(df2['gender'].value_counts())
print(df2['anxiety_status'].value_counts())

print("\nContinent: Europe")
df3 = dataset[dataset['continent'] == 'EU']
print(df3['age'].mean(), df3['age'].std())
print(df3['gender'].value_counts())
print(df3['anxiety_status'].value_counts())

print("\nContinent: South America")
df4 = dataset[dataset['continent'] == 'SA']
print(df4['age'].mean(), df4['age'].std())
print(df4['gender'].value_counts())
print(df4['anxiety_status'].value_counts())

print("\nContinent: Africa")
df5 = dataset[dataset['continent'] == 'AF']
print(df5['age'].mean(), df5['age'].std())
print(df5['gender'].value_counts())
print(df5['anxiety_status'].value_counts())

print("\nContinent: Oceania")
df6 = dataset[dataset['continent'] == 'OC']
print(df6['age'].mean(), df6['age'].std())
print(df6['gender'].value_counts())
print(df6['anxiety_status'].value_counts())


Breakdown by continent:

Continent: Asia
22.80794276067208 25.62313281199424
gender
2    20624
1     4170
3       49
0       35
Name: count, dtype: int64
anxiety_status
1    13057
0    11821
Name: count, dtype: int64

Continent: North America
24.8246410472973 12.45164961612597
gender
2    6267
1    2814
3     372
0      19
Name: count, dtype: int64
anxiety_status
0    4817
1    4655
Name: count, dtype: int64

Continent: Europe
25.09959584295612 11.212522384606052
gender
2    2137
1    1221
3     101
0       5
Name: count, dtype: int64
anxiety_status
0    1852
1    1612
Name: count, dtype: int64

Continent: South America
21.945868945868945 8.680679945384714
gender
2    185
1    156
3      9
0      1
Name: count, dtype: int64
anxiety_status
1    176
0    175
Name: count, dtype: int64

Continent: Africa
25.42738589211618 10.057538477246814
gender
2    181
1     55
3      3
0      2
Name: count, dtype: int64
anxiety_status
1    124
0    117
Name: count, dtype: int64

Continent: Oceania
27

In [51]:
num_samples = dataset.shape[0]
print("Number of samples:", num_samples)

Number of samples: 39775


Filter the data to keep data that contains: Males and Females, over 18, have a region.

Then, remove data with countries that have strict data privacy laws.

In [52]:
# Filter data
dataset = dataset.drop(dataset[(dataset['gender'] == 0) | (dataset['gender'] == 3)].index)  # Male and females only
dataset = dataset[dataset['age'] >= 18]  # Adults only
dataset = dataset[dataset['region'] != ""]  # Must have region

# Remove data with countries that have strict data privacy laws (for public use only)
dataset = dataset[dataset['continent'] != "EU"]
dataset = dataset[dataset['country'] != "CH"]
dataset = dataset[dataset['country'] != "IN"]
dataset = dataset[dataset['country'] != "JA"]
dataset = dataset[dataset['country'] != "AU"]

In [53]:
num_samples = dataset.shape[0]
print("Number of samples:", num_samples)

Number of samples: 28364


Run the cells below to see an overview of the data after filtering. 

In [38]:
# Data summary (after filtering)
print("\nAfter filtering:")
print(dataset['gender'].value_counts())
print(dataset['agegroup'].value_counts())
print(dataset['age'].mean(), dataset['age'].std())
print(dataset['continent'].value_counts())
print(dataset['region'].value_counts())
print(dataset['anxiety_status'].value_counts())


After filtering:
gender
2    22417
1     5947
Name: count, dtype: int64
agegroup
1    22615
2     3704
3     1131
4      619
5      229
6       66
Name: count, dtype: int64
24.983077140036666 24.768503249097243
continent
AS    21748
NA     6006
SA      245
AF      201
OC      164
Name: count, dtype: int64
region
east     21748
west      6170
other      446
Name: count, dtype: int64
anxiety_status
0    14657
1    13707
Name: count, dtype: int64


In [14]:
# Data summary by continent (after filtering)
print("\nBreakdown by continent:")

print("\nContinent: Asia")
df1 = dataset[dataset['continent'] == 'AS']
print(df1['age'].mean(), df1['age'].std())
print(df1['gender'].value_counts())
print(df1['anxiety_status'].value_counts())

print("\nContinent: North America")
df2 = dataset[dataset['continent'] == 'NA']
print(df2['age'].mean(), df2['age'].std())
print(df2['gender'].value_counts())
print(df2['anxiety_status'].value_counts())

print("\nContinent: Europe")
df3 = dataset[dataset['continent'] == 'EU']
print(df3['age'].mean(), df3['age'].std())
print(df3['gender'].value_counts())
print(df3['anxiety_status'].value_counts())

print("\nContinent: South America")
df4 = dataset[dataset['continent'] == 'SA']
print(df4['age'].mean(), df4['age'].std())
print(df4['gender'].value_counts())
print(df4['anxiety_status'].value_counts())

print("\nContinent: Africa")
df5 = dataset[dataset['continent'] == 'AF']
print(df5['age'].mean(), df5['age'].std())
print(df5['gender'].value_counts())
print(df5['anxiety_status'].value_counts())

print("\nContinent: Oceania")
df6 = dataset[dataset['continent'] == 'OC']
print(df6['age'].mean(), df6['age'].std())
print(df6['gender'].value_counts())
print(df6['anxiety_status'].value_counts())


Breakdown by continent:

Continent: Asia
23.287206266318538 5.7345204677729775
2    641
1    125
Name: gender, dtype: int64
1    384
0    382
Name: anxiety_status, dtype: int64

Continent: North America
30.643192488262912 13.338284137107273
2    134
1     79
Name: gender, dtype: int64
0    144
1     69
Name: anxiety_status, dtype: int64

Continent: Europe
nan nan
Series([], Name: gender, dtype: int64)
Series([], Name: anxiety_status, dtype: int64)

Continent: South America
21.1875 3.331040878364199
2    8
1    8
Name: gender, dtype: int64
0    9
1    7
Name: anxiety_status, dtype: int64

Continent: Africa
33.0 nan
2    1
Name: gender, dtype: int64
0    1
Name: anxiety_status, dtype: int64

Continent: Oceania
40.0 15.033296378372908
2    3
1    1
Name: gender, dtype: int64
0    4
Name: anxiety_status, dtype: int64


Keep the columns that are necessary for DASS.

In [15]:
# Drop unnecessary columns
# to_drop = ["source", "screensize", "uniquenetworklocation", 
#             "education", "urban", "engnat", "hand", "religion", 
#             "orientation", "race", "voted", "married", "major",
#             "introelapse", "testelapse", "surveyelapse", "familysize"]
# dataset = dataset.drop(to_drop, axis=1)

for col in dataset.columns:
    if "TIPI" in col or "VCL" in col:
        dataset = dataset.drop([col], axis=1)
    elif col[0] == "Q" and (col[-1] == "E" or col[-1] == "I"):
        dataset = dataset.drop([col], axis=1)

Save the filtered dataset under data folder.

In [16]:
# Saved filtered dataset
dataset.to_csv(os.path.join(data_folder, "data_filtered.csv"), index=None)

# 2. Data Preprocessing

Import the necessary libraries and define constants.

In [17]:
# Import libraries
import numpy as np
import os
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

In [18]:
# Define data folder
seed = 42
data_folder = "./data"

Define the preprocess function which will rebalance the minority class, one-hot encoding categorical variables, and z-score normalizing the numerical features.

Then, define the train_val_test_split function which will split and label the data into training, validation, and test datasets, which would be used for training the model, tuning model parameters, and evaluating the model's performance. 

In [19]:
def preprocess(data_df):
    """
    Pre-processing: rebalance, one-hot encode, normalize
    """
    # Separate majority and minority classes
    df_majority = data_df[data_df["anxiety_status"] == 1]
    df_minority = data_df[data_df["anxiety_status"] == 0]
    
    # Upsample minority class
    data_minority = resample(df_minority, 
                            replace=True,                       # sample with replacement
                            n_samples=len(df_majority.index),   # to match majority class
                            random_state=123)                   # reproducible results

    data_df = pd.concat([df_majority, data_minority])
    data_df = data_df.reset_index(drop=True)

    # Extract the label columns; separate features and labels
    labels_df = data_df[["anxiety_status"]].copy()
    feats_df = data_df.drop(["anxiety_score", "anxiety_status"], axis=1)

    # z-score normalization
    def z_score_norm(row, col, mean, stdev):
        z_score = (float(row[col]) - mean) / stdev
        return float(z_score)

    # One-hot encode gender and region
    label_encoder = LabelEncoder()
    oneh_encoder = OneHotEncoder()

    # Gender
    gender = label_encoder.fit_transform(feats_df["gender"])
    gender = pd.DataFrame(gender)
    gender = pd.DataFrame(oneh_encoder.fit_transform(gender).toarray())
    gender.columns = ["gender_m", "gender_f"]

    # Region
    region = label_encoder.fit_transform(feats_df["region"])
    region = pd.DataFrame(region)
    region = pd.DataFrame(oneh_encoder.fit_transform(region).toarray())
    region.columns = ["region_other", "region_east", "region_west"]

    # Combine and remove original columns
    feats_df = feats_df.drop(["gender", "country", "region", "agegroup", "continent"], axis=1)
    feats_df = pd.concat([feats_df, gender, region], axis=1)

    # One-hot encode question answers
    for col in feats_df.columns:
        if col[0] == "Q" and col[-1] == "A":
            temp = label_encoder.fit_transform(feats_df[col])
            temp = pd.DataFrame(temp)
            temp = pd.DataFrame(oneh_encoder.fit_transform(temp).toarray())

            col_names = []
            for c in temp.columns:
                col_names.append("{0}_{1}".format(col, c))
            temp.columns = col_names

            feats_df = feats_df.drop([col], axis=1)
            feats_df = pd.concat([feats_df, temp], axis=1)

    # Normalize numerical columns (Use z-score)
    mean = feats_df["age"].mean()
    stdev = feats_df["age"].std()
    feats_df["age_norm"] = feats_df.apply(
                    lambda row: z_score_norm(row, "age", mean, stdev), axis=1)
    feats_df = feats_df.drop(["age"], axis=1)

    return feats_df, labels_df

In [20]:
def train_val_test_split(feats_df, labels_df, rand=0, save=False):
    """
    Train / validation / test (holdout) dataset split
    """
    feats_arr = np.array(feats_df)
    labels_arr = np.array(labels_df)
    traintest_feats, valid_feats, traintest_labels, valid_labels = \
        train_test_split(feats_arr, labels_arr, test_size=0.10, random_state=seed)
    train_feats, holdout_feats, train_labels, holdout_labels = \
        train_test_split(traintest_feats, traintest_labels, test_size=0.1111, random_state=rand)

    train_feats = train_feats.astype(float)
    train_labels = train_labels.astype(float)
    valid_feats = valid_feats.astype(float)
    valid_labels = valid_labels.astype(float)
    holdout_feats = holdout_feats.astype(float)
    holdout_labels = holdout_labels.astype(float)

    if save:
        train_feats.to_csv(os.path.join(data_folder, "train_feats.csv"), index=None)
        train_labels.to_csv(os.path.join(data_folder, "train_labels.csv"), index=None)
        valid_feats.to_csv(os.path.join(data_folder, "valid_feats.csv"), index=None)
        valid_labels.to_csv(os.path.join(data_folder, "valid_labels.csv"), index=None)
        holdout_feats.to_csv(os.path.join(data_folder, "holdout_feats.csv"), index=None)
        holdout_labels.to_csv(os.path.join(data_folder, "holdout_labels.csv"), index=None)

    return train_feats, train_labels, valid_feats, valid_labels, holdout_feats, holdout_labels

Preprocess the filtered data by calling the preprocess function, and save the preprocessed data in the data_folder.

In [21]:
# Import dataset
data = pd.read_csv(os.path.join(data_folder, "data_filtered_1000.csv")) # Using the sample dataset. Change the file name accordingly if using another dataset. 

In [22]:
# Split into features and labels, save as CSV
feats_df, labels_df = preprocess(data)

feats_df.to_csv(os.path.join(data_folder, "features.csv"), index=None)
labels_df.to_csv(os.path.join(data_folder, "labels.csv"), index=None)