In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Reading csv file
# Loads '../Datasets/data.csv' into `df`; the relative path is resolved
# against the notebook's working directory.
# NOTE(review): read_csv is called with its default separator - confirm the
# file is comma-delimited.
df = pd.read_csv('../Datasets/data.csv')

In [None]:
# Size of dataset (rows, columns)
# Displayed as the cell output; nothing is modified.
df.shape

In [None]:
# Columns
# Displays the column labels of the loaded frame; nothing is modified.
df.columns

In [None]:
# Datatypes of columns
# Displays the dtype pandas inferred for every column; nothing is modified.
df.dtypes

In [None]:
# Checking for null values
# Per-column count of missing entries, shown as the cell output.
df.isna().sum()

In [None]:
# Dropping unnamed columns
# Keeps only the first 172 columns, selected by position (not by label).
# NOTE(review): presumably every column from position 172 onwards is an
# unnamed/empty artefact of the export - confirm against the raw file.
df = df.iloc[:, :172]

In [None]:
# Dropping position and time columns
# For each of the 42 questions, remove its 'Q<n>I' (position) and 'Q<n>E'
# (time) column. All labels are collected first and dropped in one call.
pos_time_cols = [
    'Q' + str(n) + suffix
    for n in range(1, 43)
    for suffix in ('I', 'E')
]
df = df.drop(columns=pos_time_cols)

In [None]:
# Dropping unnecessary columns
# Columns removed from the frame before the questionnaire items are processed.
drop_columns = [
    'engnat', 'hand', 'religion', 'orientation', 'race',
    'voted', 'married', 'major', 'country', 'screensize',
    'uniquenetworklocation', 'source',
    'introelapse', 'testelapse', 'surveyelapse',
]
df = df.drop(columns=drop_columns)

In [None]:
# Dropping the answer columns ('Q<n>A') of the questions listed in Q_to_drop
Q_to_drop = [
    '1', '5', '7', '9', '11', '12', '15', '16', '18', '19', '21',
    '22', '23', '24', '26', '27', '30', '34', '35', '36', '37',
]
answer_cols_to_drop = ['Q' + num + 'A' for num in Q_to_drop]
df = df.drop(columns=answer_cols_to_drop)

In [None]:
# Mapping questions
# q1 lists the original question numbers in the order of the new positions
# they are renamed to: position 1 <- Q29, position 2 <- Q2, and so on.
q1 = [29, 2, 3, 4, 42, 6, 41, 33, 40, 10, 39, 8, 13, 32, 28, 31, 17, 14, 25, 20, 38]
# original question number (as str) -> new position (as str)
q_map = {str(key): str(i + 1) for i, key in enumerate(q1)}
# Subscale tag for each new position; the tag is embedded in the new column
# label and is what the score cells match on ('(S)', '(A)', '(D)').
dass = ['S', 'A', 'D', 'A', 'D', 'S', 'A', 'S', 'A', 'D', 'S', 'S', 'D', 'S', 'A', 'D', 'D', 'S', 'A', 'A', 'D']

# 'Q<old>A' -> 'Q<new>(<tag>)', e.g. 'Q29A' -> 'Q1(S)'
rename_map = {
    'Q' + old_num + 'A': 'Q' + new_num + '(' + tag + ')'
    for (old_num, new_num), tag in zip(q_map.items(), dass)
}
df = df.rename(columns=rename_map)

# Shift the answers from 1..4 to 0..3.
# The result is assigned back instead of calling replace(inplace=True) on a
# column selection: that chained in-place form acts on a temporary object,
# emits a FutureWarning on recent pandas and silently does nothing under
# Copy-on-Write, which would leave the answers (and every score) unshifted.
item_cols = list(rename_map.values())
df[item_cols] = df[item_cols].replace([1, 2, 3, 4], [0, 1, 2, 3])

In [None]:
# Reindexing columns
# Put the renamed question columns first, in position order (Q1 .. Q21),
# followed by every other column in its current order.
# The labels are rebuilt from q_map and dass because q_map.values() only holds
# the bare position numbers ('1', '2', ...); testing column labels against
# those never matches, which left the column order unchanged.
question_cols = [
    'Q' + position + '(' + tag + ')'
    for position, tag in zip(q_map.values(), dass)
]
# Guard against labels that are not present so reindex cannot add NaN columns.
question_cols = [col for col in question_cols if col in df.columns]
other_cols = [col for col in df.columns if col not in question_cols]
df = df.reindex(columns=question_cols + other_cols)

In [None]:
# Calculating scores
# Each score is the row-wise sum of the columns whose label carries the
# subscale tag, multiplied by 2.
for score_col, tag in (('Str', '(S)'), ('Anx', '(A)'), ('Dep', '(D)')):
    tagged_cols = [name for name in df.columns if tag in name]
    df[score_col] = df[tagged_cols].sum(axis=1) * 2

In [None]:
# Filtering using VCL6, VCL9, VCL12
# Keep only the rows where all three of these columns are below 1.
keep_rows = df['VCL6'].lt(1) & df['VCL9'].lt(1) & df['VCL12'].lt(1)
df = df[keep_rows]

In [None]:
# Categorizing scores
# Each score is binned into the same five labels; only the bin edges differ
# per scale. Bins are right-inclusive (pd.cut default), so -1 as the lowest
# edge puts a score of 0 into the first bin.
classes = ['Normal', 'Mild', 'Moderate', 'Severe', 'Extremely Severe']
severity_bins = (
    ('Depression', 'Dep', [-1, 9, 13, 20, 27, 42]),
    ('Anxiety', 'Anx', [-1, 7, 9, 14, 19, 42]),
    ('Stress', 'Str', [-1, 14, 18, 25, 33, 42]),
)
for label_col, score_col, edges in severity_bins:
    df[label_col] = pd.cut(df[score_col], bins=edges, labels=classes)

In [None]:
# Replacing categorical values with numeric
# The three columns are categoricals produced by pd.cut, so the labels are
# swapped for numbers by renaming the categories. Calling Series.replace on a
# categorical to change its categories is deprecated in recent pandas;
# rename_categories is the documented replacement and yields the same values.
scale_mapper = {'Normal': 0, 'Mild': 1, 'Moderate': 2, 'Severe': 3, 'Extremely Severe': 4}
for col in ['Depression', 'Anxiety', 'Stress']:
    df[col] = df[col].cat.rename_categories(scale_mapper)

In [None]:
# Saving preprocessed dataframe
# Written without the index column. At this point the VCL columns are still
# present and the age / familysize filters in the cells below have not yet
# been applied.
df.to_csv('../Datasets/CleanData.csv', index=False)

In [None]:
# Dropping VCL columns
# Remove 'VCL1' through 'VCL16' in a single call.
vcl_cols = ['VCL' + str(n) for n in range(1, 17)]
df = df.drop(columns=vcl_cols)

In [None]:
# Filtering age
# Keep rows with 18 <= age < 85.
age = df['age']
df = df[age.ge(18) & age.lt(85)]

In [None]:
# Filtering family size
# Keep rows whose familysize is at most 15.
family_size_ok = df['familysize'].le(15)
df = df[family_size_ok]

In [None]:
# Separating datasets for Depression, Anxiety, and Stress
# Each subset holds the question columns carrying one subscale tag plus the
# matching severity column as the last column.
dep_items = [name for name in df.columns if '(D)' in name]
anx_items = [name for name in df.columns if '(A)' in name]
str_items = [name for name in df.columns if '(S)' in name]

df_dep = df[[*dep_items, 'Depression']]
df_anx = df[[*anx_items, 'Anxiety']]
df_str = df[[*str_items, 'Stress']]

In [None]:
# Saving separate datasets
# One CSV per subset, written without the index column.
for frame, out_path in (
    (df_dep, '../Datasets/Depression.csv'),
    (df_anx, '../Datasets/Anxiety.csv'),
    (df_str, '../Datasets/Stress.csv'),
):
    frame.to_csv(out_path, index=False)

In [None]:
# Categorizing family size
# Right-inclusive bins: (-1, 5] -> 'Nuclear', (5, 10] -> 'Joint',
# (10, 15] -> 'Extended'.
family_classes = ['Nuclear', 'Joint', 'Extended']
family_edges = [-1, 5, 10, 15]
df['family'] = pd.cut(df['familysize'], bins=family_edges, labels=family_classes)

In [None]:
# Encoding family
# Fit a LabelEncoder on the 'family' column, then store the integer codes it
# assigns in 'family_enc'.
from sklearn import preprocessing
label_enc = preprocessing.LabelEncoder()
label_enc.fit(df['family'])
df['family_enc'] = label_enc.transform(df['family'])

In [None]:
# Categorizing age
# Right-inclusive bins: (17, 25], (25, 35], (35, 45], (45, 60], (60, inf).
# The lowest edge is 17 so that '18-25' starts at 18. The top edge is
# open-ended so that '60+' covers every age the age filter keeps (it allows
# ages up to 84); with a top edge of 82, ages 83 and 84 fell outside every
# bin and became NaN.
age_classes = ['18-25', '26-35', '36-45', '46-60', '60+']
age_edges = [17, 25, 35, 45, 60, float('inf')]
df['age class'] = pd.cut(df['age'], bins=age_edges, labels=age_classes)

In [None]:
# Encoding age
# Re-fits the `label_enc` instance created in the family-encoding cell on the
# 'age class' column and stores the resulting integer codes in 'age_enc'.
df['age_enc'] = label_enc.fit_transform(df['age class'])

In [None]:
# Saving final preprocessed dataframe
# Writes the frame, including the 'family', 'family_enc', 'age class' and
# 'age_enc' columns added above, without the index column.
df.to_csv('../Datasets/FinalCleanData.csv', index=False)