### 1. Demographics data (dm.csv) [one record per subject]

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Load demographics .csv
# The default below is the original author's local copy of dm.csv. To run the
# notebook on another machine without editing it, set the MSOAC_DM_CSV
# environment variable to the location of dm.csv before starting the kernel.
default_dm_path = 'C:/Users/anaso/Desktop/SOFIA MENDES/KU Leuven/Master Thesis/MSOAC Placebo dataset/csv files/dm.csv'
file_path = os.environ.get('MSOAC_DM_CSV', default_dm_path)

# create data frame
demographics = pd.read_csv(file_path)
demographics

In [None]:
# Count distinct subject identifiers (missing identifiers are not counted)
subject_ids = demographics['USUBJID'].dropna()
unique_count = len(subject_ids.unique())
print(f"Number of (unique) patients in this dataset: {unique_count}")  # all patients in the dataset

Check the percentage of missing values per column

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows
n_rows = len(demographics)
missing_percentage = demographics.isnull().sum().div(n_rows).mul(100)

# Two-column table: one row per original column, in the original column order
missing_demographics = (
    missing_percentage
    .rename_axis('Column Name')
    .reset_index(name='Missing Percentage')
)
print(missing_demographics)

We will drop the columns that are irrelevant, redundant, or have more than 85% missing values, namely:
- **STUDYID**: Study Identifier (irrelevant)
- **DOMAIN**: Domain Abbreviation (irrelevant)
- **SUBJID**: Subject Identifier for the Study (same as USUBJID)
- **RFSTDTC, RFENDTC, DTHDTC, DTHFL, SITEID, INVID, INVNAM, BRTHDTC, DMDTC, DMDY, DMENDY, DMDTC_TS, RFENDTC_TS, RFSTDTC_TS** (all missing and not even mentioned in the dictionary)
- **AGEU**: Age Units (NA when AGE missing or YEARS - irrelevant)
- **ACTARMCD**: Actual Arm Code (only NA or 1 - seems irrelevant and same info as ACTARM)
- **ACTARM**: Description of Actual Arm (only NA or PLACEBO - seems irrelevant)
- **ARMCD**: Planned Arm Code (only 1 - irrelevant and same info as ARM)
- **ARM**: Description of Planned Arm (only PLACEBO - irrelevant)
- **ETHNIC**: Ethnicity (too many missing values and *already included in RACE*) - HISPANIC OR LATINO or NOT HISPANIC OR LATINO

In [None]:
# Columns judged irrelevant, redundant, or (almost) entirely missing
columns_to_drop = [
    'STUDYID', 'DOMAIN', 'SUBJID',
    'RFSTDTC', 'RFENDTC', 'DTHDTC', 'DTHFL',
    'SITEID', 'INVID', 'INVNAM', 'BRTHDTC',
    'AGEU', 'ETHNIC',
    'ARMCD', 'ARM', 'ACTARMCD', 'ACTARM',
    'DMDTC', 'DMDY', 'DMENDY',
    'DMDTC_TS', 'RFENDTC_TS', 'RFSTDTC_TS',
]

# `demographics` is reassigned here, so running this cell a second time would
# look for columns that are already gone. errors='ignore' makes the cell safe
# to re-run instead of raising KeyError.
# NOTE: this also means a misspelled column name is skipped silently - check
# the displayed frame below if a column unexpectedly survives.
demographics = demographics.drop(columns=columns_to_drop, errors='ignore')
demographics

- Descriptive statistics for continuous variables (in this case, just age)

In [None]:
continuous_columns = ['AGE']

# Work on the continuous subset once instead of re-selecting it per statistic
continuous_data = demographics[continuous_columns]

# One Series per statistic, each indexed by column name
descriptive_continuous = {}
descriptive_continuous['Count'] = continuous_data.count()  # cases that are not missing
descriptive_continuous['Missing Cases'] = continuous_data.isna().sum()
descriptive_continuous['Mean'] = continuous_data.mean()
descriptive_continuous['Standard Deviation'] = continuous_data.std()

# Rows = variables, columns = statistics
cont_demographics = pd.DataFrame(descriptive_continuous)

print(cont_demographics)

In [None]:
# Ages with missing values removed (also used by the following cells)
age = demographics['AGE'].dropna()

plt.figure(figsize=(8, 6))
# stat='density' normalises the bars so the y axis really is a density, as the
# axis label and title say. The default stat is 'count', which would make the
# 'Density' label wrong.
sns.histplot(age, kde=True, stat='density', color='red', bins=30)
plt.xlabel('Age')
plt.ylabel('Density')
plt.title('Histogram with Density Line for Age')
plt.show()  # more or less normally distributed

In [None]:
# count / mean / std / min / quartiles / max of the non-missing ages
summary_stats = demographics['AGE'].dropna().describe()
summary_stats

In [None]:
# Vertical boxplot of the non-missing ages, drawn on an explicit axes object
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(y=age, color='skyblue', ax=ax)
ax.set_ylabel('Age')
ax.set_title('Boxplot for Age')
plt.show()

- Descriptive statistics for categorical variables (in this case, gender, race and country)

In [None]:
categorical_columns = ['SEX', 'RACE', 'COUNTRY']

# One summary row per categorical variable
descriptive_categorical = {}
for col in categorical_columns:
    values = demographics[col]
    modes = values.mode()  # empty when the column has no non-missing value
    descriptive_categorical[col] = {
        'Count': values.count(),  # cases that are not missing
        'Missing Cases': values.isna().sum(),
        'Unique Values': values.nunique(),
        # Guard against an all-missing column: indexing an empty mode() result
        # would raise IndexError. With ties, the first (sorted) mode is reported.
        'Mode': modes.iloc[0] if not modes.empty else pd.NA,
        'Mode Frequency': values.value_counts().max()
    }

# Transpose so that rows = variables and columns = statistics
cat_demographics = pd.DataFrame(descriptive_categorical).T
print(cat_demographics)

- Number of observations for each RACE category

In [None]:
# Frequency table for RACE (missing values are excluded by value_counts)
race_counts_demographics = (
    demographics['RACE']
    .value_counts()
    .rename_axis('Race')
    .reset_index(name='Count')
)

# Percentages are relative to the non-missing observations only
total_count = race_counts_demographics['Count'].sum()
race_counts_demographics['Percentage'] = race_counts_demographics['Count'].div(total_count).mul(100)

print(race_counts_demographics)

In [None]:
# Pie chart - dropped NA for now
category_counts = demographics['RACE'].dropna().value_counts()
# Labels must come from the counts themselves: value_counts() orders categories
# by frequency, whereas unique() orders them by first appearance, so pairing
# unique() labels with value_counts() sizes attaches wrong names to the wedges.
categories = category_counts.index.tolist()

plt.pie(category_counts, labels=categories, autopct='%1.1f%%', startangle=90)
plt.legend(labels=categories, loc='lower right', bbox_to_anchor=(1.2, 1))
plt.axis('equal')  # keep the pie circular
plt.show()

In [None]:
# Non-missing RACE values
race = demographics['RACE'].dropna()

# Bars are ordered by descending frequency (value_counts default)
fig, ax = plt.subplots(figsize=(8, 6))
race.value_counts().plot.bar(color='green', ax=ax)
ax.set_xlabel('Race')
ax.set_ylabel('Count')
ax.set_title('Bar Chart for Race')
plt.show()

- Number of observations for each SEX category

In [None]:
# Frequency table for SEX (missing values are excluded by value_counts)
sex_counts_demographics = (
    demographics['SEX']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='Count')
)

# Percentages are relative to the non-missing observations only
total_count = sex_counts_demographics['Count'].sum()
sex_counts_demographics['Percentage'] = sex_counts_demographics['Count'].div(total_count).mul(100)

print(sex_counts_demographics)

In [None]:
# Pie chart - dropped NA for now
category_counts = demographics['SEX'].dropna().value_counts()
# Labels must come from the counts themselves: value_counts() orders categories
# by frequency, whereas unique() orders them by first appearance, so pairing
# unique() labels with value_counts() sizes attaches wrong names to the wedges.
categories = category_counts.index.tolist()

plt.pie(category_counts, labels=categories, autopct='%1.1f%%', startangle=90)
plt.legend(labels=categories, loc='lower right', bbox_to_anchor=(1.2, 1))
plt.axis('equal')  # keep the pie circular
plt.show()

In [None]:
# Non-missing SEX values
sex = demographics['SEX'].dropna()

# Bars are ordered by descending frequency (value_counts default)
fig, ax = plt.subplots(figsize=(8, 6))
sex.value_counts().plot.bar(color='green', ax=ax)
ax.set_xlabel('Sex')
ax.set_ylabel('Count')
ax.set_title('Bar Chart for Sex')
plt.show()

- Number of observations for each COUNTRY category

In [None]:
# Frequency table for COUNTRY (missing values are excluded by value_counts)
country_counts_demographics = (
    demographics['COUNTRY']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Count')
)

# Percentages are relative to the non-missing observations only
total_count = country_counts_demographics['Count'].sum()
country_counts_demographics['Percentage'] = country_counts_demographics['Count'].div(total_count).mul(100)

print(country_counts_demographics)

In [None]:
# Non-missing COUNTRY values
country = demographics['COUNTRY'].dropna()

# Bars are ordered by descending frequency (value_counts default)
fig, ax = plt.subplots(figsize=(8, 6))
country.value_counts().plot.bar(color='green', ax=ax)
ax.set_xlabel('Country')
ax.set_ylabel('Count')
ax.set_title('Bar Chart for Country')
plt.show()

#### *Ideas*:
- Impute age with mean (only around 3% missing)
- Is country important for prognosis? If not, drop. If yes, what to do regarding missing values?
- COUNTRY variable (if used): should we group by continent? - it has way too many countries
- RACE variable is highly imbalanced - maybe use just two categories (white / non-white)?