In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

In [None]:
# Load the dataset from a CSV file; the path is relative to the notebook's
# working directory.
data = pd.read_csv('../Data/CVD_cleaned.csv')

In [None]:
display(data.head())

## Visualise the entire dataset, with one plot per column:

In [None]:
def _is_binary_column(values: pd.Series) -> bool:
    """Return True when a (NaN-free) column holds only the values 0 and 1."""
    return (not values.empty) and bool(values.isin([0, 1]).all())


def _is_numeric_column(values: pd.Series) -> bool:
    """Return True when every value of a (NaN-free) column is a number."""
    if pd.api.types.is_numeric_dtype(values):
        return True
    # Fallback for object columns that hold plain Python numbers. The
    # generator short-circuits on the first non-number, so text columns
    # are rejected without scanning every row.
    return all(isinstance(x, (int, float)) for x in values)


def plot_data_columns(data: pd.DataFrame) -> None:
    """Plot the columns of a dataframe, where every
    column is plotted as a single histogram or bar plot.
    A column with only 0 or 1 as its values is plotted
    as a bar plot and will get ticks 'Yes' or 'No'.
    Any other all-numeric column is plotted as a histogram
    with the mean and mean +/- one standard deviation marked.
    Every remaining column is plotted as a bar plot of how
    often each distinct value occurs.

    Missing values (NaN) are ignored when plotting.

    Args:
        data (pd.DataFrame): any pandas dataframe.
    """
    plots_per_row = 3
    num_cols = len(data.columns)
    if num_cols == 0:
        # Nothing to draw; a grid with zero rows would need a zero-height figure.
        return
    num_rows = int(np.ceil(num_cols / plots_per_row))

    # Figsize is hand picked
    fig = plt.figure(figsize=(15, 8 * num_rows))

    for i, col in enumerate(data.columns):
        ax = fig.add_subplot(num_rows, plots_per_row, i + 1)
        values = data[col].dropna()

        # The 0/1 check must come before the numeric check: 0 and 1 are
        # numbers, so otherwise this branch could never be reached.
        if _is_binary_column(values):
            counts = values.astype(int).value_counts().reindex([0, 1], fill_value=0)
            ax.bar([0, 1], counts.to_numpy(), edgecolor='k', color='c')
            ax.set_xticks([0, 1])
            ax.set_xticklabels(['No', 'Yes'])

        elif _is_numeric_column(values):
            # Object columns of Python numbers are converted to a numeric dtype
            numbers = pd.to_numeric(values)
            ax.hist(numbers, bins=30, edgecolor='k', color='c')
            m = numbers.mean()
            s = numbers.std(ddof=0)  # population std, what np.std computes
            ax.axvline(m, color='red', linestyle='--', label=r'Mean $\mu$')
            ax.axvline(m + s, color='green', linestyle='--', label=r'Mean $\mu \mp $ std $\sigma$')
            ax.axvline(m - s, color='green', linestyle='--')
            ax.legend()

        else:
            # Otherwise it must be a bar plot. Labels and heights are taken
            # from the same Series so every bar shows the count of its own
            # category; categories keep their order of first appearance.
            counts = values.value_counts().reindex(values.unique())
            labels = [str(label) for label in counts.index]
            ax.bar(labels, counts.to_numpy(), edgecolor='k', color='c')

        ax.tick_params(axis='x', rotation=90)
        ax.set_title(col)

    plt.tight_layout()
    plt.show()

plot_data_columns(data)

## Now visualise the data for only females:

In [None]:
# Same per-column overview, restricted to the rows where Sex is "Female".
plot_data_columns(data[data["Sex"] == "Female"])

## Now visualise the data for only males:

In [None]:
# Same per-column overview, restricted to the rows where Sex is "Male".
plot_data_columns(data[data["Sex"] == "Male"])

## The correlation matrix for our dataset:

In [None]:
# A correlation can only be computed between numeric columns,
# so every other column is dropped first.
data_num = data.select_dtypes(include=[int, float])
corr = data_num.corr()

# One tick per column, labelled with the column name on both axes.
tick_positions = np.arange(len(corr.columns))
tick_labels = corr.columns

plt.imshow(corr, cmap='viridis')
plt.colorbar()
plt.xticks(tick_positions, tick_labels, rotation=90)
plt.yticks(tick_positions, tick_labels)
plt.title('Correlation matrix for the dataset')
plt.show()

In [None]:
# Reformat the data to be suitable for logistic regression:
# - the Yes/No columns are encoded as 1/0
# - 'Sex' becomes the binary indicator column 'Sex_Male' (Male=1, Female=0)
yes_no_columns = ['Skin_Cancer', 'Other_Cancer', 'Heart_Disease', 'Depression',
                  'Smoking_History', 'Exercise', 'Arthritis']
column_encodings = {col: {'Yes': 1, 'No': 0} for col in yes_no_columns}
column_encodings['Sex'] = {'Male': 1, 'Female': 0}

encoded_columns = {}
for col, encoding in column_encodings.items():
    # Skip columns that are absent (e.g. 'Sex' after it was renamed by an
    # earlier run of this cell), so the cell stays safe to re-run.
    if col not in data.columns:
        continue
    mapped = data[col].map(encoding)
    # Values outside the encoding (including already-encoded 0/1) are kept as-is.
    encoded = mapped.where(mapped.notna(), data[col])
    # Cast explicitly instead of relying on pandas to downcast the column:
    # the model needs a real integer dtype, not an object column of ints.
    if encoded.isin([0, 1]).all():
        encoded = encoded.astype(int)
    encoded_columns[col] = encoded

data = data.assign(**encoded_columns).rename(columns={'Sex': 'Sex_Male'})

In [None]:
data.head()

In [None]:
print(data['Age_Category'].unique())

In [None]:
print(data['Diabetes'].unique())

In [None]:
# normalize numerical values
def normalize_data(data):
    """
    Min-max normalize a column of numbers to the range [0, 1].

    Each value is mapped to (value - min) / (max - min). Missing values
    (NaN) are ignored when determining the minimum and maximum and stay
    NaN in the result.

    Args:
        data: any one-dimensional sequence of numbers (list, numpy array,
            pandas Series, ...).

    Returns:
        list: the normalized values as floats, in the original order.
        An empty input gives an empty list. If all values are equal there
        is no range to scale by, so every value is mapped to 0.0.
    """
    values = np.asarray(data, dtype=float)

    # No minimum/maximum exists for empty or all-NaN input.
    if values.size == 0 or np.isnan(values).all():
        return values.tolist()

    min_value = np.nanmin(values)
    max_value = np.nanmax(values)
    value_range = max_value - min_value

    if value_range == 0:
        # Constant column: avoid dividing by zero.
        return np.where(np.isnan(values), np.nan, 0.0).tolist()

    return ((values - min_value) / value_range).tolist()

# Min-max normalize the height, weight and BMI columns.

# pick the raw columns
height_data, weight_data, bmi_data = data['Height_(cm)'], data['Weight_(kg)'], data['BMI']

# normalize all three before anything is written back to the dataframe
normalized_height, normalized_weight, normalized_bmi = (
    normalize_data(column) for column in (height_data, weight_data, bmi_data)
)

# replace the raw columns with their normalized versions
for column_name, normalized_values in (
    ('Height_(cm)', normalized_height),
    ('Weight_(kg)', normalized_weight),
    ('BMI', normalized_bmi),
):
    data[column_name] = normalized_values

data.head(10)


In [None]:
# TODO
# reformat multi-categorical variables:
# - Age (now needs to be normalised)
# - Diabetes (check if dropping all other options than 'Yes' and 'No' can be justified) 
# - General Health
# - Checkup
# either drop the consumption categories or find out what the values mean

# Set age to numerical by taking the midpoint of each existing category
# (ASSUMPTION: 80+ is estimated to 85).
age_mapping = {'18-24': 21.0, '25-29': 27.0, '30-34': 32.0, '35-39': 37.0,'40-44': 42.0, '45-49': 47.0, '50-54': 52.0, 
               '55-59': 57.0,'60-64': 62.0, '65-69': 67.0, '70-74': 72.0, '75-79': 77.0, '80+': 85.0}

# Only transform while the source column still exists: this cell renames it,
# so without the guard a second run of the cell would fail with a KeyError.
if 'Age_Category' in data.columns:
    # apply the mapping
    age_numeric = data['Age_Category'].map(age_mapping)

    # Categories missing from the mapping would silently turn into NaN.
    unmapped = data.loc[age_numeric.isna() & data['Age_Category'].notna(), 'Age_Category'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Age categories without a numeric mapping: {sorted(map(str, unmapped))}")

    # normalise the age to the range [0, 1]
    min_age = float(age_numeric.min())
    max_age = float(age_numeric.max())
    age_range = max_age - min_age

    if age_range == 0:
        # Every row has the same age: there is no range to scale by.
        normalised_age = [0.0] * len(age_numeric)
    else:
        normalised_age = ((age_numeric - min_age) / age_range).tolist()

    # Replace the category column by its normalised numeric version,
    # keeping the column in its original position.
    data = data.assign(Age_Category=normalised_age).rename(columns={'Age_Category': 'Age_Normalised'})

# diabetes transformation
# group diabetics and pre-diabetics together, group non-diabetics and gestational diabetics together
# NOTE(review): diabetes_binary is only computed here, it is not written back into `data`.
diabetes_binary = data['Diabetes'].isin(['Yes', 'No, pre-diabetes or borderline diabetes']).astype(int).tolist()


In [None]:
# TODO
# calculate/plot correlation coefficients

In [None]:
# Target: the heart disease column. Features: the columns listed below.
feature_columns = [
    'BMI', 'Weight_(kg)', 'Height_(cm)',
    'Exercise', 'Depression', 'Smoking_History',
    'Sex_Male', 'Skin_Cancer', 'Other_Cancer',
]
Y = data['Heart_Disease']
X = data[feature_columns]

In [None]:
# statsmodels does not include an intercept by default. Without a constant
# column the model is forced to predict a probability of 0.5 when every
# feature is 0, which biases all other coefficients.
model = sm.Logit(Y, sm.add_constant(X)).fit()

In [None]:
model.summary()