# Exploratory Data Analysis

Libraries

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import accuracy_score, classification_report 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import learning_curve
import joblib 
import seaborn as sns 
import warnings 
import matplotlib.pyplot as plt 

warnings.filterwarnings("ignore")

Load the Dataset

In [None]:
file_path = "hypertension_dataset.csv"
df = pd.read_csv(file_path)

Describe the dataset

In [None]:
df.describe()

In [None]:
df.info()

Checking Data Types

In [None]:
df.dtypes

Implementing One Hot Coding to Categorical Values

In [None]:
# Define mapping for categorical columns
category_mappings = {
    'Smoking_Status': {'Never': 0, 'Current': 1},
    'Physical_Activity_Level': {'Low': 0, 'Moderate': 1, 'High': 2},
    'Family_History': {'No': 0, 'Yes': 1},
    'Gender': {'Female': 0, 'Male': 1},
    'Education_Level': {'Primary': 0, 'Secondary': 1, 'Tertiary': 2},
    'Employment_Status': {'Unemployed': 0, 'Employed': 1, 'Retired': 2},
    'Hypertension': {'Low': 0, 'High': 1}
}

# Apply the mappings
df_mapped = df.copy()

for column, mapping in category_mappings.items():
    if column in df_mapped.columns:
        df_mapped[column] = df_mapped[column].map(mapping)

# Display the mapped dataframe
df_mapped.head()


Checking Column Names

In [None]:
df.columns

Checking for Unique Values

In [None]:
df.nunique()

Checking for Missing Values

In [None]:
missing_values = df.isnull().sum()
missing_values

Checking for Duplicate Values

In [None]:
duplicated_values = df.duplicated().sum()
duplicated_values

df = df.drop_duplicates()


Checking for Outliers

In [None]:
# Visualize outliers using boxplots
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_columns:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot for {col}')
    plt.show()

In [None]:
print("Proportion of Hypertension Values")
df['Hypertension'].value_counts(normalize=True)


Feature Extraction

In [None]:
# 1. Create Age Grouping (bins)
bins = [0, 30, 45, 60, 100]
labels = ['Young', 'Middle-Aged', 'Senior', 'Very Senior']
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels)

# 2. Create BMI Categories (bins)
bmi_bins = [0, 18.5, 24.9, 29.9, 40, 100]
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese', 'Very Obese']
df['BMI_Category'] = pd.cut(df['BMI'], bins=bmi_bins, labels=bmi_labels)

# 3. Create Cholesterol to Age Ratio
df['Cholesterol_to_Age'] = df['Cholesterol'] / df['Age']

# 4. Interaction Features (Systolic_BP * Diastolic_BP)
df['BP_Interaction'] = df['Systolic_BP'] * df['Diastolic_BP']

# 5. Interaction Feature (BMI * Cholesterol)
df['BMI_Cholesterol_Interaction'] = df['BMI'] * df['Cholesterol']

# 6. Polynomial Features for Cholesterol (degree 2)
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
cholesterol_poly = poly.fit_transform(df[['Cholesterol']])

# Convert to DataFrame and add it to the dataset
cholesterol_poly_df = pd.DataFrame(cholesterol_poly, columns=poly.get_feature_names_out(['Cholesterol']))
df = pd.concat([df, cholesterol_poly_df], axis=1)

Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])


Correlation Analysis

In [None]:
# Select only numerical columns
numerical_columns = df.select_dtypes(include=['float64', 'int64'])

# Compute the correlation matrix
correlation_matrix = numerical_columns.corr()

# Plot the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# Training the Data


#### Splitting the Data

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target (y)
X = df.drop('Hypertension', axis=1)  # Drop the target variable
y = df['Hypertension']  # Target variable

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Standardize Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Select only numerical columns
numerical_columns = X_train.select_dtypes(include=['float64', 'int64']).columns

# Standardize the numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_columns])
X_test_scaled = scaler.transform(X_test[numerical_columns])

# If you need to reattach the non-numeric columns (e.g., categorical) to the scaled data:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=numerical_columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=numerical_columns, index=X_test.index)

# If you want to combine with non-numeric columns later (e.g., one-hot encoded columns), you can do this:
X_train_final = pd.concat([X_train_scaled_df, X_train.drop(columns=numerical_columns)], axis=1)
X_test_final = pd.concat([X_test_scaled_df, X_test.drop(columns=numerical_columns)], axis=1)

#### Perform Chi-Square Test (Categorical)

In [53]:
from scipy.stats import chi2_contingency

# Iterate over each column in the dataset
for column in df.columns:
    if column != 'Hypertension':  # Skip the target variable itself
        # Check if the column is categorical
        if df[column].apply(type).iloc[0] == str:  # Check if the column is of string type (categorical data)
            # Create a contingency table for the categorical feature and the target variable
            contingency_table = pd.crosstab(df[column], df['Hypertension'])

            # Perform Chi-Square test
            chi2, p, dof, expected = chi2_contingency(contingency_table)

            # Check p-value to determine if the feature is statistically significant
            if p < 0.05:
                print(f"The feature '{column}' is important (p < 0.05).")
            else:
                print(f"The feature '{column}' is not significant (p >= 0.05).")

The feature 'Country' is not significant (p >= 0.05).
The feature 'Smoking_Status' is not significant (p >= 0.05).
The feature 'Physical_Activity_Level' is not significant (p >= 0.05).
The feature 'Family_History' is not significant (p >= 0.05).
The feature 'Diabetes' is not significant (p >= 0.05).
The feature 'Gender' is not significant (p >= 0.05).
The feature 'Education_Level' is not significant (p >= 0.05).
The feature 'Employment_Status' is not significant (p >= 0.05).
The feature 'Age_Group' is not significant (p >= 0.05).
The feature 'BMI_Category' is not significant (p >= 0.05).


####  Perform Statistical Tests (Continuous)

In [None]:
from scipy.stats import f_oneway, kruskal

# Function to perform ANOVA for continuous variables
def anova_test(column, target):
    # Group data by target categories
    groups = [column[target == category] for category in target.unique()]
    
    # Perform ANOVA (F-test)
    stat, p = f_oneway(*groups)
    
    # Return the p-value
    return p

# Function to perform Kruskal-Wallis for non-normally distributed continuous variables
def kruskal_test(column, target):
    # Group data by target categories
    groups = [column[target == category] for category in target.unique()]
    
    # Perform Kruskal-Wallis test
    stat, p = kruskal(*groups)
    
    # Return the p-value
    return p

# Iterate over each column to check for continuous variables
for column in df.select_dtypes(include=['float64', 'int64']).columns:  # Select only continuous columns
    if column != 'Hypertension':  # Skip the target variable
        # Perform ANOVA or Kruskal-Wallis test based on the distribution
        p_value = None
        skew_value = df[column].skew()  # Get the skewness of the column
        
        # Ensure skew_value is scalar (only one value for each column)
        if isinstance(skew_value, float):  # Check that skew_value is a scalar
            # Check if skewness is less than 1, indicating normal distribution
            if abs(skew_value) < 1:  # If skew is not high, assume normal distribution for ANOVA
                p_value = anova_test(df[column], df['Hypertension'])
                test_type = "ANOVA"
            else:
                p_value = kruskal_test(df[column], df['Hypertension'])
                test_type = "Kruskal-Wallis"
        
            # Check p-value to determine if the feature is statistically significant
            if p_value < 0.05:
                print(f"The feature '{column}' is important using {test_type} (p < 0.05).")
            else:
                print(f"The feature '{column}' is not significant using {test_type} (p >= 0.05).")
        else:
            print(f"Skewness value for '{column}' is not a scalar. Skipping the test.")

The feature 'Age' is not significant using ANOVA (p >= 0.05).
The feature 'BMI' is not significant using ANOVA (p >= 0.05).


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

#### Train the Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize the logistic regression model
model = LogisticRegression()

# Train the model on the training data
model.fit(X_train_scaled, y_train)


#### Evaluate the Model

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Monte Carlo Simulation for Model Evaluation

#### Monte Carlo Simulation Process

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Number of simulations
n_iterations = 100
accuracy_scores = []

# Initialize the scaler
scaler = StandardScaler()

# Logistic Regression model
model = LogisticRegression()

# Assuming 'X' is your feature set and 'y' is your target variable
for _ in range(n_iterations):
    # Randomly split the data
    X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(X, y, test_size=0.2, random_state=np.random.randint(100))

    # Identify numerical columns
    numerical_columns = X_train_mc.select_dtypes(include=['float64', 'int64']).columns
    
    # Apply scaling to numerical columns and one-hot encoding to categorical columns
    column_transformer = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_columns),
            ('cat', OneHotEncoder(), X_train_mc.select_dtypes(include=['object']).columns)
        ])

    # Fit and transform the training data and transform the test data
    X_train_transformed = column_transformer.fit_transform(X_train_mc)
    X_test_transformed = column_transformer.transform(X_test_mc)
    
    # Train the logistic regression model
    model.fit(X_train_transformed, y_train_mc)
    
    # Make predictions and evaluate the accuracy
    y_pred_mc = model.predict(X_test_transformed)
    accuracy_scores.append(accuracy_score(y_test_mc, y_pred_mc))

# Analyze the results
mean_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)

print(f"Mean Accuracy: {mean_accuracy:.4f}")
print(f"Standard Deviation of Accuracy: {std_accuracy:.4f}")