## Data Analysis


        - 


In [None]:
# package imports go here
import pandas as pd
import numpy as np
import fastparquet as fp
import os
import sys
import pickle
import matplotlib.pyplot as plt
import importlib

sys.path.insert(1, 'pkgs')
import ml_analysis as mlanlys

In [None]:
# Location of the cleaned dataset for the survey year under analysis
year = 2021
source_path = "data/"
final_clean_file = f"{source_path}brfss_{year}_clean_final.parquet.gzip"

In [None]:
# Load the final cleaned dataset from its parquet file via the fastparquet engine
df = pd.read_parquet(path=final_clean_file, engine="fastparquet")

In [None]:
df

In [None]:
df.columns

In [None]:
# Tick labels passed to correlation_matrix(); the order here is the order in
# which they are drawn along both axes of the heat map.
diabetes_labels = [
    'general_health', 'days_health_not_good', 'days_mental_not_good',
    'primary_insurance_source', 'personal_provider', 'years_since_checkup',
    'exercise', 'told_high_blood_pressure', 'year_chol_chckd',
    'high_cholesteral', 'high_cholesteral_meds', 'ever_heart_disease',
    'ever_stroke', 'depressive_disorder', 'ekidney_disease', 'diabetes',
    'marital', 'education_level', 'own_home', 'employment', 'income_level',
    'weight-lbs', 'hearing', 'sight', 'diffwalk', 'flu_shot', 'race_grp5',
    'good_health', 'have_insurance', 'activity_level', 'heart_chd_mi',
    'race_grp8', 'race_grp8a', 'race_grp5a', 'sex', 'age_5yr',
    'weight_kilos', 'body_mass_index', 'BMI_cats', 'ceduc_cats',
    'income _cats', 'smoking_status', 'smoking_yn', 'ecig_yn',
    'drinks_week', 'drinks_cats', 'fruit_juice', 'fruit_times_perday',
    'darkG_vege', 'french_fry', 'potato', 'other_vege', 'tot_fruits',
    'tot_vege', 'fruit_1_yn',
]

In [None]:
def correlation_matrix(df, labels):
    """Draw a heat map of the pairwise correlations between the columns of ``df``.

    Args:
        df: DataFrame whose ``df.corr()`` matrix is plotted.
        labels: Tick labels, expected in the same order as the rows/columns
            of ``df.corr()``. Labels beyond the size of the matrix are ignored.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    from matplotlib import pyplot as plt

    corr = df.corr()

    # Never place more labels than the matrix has rows/columns, otherwise the
    # extra ticks would stretch the axes past the image.
    tick_labels = list(labels)[:corr.shape[0]]
    tick_positions = list(range(len(tick_labels)))

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap
    # still accepts the number of colour levels as its second argument.
    cmap = plt.get_cmap('jet', 30)
    cax = ax1.imshow(corr, interpolation="nearest", cmap=cmap)
    ax1.grid(True)
    plt.title('Diabetes Feature Correlation')

    # Pin one tick per feature before labelling; without fixed positions the
    # labels are attached to whatever ticks the automatic locator picked and
    # do not line up with the matrix cells.
    ax1.set_xticks(tick_positions)
    ax1.set_yticks(tick_positions)
    ax1.set_xticklabels(tick_labels, fontsize=6, rotation=90)
    ax1.set_yticklabels(tick_labels, fontsize=6)

    # Let the colourbar choose ticks across the full plotted range instead of
    # labelling only 0.75-1.0.
    fig.colorbar(cax)
    plt.show()

correlation_matrix(df, diabetes_labels)

**Note:** Several features appear to be strongly correlated with each other, i.e. they carry similar information.

# Check the cleaned dataset

In [None]:
feature_df = df.copy()

In [None]:
num_top = 10
target = 'diabetes'

# Summary statistics; printed explicitly because a bare expression in the
# middle of a cell is evaluated and then discarded.
print(feature_df.describe())

# Class distribution of the target column
print(f"Diabetes value counts:\n{feature_df[target].value_counts()}")

# Correlation of every column with the target. The target's correlation with
# itself is dropped up front so that both rankings below contain exactly
# num_top features.
correlations = feature_df.corrwith(feature_df[target]).drop(labels=[target], errors='ignore')

desc_sorted_correlations = correlations.sort_values(ascending=False)
asc_sorted_correlations = correlations.sort_values()
desc_top_correlations = desc_sorted_correlations.head(num_top)
asc_top_correlations = asc_sorted_correlations.head(num_top)

print("\nTop", num_top, "positive correlations:")
print(desc_top_correlations)

print("\nTop", num_top, "negative correlations:")
print(asc_top_correlations)

**Note:** No individual feature appears to be strongly correlated with diabetes on its own.

In [None]:
# Pairwise correlation matrix of the columns of df.
# NOTE(review): this assignment rebinds the name `correlation_matrix`, which an
# earlier cell defines as a plotting function. Once this cell has run, that
# function is not callable again until its defining cell is re-executed.
correlation_matrix = df.corr()

In [None]:
# Flatten the correlation matrix into a Series indexed by (feature_a, feature_b)
correlation_series = correlation_matrix.unstack()

# A correlation matrix is symmetric, so (a, b) and (b, a) hold the same value.
# Keeping only the pairs whose first label sorts before the second removes
# those mirror-image rows and the self-correlations in a single filter.
# Assumes the column labels are mutually comparable (e.g. all strings).
first_feature = correlation_series.index.get_level_values(0)
second_feature = correlation_series.index.get_level_values(1)
unique_pairs = correlation_series[first_feature < second_feature]

# Build a fresh frame (rather than filtering an existing one) so that adding
# a column below cannot raise SettingWithCopyWarning.
correlation_df = unique_pairs.to_frame(name='Correlation')

# Rank by strength of the relationship regardless of its sign
correlation_df['AbsoluteCorrelation'] = correlation_df['Correlation'].abs()
sorted_correlation_df = correlation_df.sort_values(by='AbsoluteCorrelation', ascending=False)

# Show the 40 most strongly related feature pairs
sorted_correlation_df.head(40)


**Note:** Based on the inter-correlation above, we can eliminate some columns

---

### Eliminate correlated features

In [None]:
# TBD

### Run initial Tests

In [None]:
from sklearn.datasets import make_regression, make_swiss_roll
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Separate the target column from the predictors; drop() returns a new frame,
# so df itself is left unchanged.
y = df['diabetes']
X = df.drop(columns=["diabetes"])

In [None]:
# Create Train Test Split
# random_state makes the shuffle reproducible between runs; stratify=y keeps
# the class proportions of the imbalanced target the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Learn the scaling parameters from the training split only and scale it
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Apply the parameters learned on the training split to the test split
X_test_scaled = scaler.transform(X_test)

In [None]:
# Bundle each variant of the split as [X_train, X_test, y_train, y_test];
# both bundles share the same target splits.
data_raw = [X_train, X_test, y_train, y_test]
data_scaled = [X_train_scaled, X_test_scaled, y_train, y_test]


In [None]:
# reload any changes to mlanlys
importlib.reload(mlanlys)

Find KNN n_neighbors value

In [None]:
#mlanlys.knn_plot(data_scaled)

---

**Note:** From the knn plot above, we will pick a value of 3.

---

### Run analysis on Raw cleaned data.
-  No Scaling of data

In [None]:
# # reload any changes to mlanlys
# importlib.reload(mlanlys)

# k_value = 3
# mlanlys.run_classification_models(data_raw, k_value)

---

### Run analysis on cleaned data that has been scaled
-  Scaled data

In [None]:
# importlib.reload(mlanlys)

# k_value = 3
# mlanlys.run_classification_models(data_scaled, k_value)

---

### 

### Balance data using RandomUnderSampler
- May need to use a binary (0/1) diabetes target instead of the three-class (0/1/2) target, since class 2 has only 3,970 samples.

In [None]:
y_train.value_counts()

In [None]:
# # Import RandomUnderSampler from imblearn
# from imblearn.under_sampling import RandomUnderSampler

# # Instantiate the RandomUnderSampler instance
# rus = RandomUnderSampler(random_state=1)

# # Fit the data to the model
# X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)

In [None]:
# Count distinct resampled values
# The undersampling cell names its output y_undersampled; y_resampled is not
# defined anywhere in the visible notebook, so referencing it raises NameError.
# Requires the RandomUnderSampler cell to have been enabled and run first.
y_undersampled.value_counts()

### Evaluate overfitting
- Review scores from above
- See if balancing the data helped with overfitting.

### Improve scores with Hyperparameter Tuning
- Pick best model
- Try GridSearchCV and RandomizedSearchCV

#### GridSearchCV


#### RandomizedSearchCV

### Final Parameter Settings and score

## To Do

- correlation work in features (inter-correlation)
- handle unbalanced data
    - split the data into equal-sized groups for respondents who have and do not have diabetes
- automate the model
- scaling
- optimization
    - ????????

- Encoding
    from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

- sklearn - Model Evaluation
    - [Cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html)
    - [metrics](https://scikit-learn.org/stable/modules/model_evaluation.html)

# Test Train Split
from sklearn.model_selection import train_test_split

from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

