<a href="https://colab.research.google.com/github/isaumarusule/isaumarusule/blob/main/SEC7001(2205507).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# INSTALL LIBRARIES

!pip install mlxtend
!pip install pandas-profiling
!pip install -U scikit-learn
!pip install -U imbalanced-learn

In [None]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from imblearn.over_sampling import SMOTE
from mlxtend.plotting import plot_confusion_matrix
from pandas_profiling import ProfileReport
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import NearMiss
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
%matplotlib inline
warnings.filterwarnings("ignore")

In [None]:
# Load Dataset
# Reads the CSV into the working frame `df1`.
# NOTE(review): the path is absolute ("/content/..."), which looks like a Colab
# upload location — confirm the file is present there before running.
df1 = pd.read_csv("/content/diabetes_binary_health_indicators_BRFSS2015.csv")

In [None]:
# Dataset Shape
# Displays the (rows, columns) tuple of the loaded frame.
df1.shape

In [None]:
# Dataset Columns
# Displays the column labels of the loaded frame.
df1.columns

In [None]:
# Dataset Information
# Prints the per-column dtype / non-null summary of the frame.
df1.info()

In [None]:
# First (5) Rows of Dataset
# `head()` with no argument returns the first five rows.
df1.head()

In [None]:
# Dataset Description
# Summary statistics, transposed so each dataset column becomes one row.
df1.describe().T

In [None]:
# Check Dataset (Missing Values)
# Number of null entries in each column.
df1.isnull().sum()

In [None]:
# Check Dataset (Unique Values)
# Count the distinct (non-null) values of every column and show the result as
# a single-column table indexed by column name.
unique_values = {name: df1[name].nunique() for name in df1.columns}
pd.DataFrame(unique_values, index=['unique value count']).transpose()

In [None]:
# Check Dataset (Outliers)
# One horizontal box plot per listed feature, placed on a 4x2 grid
# (seven panels are used, the eighth slot stays empty).
outlier_columns = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
plt.figure(figsize=(20, 15))
for position, col in enumerate(outlier_columns, start=1):
    plt.subplot(4, 2, position)
    sns.boxplot(x=col, data=df1, palette='pastel')
plt.show()

In [None]:
# Check Dataset (Duplicate Rows)
# `duplicated()` flags every repeat of an earlier row; keep those rows for
# inspection and report how many there are.
duplicate_mask = df1.duplicated()
duplicates = df1[duplicate_mask]
print("Duplicate Rows : ", len(duplicates))
duplicates.head()

In [None]:
# Drop Duplicate Rows (Dataset)
# Keep the first occurrence of every row, then show the resulting shape.
df1 = df1.drop_duplicates()
df1.shape

In [None]:
# Visualize Columns (Dataset)
# Histogram grid of the frame's columns; the trailing `;` suppresses the
# array-of-axes repr in the cell output.
df1.hist(figsize=(20, 15), color='lightblue');

In [None]:
# Check Correlation (Dataset)
# Pairwise correlation matrix of the columns (pandas' default method).
df1.corr()

In [None]:
# Correlation Heatmap (Diabetes)
# Annotated heatmap of the pairwise correlations, using a diverging palette
# centred on zero so positive and negative correlations get opposite hues.
corr_matrix = df1.corr()
cmap = sns.diverging_palette(220, 20, as_cmap=True)
plt.figure(figsize=(20, 10))
sns.heatmap(corr_matrix, annot=True, cmap=cmap, center=0)
plt.title("Correlation Heatmap (Diabetes)")
plt.show()

In [None]:
# Correlation Bar Plot (Diabetes)
# Correlation of every feature with the target, one bar per feature.
correlation_with_diabetes = df1.drop('Diabetes_binary', axis=1).corrwith(df1['Diabetes_binary'])
# Blue for non-negative correlations, coral for negative ones.
colors = ['lightblue' if corr >= 0 else 'lightcoral' for corr in correlation_with_diabetes]
# Draw on an explicit Axes: calling `plt.figure()` first and then
# `Series.plot(figsize=...)` without `ax` opens a second figure and leaves the
# first one empty in the output.
fig, ax = plt.subplots(figsize=(20, 8))
correlation_with_diabetes.plot(kind='bar', grid=True, ax=ax, title="Correlation Plot (Diabetes)", color=colors)
plt.show()

In [None]:
# Diabetes Count Distribution in Dataset
# Row count per value of the target column, most frequent value first.
df1["Diabetes_binary"].value_counts()


In [None]:
# Create Duplicate (Dataset)
# Independent copy of df1; the cells below relabel df2's columns for plotting
# while df1 keeps the numeric codes.
df2 = df1.copy()

In [None]:
# Create Categorical Column (Diabetic/Non-Diabetic)
# Text version of the 0/1 target, stored next to the original column.
status_labels = {0: "Non-Diabetic", 1: "Diabetic"}
df2["Diabetes_Status"] = df2["Diabetes_binary"].replace(status_labels)
df2.head()

In [None]:
# Features Category (Dataset)
#
# Replace the numeric codes in df2 with readable labels for the charts below.
# Each column is rewritten with one `Series.replace` call and assigned back to
# df2. The earlier chained form (`df2.Age[mask] = ...`) wrote through an
# intermediate Series, which pandas warns about and which no longer updates
# df2 once Copy-on-Write is active. `replace` leaves values that are not in
# the mapping untouched, so re-running this cell is a no-op.

yes_no = {0: 'No', 1: 'Yes'}

category_labels = {
    'Age': {
        1: '18 to 24',
        2: '25 to 29',
        3: '30 to 34',
        4: '35 to 39',
        5: '40 to 44',
        6: '45 to 49',
        7: '50 to 54',
        8: '55 to 59',
        9: '60 to 64',
        10: '65 to 69',
        11: '70 to 74',
        12: '75 to 79',
        13: '80 or older',
    },
    'Diabetes_binary': {0: 'No Diabetes', 1: 'Diabetes'},
    'HighBP': {0: 'No High', 1: 'High BP'},
    'HighChol': {0: 'No High Cholesterol', 1: 'High Cholesterol'},
    'CholCheck': {0: 'No Cholesterol Check in 5 Years', 1: 'Cholesterol Check in 5 Years'},
    'Smoker': yes_no,
    'Stroke': yes_no,
    'HeartDiseaseorAttack': yes_no,
    'PhysActivity': yes_no,
    'Fruits': yes_no,
    'Veggies': yes_no,
    'HvyAlcoholConsump': yes_no,
    'AnyHealthcare': yes_no,
    'NoDocbcCost': yes_no,
    # BRFSS codes general health from 1 = excellent down to 5 = poor; the
    # previous mapping had the scale inverted (5 -> 'Excellent', 1 -> 'Poor').
    'GenHlth': {
        1: 'Excellent',
        2: 'Very Good',
        3: 'Good',
        4: 'Fair',
        5: 'Poor',
    },
    'DiffWalk': yes_no,
    'Sex': {0: 'Female', 1: 'Male'},
    # NOTE(review): these education labels are kept as written; check them
    # against the BRFSS EDUCA codebook wording if they are shown to readers.
    'Education': {
        1: 'Never Attended School',
        2: 'Elementary',
        3: 'Junior High School',
        4: 'Senior High School',
        5: 'Undergraduate Degree',
        6: 'Graduate Degree',
    },
    'Income': {
        1: 'Less Than $10,000',
        2: 'Less than $15,000',
        3: 'Less than $20,000',
        4: 'Less than $25,000',
        5: 'Less Than $35,000',
        6: 'Less than $50,000',
        7: 'Less Than $75,000',
        8: '$75,000 or More',
    },
}

for feature, code_to_label in category_labels.items():
    df2[feature] = df2[feature].replace(code_to_label)

In [None]:
# Pie Chart Distribution (Features)
def create_pie_chart(dataframe, x_column):
    """Return the value counts of `x_column` (the slice sizes of its pie chart)."""
    return dataframe[x_column].value_counts()

columns = ['Diabetes_binary','HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
           'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost',
           'DiffWalk', 'Sex']
fig, ax = plt.subplots(5, 3, figsize=(15, 15))  # Adjust the grid dimensions as needed
axes = ax.ravel()
# zip stops at the shorter of the two sequences, so surplus columns or axes
# are simply left out.
for axis, column in zip(axes, columns):
    counts = create_pie_chart(df2, column)
    axis.pie(counts, labels=counts.index, autopct='%1.1f%%', colors=['lightblue', 'lightcoral'])
    axis.set_title(column)
plt.tight_layout()
plt.show()

In [None]:
# Pivot Chart Distribution (Features)
def create_plot_pivot(dataframe, x_column):
    """Count rows per (`x_column` value, `Diabetes_binary` value) pair.

    The result is indexed by `x_column` with one column per `Diabetes_binary`
    value, which is the layout a stacked bar plot expects.
    """
    return dataframe.pivot_table(index=x_column, columns='Diabetes_binary', aggfunc='size')

columns = ['Diabetes_binary','HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
           'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost',
           'DiffWalk', 'Sex', 'Age', 'GenHlth', 'Education']
fig, ax = plt.subplots(3, 6, figsize=(20, 20))  # Adjust the grid dimensions as needed
axes = ax.ravel()
# Pair each feature with one panel; zip stops at the shorter sequence.
for axis, column in zip(axes, columns):
    pivot = create_plot_pivot(df2, column)
    pivot.plot(kind='bar', stacked=True, ax=axis, color=['lightcoral', 'lightblue'])
    axis.set_xlabel(column)
plt.tight_layout()
plt.show()

In [None]:
# Plot Chart Distribution (Features)
# Stacked bar charts for the features with many distinct values, on a roomier
# 2x2 grid. This reuses `create_plot_pivot` from the pivot-chart cell above
# instead of defining an identical copy of it here.
columns = ['Income', 'MentHlth', 'PhysHlth', 'BMI']
fig, ax = plt.subplots(2, 2, figsize=(20, 20))  # Adjust the grid dimensions as needed
axes = ax.ravel()
# Pair each feature with one panel; zip stops at the shorter sequence.
for axis, column in zip(axes, columns):
    create_plot_pivot(df2, column).plot(kind='bar', stacked=True, ax=axis, color=['lightcoral', 'lightblue'])
    axis.set_xlabel(column)
plt.tight_layout()
plt.show()

In [None]:
# Diabetes Count Distribution in Dataset
# Row count per text label of the derived status column.
df2["Diabetes_Status"].value_counts()

In [None]:
# Diabetes Bar Chart Distribution in Dataset
# One bar per status label, coloured with the same blue/coral pair used in
# the other charts.
status_palette = {"Non-Diabetic": "lightblue", "Diabetic": "Lightcoral"}
plt.figure(figsize=(8, 6))
status_ax = sns.countplot(data=df2, x="Diabetes_Status", palette=status_palette)
status_ax.set_title("Distribution of Diabetes Status")
status_ax.set_xlabel("Diabetes Status")
status_ax.set_ylabel("Count")
plt.show()

In [None]:
# Diabetes Pie Chart Percentage Distribution in Dataset
labels = ["Non-Diabetic", "Diabetic"]
# Order the counts by `labels` explicitly. value_counts() sorts by frequency,
# so pairing it with a hard-coded label list would mislabel (and explode the
# wrong slice) whenever "Diabetic" is the more frequent class.
sizes = df2["Diabetes_Status"].value_counts().reindex(labels, fill_value=0)
colors = ['Lightblue', 'Lightcoral']
explode = (0, 0.1)  # Explode the "Diabetic" slice
plt.figure(figsize=(6, 6))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140, explode=explode)
plt.title("Distribution of Diabetic and Non-Diabetics")
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()

In [None]:
# ANOVA TEST
# Features / target used by the feature-selection cells below.
# The target is picked by name rather than by position, so the split does not
# depend on `Diabetes_binary` being the first column, and it matches the
# name-based split used before modelling.

X = df1.drop("Diabetes_binary", axis=1)
Y = df1["Diabetes_binary"]

In [None]:
# ANOVA FEATURE SELECTION

# Score every feature against the target with the ANOVA F-test and keep the
# ten highest-scoring ones.
fs = SelectKBest(score_func=f_classif, k=10)
fs.fit(X, Y)
X_selected = fs.transform(X)
print(X_selected.shape)

In [None]:
# Create dataframe
# Preview the selected features. `get_support()` is the boolean mask of kept
# columns, so the preview carries the real feature names instead of 0..9.
pd.DataFrame(X_selected, columns=X.columns[fs.get_support()]).head(3)

In [None]:
# Chi Square
# Score every feature against the target with the chi-squared statistic
# (selector configured for the top 10 features).
BestFeatures = SelectKBest(score_func=chi2, k=10)
fit = BestFeatures.fit(X, Y)

# Feature name next to its score, highest score first
f_Scores = pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})
f_Scores = f_Scores.sort_values(by='Score', ascending=False)

# Display the dataframe with arranged scores
f_Scores

In [None]:
# Feature Importance (Selection)

# Highest-scoring feature first
sorted_scores = f_Scores.sort_values(by='Score', ascending=False)
# Bar colour (light coral)
bar_color = 'lightcoral'
# Bar chart of the chi-squared score per feature
fig, score_ax = plt.subplots(figsize=(10, 5))
score_ax.bar(sorted_scores['Feature'], sorted_scores['Score'], color=bar_color)
score_ax.set_xlabel('Feature')
score_ax.set_ylabel('Score')
score_ax.set_title('Features Score (Chi-Squared)')
# Slant the feature names so they do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# Display the plot
plt.show()

In [None]:
# Feature Selection (Importance)
# Print the 16 rows with the highest chi-squared score.

print(f_Scores.nlargest(16,'Score'))

In [None]:
# Drop Columns (Least Featured)
# Remove the lowest-scoring features before modelling. The result is assigned
# back (no in-place mutation) and already-missing columns are ignored, so the
# cell can be re-run without raising KeyError.
columns = ["NoDocbcCost", "Veggies", "Fruits", "CholCheck", "AnyHealthcare"]
df1 = df1.drop(columns=columns, errors="ignore")

In [None]:
# Drop Column (Diabetes_binary)
# Target is the Diabetes_binary column; every remaining column is a feature.
Y = df1["Diabetes_binary"]
X = df1.drop(columns="Diabetes_binary")

In [None]:
# Check Dataset Imbalance
# Row count per target class.
Y.value_counts()

In [None]:
# Sample Dataset
# Resample X/Y with imbalanced-learn's NearMiss (version 1, 10 neighbours)
# and show the shapes of the resampled features and target.
# NOTE(review): the resampling runs on the full X/Y, and the train/test split
# in the next cell is taken from the resampled data, so the held-out set is
# resampled too — confirm this is intended for the reported scores.
nm = NearMiss(version = 1 , n_neighbors = 10)
x_sm, y_sm= nm.fit_resample(X,Y)
x_sm.shape, y_sm.shape

In [None]:
# Train and Test Dataset
# 80/20 split of the resampled data; random_state=42 makes the split repeatable.
X_train , X_test , Y_train , Y_test = train_test_split(x_sm,y_sm, test_size=0.2 , random_state=42)

In [None]:
# Scale and Transform Dataset
# Fit the scaler on the training split only and reuse its statistics for the
# test split. Calling fit_transform on X_test would refit the scaler on test
# data, scaling the two splits differently and leaking test-set statistics
# into the evaluation.
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)
# Show the shapes of the scaled splits
X_train.shape, X_test.shape

In [None]:
# MODELLING

# Logistic Regression
# max_iter is raised to 1500 iterations for the solver.

lg = LogisticRegression(max_iter = 1500)
lg.fit(X_train , Y_train)

In [None]:
# Model Prediction
# Predict the test split and report mean accuracy on both splits.
y_pred = lg.predict(X_test)
train_score = lg.score(X_train, Y_train)
test_score = lg.score(X_test, Y_test)
print(f'Training set score: {train_score:.4f}')
print(f'Test set score: {test_score:.4f}')

In [None]:
# Classification Report
# Per-class precision / recall / F1 for the current `y_pred`.
report = classification_report(y_true=Y_test, y_pred=y_pred)
print(report)

In [None]:
# Evaluate Result (MAE)
# NOTE(review): assuming the labels are 0/1, this value is the share of
# misclassified test rows (1 - accuracy).
mae = mean_absolute_error(y_true=Y_test, y_pred=y_pred)
print(f'Mean Absolute Error : {mae}')

In [None]:
# Decision Tree model
# Depth is capped at 12. random_state is fixed (same seed as the train/test
# split) because scikit-learn permutes the candidate features at every split,
# so an unseeded tree can differ from run to run.

dt = DecisionTreeClassifier(max_depth=12, random_state=42)
dt.fit(X_train , Y_train)

In [None]:
# Model Prediction
# Predict the test split and report mean accuracy on both splits.
y_pred = dt.predict(X_test)
train_score = dt.score(X_train, Y_train)
test_score = dt.score(X_test, Y_test)
print(f'Training set score: {train_score:.4f}')
print(f'Test set score: {test_score:.4f}')

In [None]:
# Classification Report
# Per-class precision / recall / F1 for the current `y_pred`.
report = classification_report(y_true=Y_test, y_pred=y_pred)
print(report)

In [None]:
# Evaluate Result (MAE)
# NOTE(review): assuming the labels are 0/1, this value is the share of
# misclassified test rows (1 - accuracy).
mae = mean_absolute_error(y_true=Y_test, y_pred=y_pred)
print(f'Mean Absolute Error : {mae}')

In [None]:
# SVM
# Support-vector classifier with an RBF kernel and C=1.0.

clf = SVC(kernel='rbf', C=1.0)
# Train the model on the scaled training split
clf.fit(X_train, Y_train)

In [None]:
# Model Prediction
# Predict the test split and report mean accuracy on both splits.
y_pred = clf.predict(X_test)
train_score = clf.score(X_train, Y_train)
test_score = clf.score(X_test, Y_test)
print(f'Training set score: {train_score:.4f}')
print(f'Test set score: {test_score:.4f}')

In [None]:
# Classification Report
# Per-class precision / recall / F1 for the current `y_pred`.
report = classification_report(y_true=Y_test, y_pred=y_pred)
print(report)

In [None]:
# Evaluate Result (MAE)
# NOTE(review): assuming the labels are 0/1, this value is the share of
# misclassified test rows (1 - accuracy).
mae = mean_absolute_error(y_true=Y_test, y_pred=y_pred)
print(f'Mean Absolute Error : {mae}')

In [None]:
# XGBoost
# Gradient-boosted classifier configured with eval_metric 'error' and a
# learning rate of 0.1.

xg = XGBClassifier(eval_metric= 'error', learning_rate= 0.1)
xg.fit(X_train , Y_train)

In [None]:
# Model Prediction
# Predict the test split and report mean accuracy on both splits.
y_pred = xg.predict(X_test)
train_score = xg.score(X_train, Y_train)
test_score = xg.score(X_test, Y_test)
print(f'Training set score: {train_score:.4f}')
print(f'Test set score: {test_score:.4f}')

In [None]:
# Classification Report
# Per-class precision / recall / F1 for the current `y_pred`.
report = classification_report(y_true=Y_test, y_pred=y_pred)
print(report)

In [None]:
# Evaluate Result (MAE)
# NOTE(review): assuming the labels are 0/1, this value is the share of
# misclassified test rows (1 - accuracy).
mae = mean_absolute_error(y_true=Y_test, y_pred=y_pred)
print(f'Mean Absolute Error : {mae}')