In [2]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
from scipy.stats import pearsonr
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree, DecisionTreeClassifier


# Classification

In [None]:
# Read the diabetes health-indicators CSV (a 50/50 class split, going by the
# file name) and preview the first five rows.
df = pd.read_csv(
    filepath_or_buffer="diabetes_binary_5050split_health_indicators_BRFSS2015.csv"
)
df.head(5)


In [3]:
# Column dtypes / non-null counts; info() prints its report directly.
df.info()

# A notebook cell only renders its LAST expression automatically, so the
# summary statistics are shown explicitly instead of being silently discarded.
display(df.describe())

# Class balance of the target column.
df['Diabetes_binary'].value_counts()

NameError: name 'df' is not defined

In [None]:
# Names of all float64/int64 columns (re-used by the boxplot cell below).
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# One histogram per numeric column, 20 bins each, drawn on a shared figure.
df.loc[:, numeric_cols].hist(
    figsize=(12, 10),
    bins=20,
    edgecolor='black',
)
plt.suptitle('Feature Distributions', fontsize=14)
plt.show()


In [None]:
# Boxplots of the first six numeric columns, laid out on a 2 x 3 grid.
plt.figure(figsize=(14, 8))
for i, col in enumerate(numeric_cols[:6], start=1):
    ax = plt.subplot(2, 3, i)
    sns.boxplot(y=df[col], color='skyblue', ax=ax)
    ax.set_title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

### Logistic Regression

In [None]:
# 1) Split: separate the target from the predictors and hold out a stratified
#    30% test set.
X = df.drop("Diabetes_binary", axis=1)
y = df["Diabetes_binary"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 2) Pipeline = scaling + logistic regression.
#    The scaler lives inside the pipeline, so it is re-fitted on whatever data
#    the pipeline is fitted on (each CV training fold, then the training split).
#    Pipeline and LogisticRegression must be imported in the import cell.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(solver="lbfgs", max_iter=1000))
])

# 3) Stratified 5-fold CV over the full dataset
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipe, X, y, cv=cv, scoring="accuracy")

print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy: %.3f ± %.3f" % (cv_scores.mean(), cv_scores.std()))

# 4) Fit on the training split and score on the held-out test split
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("Test Accuracy: %.3f" % accuracy_score(y_test, y_pred))


### Decision Tree

In [None]:
# Constants first: TARGET_COL has to exist before it is used to index `df`
# (the original order raised NameError on a fresh kernel).
TARGET_COL = 'Diabetes_binary'
SEED = 42

# Class balance of the target.
display(df[TARGET_COL].value_counts())

X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL]

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

X_train.shape, X_test.shape


((56553, 21), (14139, 21))


# Decision tree with no depth limit; split/leaf settings are spelled out explicitly.
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=SEED
)

# 10-fold cross-validation. cross_val_score fits clones of the estimator, so
# `clf` itself is still unfitted after this call.
cv = KFold(n_splits=10, shuffle=True, random_state=SEED)
cv_scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')

print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy: %.4f ± %.4f" % (cv_scores.mean(), cv_scores.std()))

# Fit on the training split and score on the held-out split. Without this the
# train/test split above is never used and `clf` cannot be plotted or used to
# predict later on.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test Accuracy: %.4f" % accuracy_score(y_test, y_pred))



In [None]:
# plot_tree needs a fitted estimator; cross-validation only fits clones, so
# fit `clf` on the training split here if that has not happened yet.
if not hasattr(clf, "tree_"):
    clf.fit(X_train, y_train)

# Only the top levels are drawn to keep the figure readable.
viz_depth = 3
plt.figure(figsize=(14, 8))
plot_tree(
    clf,
    # A plain list of names is accepted by every scikit-learn version,
    # whereas a pandas Index is rejected by some releases.
    feature_names=list(X.columns),
    class_names=[str(c) for c in sorted(y.unique())],
    max_depth=viz_depth,
    filled=True,
    fontsize=8
)
plt.title(f"Decision Tree (max_depth visualized = {viz_depth})")
plt.show()

### Support Vector Machine

In [None]:
# Split dataset into training and testing sets (stratified, 70/30).
# X and y are the feature frame / target series defined in an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Perform feature scaling: statistics are learned from the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Run the model: RBF-kernel support vector classifier.
# SVC must be imported in the import cell (only SVR was imported originally).
Svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
Svm_model.fit(X_train_scaled, y_train)
y_pred = Svm_model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))

In [4]:
# K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The SVC was trained on standardised features, so cross-validating it on the
# raw X would evaluate a different setup. Wrapping scaler + model in a pipeline
# standardises every fold using only that fold's training portion.
svm_cv_pipeline = make_pipeline(StandardScaler(), Svm_model)
scores = cross_val_score(svm_cv_pipeline, X, y, cv=kf, scoring='accuracy')

print("Cross-validation scores:", scores)
print("Mean CV accuracy:", np.mean(scores))

### Multi-Layer Perceptron Neural Network

In [None]:
# Stratified train-test split (70/30)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Build a pipeline: standardise the inputs, then a single hidden layer of 10
# ReLU units trained with Adam.
# MLPClassifier must be imported in the import cell (only MLPRegressor was
# imported originally).
mlp_pipeline = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(10,), activation='relu',
                  solver='adam', random_state=1, max_iter=300)
)

# Cross-validation: stratified 5-fold over the full dataset
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(mlp_pipeline, X, y, cv=cv, scoring="accuracy")

print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy: %.3f ± %.3f" % (cv_scores.mean(), cv_scores.std()))

# Fit on the training split and report accuracy on the held-out test split
mlp_pipeline.fit(X_train, y_train)
y_pred = mlp_pipeline.predict(X_test)
print("Test Accuracy: %.3f" % accuracy_score(y_test, y_pred))


# Regression


In [None]:
# Read the semicolon-separated red-wine quality file; index_col=False stops
# pandas from using the first column as the index.
data = pd.read_csv(
    'winequality-red.csv',
    sep=';',
    index_col=False,
)
data.head(5)

In [None]:
# Data preprocessing
# Describe the data. Columns with possible outliers: free sulphur dioxide,
# total sulfur dioxide, residual sugar.
# describe() is not the last expression of this cell, so it is rendered
# explicitly; as a bare statement its result would be discarded.
display(data.describe())

# Use box plots to check the value range of every column (one subplot each).
data_mean = data.iloc[:, :]  # every row and column of `data`; variable name kept as in the original cell
data_mean.plot(kind='box', subplots=True, layout=(8,4), sharex=False,
               sharey=False, fontsize=12, figsize=(15,20));

In [None]:
# Compare the features' data ranges on one shared axis.
# The columns are selected by name rather than by the positional slice
# iloc[:, 1:12], which skipped the first column and, depending on column order,
# could include the target.
# NOTE(review): assumes 'quality' is the only non-feature column — confirm.
fig, ax = plt.subplots(1, figsize=(20, 8))
sns.boxplot(data=data.drop('quality', axis=1), ax=ax);

In [None]:
# Feature Normalisation
# Target as an (n_samples, 1) column vector; every other column is a predictor.
y = data['quality'].values[:, np.newaxis]
X = data.drop(columns='quality')

# Standardise each predictor (StandardScaler) and plot the scaled ranges.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
fig, ax = plt.subplots(nrows=1, figsize=(20, 8))
sns.boxplot(data=X_scaled, ax=ax)

# Append a constant column of ones as the LAST column (intercept term for the
# hand-written linear regression).
ones_column = np.ones((X_scaled.shape[0], 1))
X_biased = np.hstack([X_scaled, ones_column])

### Linear Regression

In [None]:
# Gradient-descent settings: learning rate and number of full-batch updates.
alpha, max_iterations = 0.1, 1000
# Row and column counts of the bias-augmented design matrix.
m, n = X_biased.shape

# Gradient Descent
def gradient_descent(X, y, alpha, max_iterations, shape=None, rng=None):
    """Fit linear-regression weights by full-batch gradient descent on the MSE.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Design matrix. Add a column of ones beforehand if an intercept is wanted.
    y : array-like of shape (m,) or (m, 1)
        Targets. They are reshaped to a column vector internally; without that,
        a 1-D ``y`` would broadcast ``X.dot(w) - y`` to an (m, m) matrix and
        silently produce weights of the wrong shape.
    alpha : float
        Learning rate applied to every update.
    max_iterations : int
        Number of gradient steps to take.
    shape : tuple of (m, n), optional
        Number of rows and columns of ``X``. Defaults to ``X.shape``.
    rng : numpy.random.Generator, optional
        Source for the random initial weights. When omitted, NumPy's global
        random state is used, as before.

    Returns
    -------
    numpy.ndarray of shape (n, 1)
        The weights after ``max_iterations`` updates.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    m, n = X.shape if shape is None else shape

    # Random standard-normal starting point.
    if rng is None:
        w = np.random.randn(n, 1)
    else:
        rng_draw = rng.standard_normal((n, 1))
        w = rng_draw

    # Loop-invariant pieces of the gradient 2/m * X^T (Xw - y).
    X_t = X.T
    scale = 2 / m

    for _ in range(max_iterations):
        gradients = scale * X_t.dot(X.dot(w) - y)
        w = w - alpha * gradients

    return w

# K-Fold Cross Validation: 5 shuffled folds over the bias-augmented features.
kf = KFold(n_splits=5, shuffle=True, random_state=15)
mse_scores = []

for train_index, val_index in kf.split(X_biased):
    # Rows belonging to this fold's training and validation parts.
    X_train, y_train = X_biased[train_index], y[train_index]
    X_val, y_val = X_biased[val_index], y[val_index]

    # Learn the weights on the training part only.
    w = gradient_descent(X_train, y_train, alpha, max_iterations, X_train.shape)

    # Predictions are rounded to the nearest integer before being scored.
    y_predicted = np.round(X_val @ w)
    mse = mean_squared_error(y_val, y_predicted)
    mse_scores.append(mse)

print("Average MSE:", np.mean(mse_scores))


### Support Vector Machine

In [None]:
# 5-fold cross-validation of an RBF-kernel SVR on the standardised features.
kf = KFold(n_splits=5, shuffle=True, random_state=15)
mse_scores = []

for train_index, val_index in kf.split(X_scaled):
    # Rows belonging to this fold's training and validation parts.
    X_train, y_train = X_scaled[train_index], y[train_index]
    X_val, y_val = X_scaled[val_index], y[val_index]

    # Initialize and train the SVM regressor; ravel() flattens y to 1-D.
    svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
    svr_model = svr_model.fit(X_train, y_train.ravel())

    # Predict (rounded to the nearest integer) and evaluate.
    y_pred = np.round(svr_model.predict(X_val))
    mse = mean_squared_error(y_val, y_pred)
    mse_scores.append(mse)

print("Average MSE (SVR):", np.mean(mse_scores))

### Decision Tree

In [None]:
# Set up K-Fold: 5 shuffled folds over the unscaled feature frame.
kf = KFold(n_splits=5, shuffle=True, random_state=15)
mse_scores = []

for train_index, val_index in kf.split(X):
    # X is a DataFrame (positional .iloc); y is a NumPy array (plain indexing).
    X_train, y_train = X.iloc[train_index], y[train_index]
    X_val, y_val = X.iloc[val_index], y[val_index]

    # Initialize and train a depth-limited Decision Tree Regressor.
    dt_model = DecisionTreeRegressor(max_depth=5, random_state=15)
    dt_model = dt_model.fit(X_train, y_train)

    # Predictions are rounded to the nearest integer before being scored.
    y_pred = np.round(dt_model.predict(X_val))
    mse = mean_squared_error(y_val, y_pred)
    mse_scores.append(mse)

print("Average MSE (Decision Tree):", np.mean(mse_scores))

### Multi-Layer Perceptron Neural Network

In [None]:
# 5-fold cross-validation of an MLP regressor on the standardised features.
kf = KFold(n_splits=5, shuffle=True, random_state=15)
mse_scores = []

for train_index, val_index in kf.split(X_scaled):
    # Rows belonging to this fold's training and validation parts.
    X_train, y_train = X_scaled[train_index], y[train_index]
    X_val, y_val = X_scaled[val_index], y[val_index]

    # Initialize and train the MLP Regressor: one hidden layer of 100 ReLU
    # units, Adam solver; ravel() passes the target as a 1-D array.
    mlp_model = MLPRegressor(
        hidden_layer_sizes=(100,),
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=15,
    )
    mlp_model = mlp_model.fit(X_train, y_train.ravel())

    # Predict (rounded to the nearest integer) and evaluate.
    y_pred = np.round(mlp_model.predict(X_val))
    mse = mean_squared_error(y_val, y_pred)
    mse_scores.append(mse)

print("Average MSE (MLP):", np.mean(mse_scores))