In [1]:
# Third-party libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Project-local helpers
#import pipeline_utilities_v2 as p_utils
# NOTE(review): both imports below bind the same name; the second one
# (pipeline_utilities_v2_copy) shadows the first.
from pipeline_utilities_v2 import model_generator
from pipeline_utilities_v2_copy import model_generator

In [2]:
# Read the dataset CSV from the UCI archive into a dataframe and preview its first rows
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/static/public/329/data.csv',
)
df.head(n=5)

Unnamed: 0,quality,pre_screening,ma1,ma2,ma3,ma4,ma5,ma6,exudate1,exudate2,exudate3,exudate3.1,exudate5,exudate6,exudate7,exudate8,macula_opticdisc_distance,opticdisc_diameter,am_fm_classification,Class
0,1,1,22,22,22,19,18,14,49.895756,17.775994,5.27092,0.771761,0.018632,0.006864,0.003923,0.003923,0.486903,0.100025,1,0
1,1,1,24,24,22,18,16,13,57.709936,23.799994,3.325423,0.234185,0.003903,0.003903,0.003903,0.003903,0.520908,0.144414,0,0
2,1,1,62,60,59,54,47,33,55.831441,27.993933,12.687485,4.852282,1.393889,0.373252,0.041817,0.007744,0.530904,0.128548,0,1
3,1,1,55,53,53,50,43,31,40.467228,18.445954,9.118901,3.079428,0.840261,0.272434,0.007653,0.001531,0.483284,0.11479,0,0
4,1,1,44,44,44,41,39,27,18.026254,8.570709,0.410381,0.0,0.0,0.0,0.0,0.0,0.475935,0.123572,0,1


In [3]:
# Inspect the dataframe: number of rows and columns, non-null counts, and column dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1151 entries, 0 to 1150
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   quality                    1151 non-null   int64  
 1   pre_screening              1151 non-null   int64  
 2   ma1                        1151 non-null   int64  
 3   ma2                        1151 non-null   int64  
 4   ma3                        1151 non-null   int64  
 5   ma4                        1151 non-null   int64  
 6   ma5                        1151 non-null   int64  
 7   ma6                        1151 non-null   int64  
 8   exudate1                   1151 non-null   float64
 9   exudate2                   1151 non-null   float64
 10  exudate3                   1151 non-null   float64
 11  exudate3.1                 1151 non-null   float64
 12  exudate5                   1151 non-null   float64
 13  exudate6                   1151 non-null   float

In [4]:
# Count the rows in each target class to judge whether the target is balanced or imbalanced.
df['Class'].value_counts()

Class
1    611
0    540
Name: count, dtype: int64

The target column "Class" is approximately balanced (611 rows in class 1 vs. 540 rows in class 0).

In [5]:
# Additional columns whose value counts are reported below
columns_to_count = ['quality', 'pre_screening', 'am_fm_classification']

# For each column: a header line, the counts, then a blank separator line
for column in columns_to_count:
    counts = df[column].value_counts()
    print(f"Value counts for {column}:", counts, "", sep="\n")

Value counts for quality:
quality
1    1147
0       4
Name: count, dtype: int64

Value counts for pre_screening:
pre_screening
1    1057
0      94
Name: count, dtype: int64

Value counts for am_fm_classification:
am_fm_classification
0    764
1    387
Name: count, dtype: int64



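The dataset consists almost entirely of images flagged as sufficient quality (1,147 of 1,151), and most images (1,057 of 1,151) have a positive pre-screening result, which indicates severe retinal abnormality.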

In [6]:
# Rename the target column to 'y', then separate the target from the features
df = df.rename(columns={'Class': 'y'})
y = df['y']
X = df.drop(columns=['y'])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=13, test_size=0.2)

# Summary statistics of the training features
X_train.describe()

In [None]:
# Call the project helper `model_generator` on the train/test split and keep its result in `test`.
# NOTE(review): the arguments are passed test-first (X_test, y_test, X_train, y_train);
# the helper's definition is not visible here — confirm this matches its parameter order.
# NOTE(review): `model_generator` is imported from two modules in the first cell; the later
# import (pipeline_utilities_v2_copy) is the one bound when this cell runs.
test = model_generator(X_test, y_test, X_train, y_train)

In [None]:
# Check whether each feature follows a normal distribution. If so, StandardScaler is a suitable choice (e.g. for PCA, linear and logistic regression, and support vector models).
# Histograms show the shape of each distribution.
# Q-Q plots compare the distribution of each feature against a theoretical normal distribution.
# The points should fall approximately along the reference line if the distribution is normal; deviations from the line suggest the data may not be normally distributed.

# Function to plot histograms and Q-Q plots for each feature
def plot_distributions(X):
    """Plot a histogram and a normal Q-Q plot for every column of ``X``.

    One figure row is drawn per column: the histogram in the left panel and
    the Q-Q plot against a normal distribution in the right panel. The figure
    is shown with ``plt.show()``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are plotted; it is read through ``X.columns`` and
        ``X[column]``.

    Returns
    -------
    None
    """
    n_features = len(X.columns)
    if n_features == 0:
        # Nothing to draw; plt.subplots rejects a grid with zero rows.
        return

    # Size the figure from X itself (4 inches of height per feature row) rather
    # than from an unrelated global frame. squeeze=False keeps `axs`
    # two-dimensional even when X has a single column, so axs[i, j] always works.
    fig, axs = plt.subplots(
        n_features, 2, figsize=(10, 4 * n_features), squeeze=False
    )

    for i, column in enumerate(X.columns):
        # Histogram
        axs[i, 0].hist(X[column], bins=20, alpha=0.7, color='blue', edgecolor='black')
        axs[i, 0].set_title(f'Histogram of {column}')

        # Q-Q plot
        stats.probplot(X[column], dist="norm", plot=axs[i, 1])
        axs[i, 1].set_title(f'Q-Q Plot of {column}')

    plt.tight_layout()
    plt.show()

# Draw the histogram / Q-Q grid for every feature column of X (the feature frame before the train/test split).
plot_distributions(X)


In [None]:
# Scale X_train and X_test with MinMaxScaler.
# The scaler is fitted on the training split only and then applied to both
# splits, so the scaling parameters never depend on the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Preview the first rows of both scaled arrays. Only the last expression of a
# cell is displayed, so both previews are returned together as one tuple.
X_train_scaled[:5], X_test_scaled[:5]

In [None]:
# Report the overall minimum and maximum of each scaled array
print("Scaled data min/max (MinMax):")
for label, scaled in (("Training", X_train_scaled), ("Testing", X_test_scaled)):
    print(f"{label} data min:", scaled.min())
    print(f"{label} data max:", scaled.max())

In [None]:
# Fit a set of baseline classifiers and print each one's accuracy, confusion
# matrix and classification report on the test split.
#
# Every classifier is wrapped in a pipeline that owns its MinMaxScaler, so the
# pipeline is fitted on the unscaled training split and applied to the unscaled
# test split. The scaler is learned from the training rows only and the
# features are scaled exactly once.

# Seed for the estimators that draw random numbers, so re-running the cell
# reproduces the same scores (13 matches the seed used for train_test_split).
MODEL_RANDOM_STATE = 13

# Define Models
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Support Vector Machine", SVC()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Decision Tree", DecisionTreeClassifier(random_state=MODEL_RANDOM_STATE)),
    ("Random Forest", RandomForestClassifier(random_state=MODEL_RANDOM_STATE)),
    ("Extremely Random Trees", ExtraTreesClassifier(random_state=MODEL_RANDOM_STATE)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=MODEL_RANDOM_STATE)),
    ("AdaBoost", AdaBoostClassifier(random_state=MODEL_RANDOM_STATE)),
    ("Naive Bayes", GaussianNB()),
]

# `name`, `model`, `pipeline` and `y_pred` stay bound to the last classifier
# after the loop; a later cell reads `name` and `y_pred`.
for name, model in models:
    pipeline = make_pipeline(MinMaxScaler(), model)

    # Train the pipeline (scaler + classifier) on the raw training features
    pipeline.fit(X_train, y_train)

    # Predict once and reuse the predictions for every metric below
    y_pred = pipeline.predict(X_test)

    # Evaluation
    print(f"{name} - Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print()


In [None]:
# Making predictions
# NOTE(review): `p_utils` is not bound anywhere in the visible notebook — its import
# (`import pipeline_utilities_v2 as p_utils`) is commented out in the first cell, so this
# line is expected to raise NameError on a fresh kernel. The behaviour of `p_utils.pipeline`
# is not visible here; confirm which helper was intended before re-enabling the import.
predictions = p_utils.pipeline(X_test)

In [None]:
 # Print the name of the classifier and its accuracy
# `name` and `y_pred` are whatever the model-comparison loop left bound (its last classifier)
last_accuracy = accuracy_score(y_test, y_pred)
print(f"{name} Accuracy: {last_accuracy:.4f}")

last_report = classification_report(y_test, y_pred)
print(last_report)

In [None]:
# Score `predictions` against the held-out labels: confusion matrix and overall accuracy
conf_matrix = confusion_matrix(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}', f'Confusion Matrix:\n{conf_matrix}', sep='\n')

In [None]:
# Create a decision tree graph: export the fitted tree to DOT, render it to
# 'DR.png', and display the image in the notebook.
# NOTE(review): `clf`, `tree`, `pydotplus` and `Image` are not defined or imported
# in the visible part of this notebook; they must be in scope (a fitted decision
# tree for `clf`) before this cell can run.
dot_data = tree.export_graphviz(
    clf, out_file=None,
    feature_names=X.columns,
    # class_names must be given in ascending order of the class label. Per the
    # UCI dataset description, 0 = no signs of DR and 1 = contains signs of DR,
    # so the "no signs" label comes first.
    class_names=["no signs of DR ", "contains signs of DR"],
    filled=True, rounded=True,
    special_characters=True)

graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('DR.png')

# Show graph
Image(graph.create_png())