In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz

# Read the diabetes records from disk (point this at your own CSV if needed).
diabetes = pd.read_csv('diabetes.csv')

# Work on a reproducible random half of the rows to keep experiments quick;
# raise or lower `frac` to taste.
diabetes_subset = diabetes.sample(frac=0.5, random_state=42)

# Hold out a test split. Every column except 'Outcome' is a feature, and the
# split is stratified on 'Outcome'.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes_subset.drop(columns='Outcome'),
    diabetes_subset['Outcome'],
    random_state=66,
    stratify=diabetes_subset['Outcome'],
)

# Oversample with SMOTE. Only the training split is resampled; the test split
# is left exactly as drawn.
smote = SMOTE(random_state=0)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fit the min-max scaler on the resampled training features, then apply that
# same fitted scaler to the test features.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Base learners, each seeded so repeated runs give the same models.
rf = RandomForestClassifier(n_jobs=-1, random_state=0)  # n_jobs=-1: train on every available core
dt = DecisionTreeClassifier(random_state=0)
svm = SVC(random_state=0, probability=True)
ann = MLPClassifier(random_state=0, max_iter=1000, hidden_layer_sizes=(100, 50))

# Deliberately small random-forest search space: 2 x 2 x 2 x 2 = 16 candidates.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# 5-fold cross-validated grid search over the forest's hyperparameters.
# NOTE(review): the folds are cut from the already-resampled training data, so
# the CV scores are presumably optimistic -- consider resampling inside each
# fold instead; confirm before relying on these scores.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0, n_jobs=-1),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train_scaled, y_train_resampled)

# Keep the winning forest for use in the ensemble.
best_rf_model = grid_search.best_estimator_

# Members of the ensemble: the tuned forest plus the tree, SVM and MLP.
ensemble_members = [
    ('rf', best_rf_model),
    ('dt', dt),
    ('svm', svm),
    ('ann', ann),
]

# 'hard' voting: the ensemble's prediction is decided by the members'
# predicted class labels rather than by averaged probabilities.
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')

# Fit every member on the scaled, resampled training data.
voting_clf.fit(X_train_scaled, y_train_resampled)

# Ensemble predictions for the scaled test features.
combined_predictions = voting_clf.predict(X_test_scaled)

# Fuzzy string-matching post-processing step for predicted labels
def apply_fuzzy_logic(predictions, threshold=80, target_label='Your_Target_Label'):
    """Snap predictions that closely resemble ``target_label`` onto that label.

    Each prediction is converted to a string and compared with the string form
    of ``target_label`` using ``fuzz.ratio``. When the similarity score reaches
    ``threshold`` the prediction is replaced by ``target_label``; otherwise it
    is passed through unchanged.

    Parameters
    ----------
    predictions : iterable
        Predicted labels (any type; compared via ``str(prediction)``).
    threshold : int, default 80
        Minimum ``fuzz.ratio`` score for a prediction to be replaced.
    target_label : default 'Your_Target_Label'
        Label to snap near-matches onto. The default is a placeholder: with
        numeric class labels such as 0/1 it is not expected to match anything,
        so the function acts as a pass-through until a real label is supplied.

    Returns
    -------
    list
        One entry per input prediction, in the original order.
    """
    # Compare against the string form so non-string labels can be used as targets.
    target_text = str(target_label)
    return [
        target_label if fuzz.ratio(str(prediction), target_text) >= threshold else prediction
        for prediction in predictions
    ]

# Run the fuzzy post-processing step over the ensemble's test-set predictions.
fuzzy_combined_predictions = apply_fuzzy_logic(combined_predictions)

# Score the post-processed labels against the held-out labels and report the
# result as a percentage with one decimal place.
accuracy = accuracy_score(y_true=y_test, y_pred=fuzzy_combined_predictions)
accuracy_percent = accuracy * 100
print("Fuzzy Combined Test Set Accuracy: {:.1f}%".format(accuracy_percent))

Fuzzy Combined Test Set Accuracy: 84.4%




In [2]:
# Single-patient prediction with the in-memory ensemble.
# Relies on `scaler`, `voting_clf` and `np` from the training cell.

# Feature values for one patient; they must be in the same column order the
# scaler was fitted on.
# Alternative sample to try: (1,85,66,29,0,26.6,0.351,31)
input_data = (6,148,72,35,0,33.6,0.627,50)

# Convert the tuple to a NumPy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape to (1, n_features) because we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Verify the scaler BEFORE using it: a fitted MinMaxScaler exposes `scale_`,
# so its absence means transform() would fail.
if scaler is None or not hasattr(scaler, "scale_"):
    print("Scaler is not fitted. Please fit the scaler first.")
else:
    # Apply the same min-max scaling that was fitted on the training data
    std_data = scaler.transform(input_data_reshaped)

    prediction = voting_clf.predict(std_data)

    if prediction[0] == 0:
        print('The person is not diabetic')
    else:
        print('The person is diabetic')

The person is diabetic




In [3]:
import pickle

# Persist the trained ensemble to disk. Only the classifier is written here;
# the fitted `scaler` is not part of this file, so inference on the reloaded
# model still needs the same scaler.
filename = 'diabetes_kowchik_model2.sav'

# Use a context manager so the file handle is flushed and closed even if
# pickling fails part-way.
with open(filename, 'wb') as model_file:
    pickle.dump(voting_clf, model_file)

In [4]:
# Reload the ensemble written by the save cell ('diabetes_kowchik_model2.sav').
# SECURITY: pickle can execute arbitrary code while loading -- only load files
# you created yourself.
with open('diabetes_kowchik_model2.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Feature values for one patient; they must be in the same column order the
# scaler was fitted on.
input_data = (6,148,72,35,0,33.6,0.627,50)

# Convert the tuple to a NumPy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape to (1, n_features) because we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Verify the scaler BEFORE using it: a fitted MinMaxScaler exposes `scale_`,
# so its absence means transform() would fail.
if scaler is None or not hasattr(scaler, "scale_"):
    print("Scaler is not fitted. Please fit the scaler first.")
else:
    # Apply the same min-max scaling that was fitted on the training data
    std_data = scaler.transform(input_data_reshaped)

    prediction = loaded_model.predict(std_data)

    if prediction[0] == 0:
        print('The person is not diabetic')
    else:
        print('The person is diabetic')


The person is diabetic


