Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from streamlit_option_menu import option_menu
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data Collection and Analysis

PIMA Diabetes Dataset

In [2]:
# test_import.py
from sklearn import datasets
print("scikit-learn is installed correctly")


scikit-learn is installed correctly


In [3]:
# loading the diabetes dataset to a pandas DataFrame
diabetes_dataset = pd.read_csv('diabetes.csv') 

In [4]:
# printing the first 5 rows of the dataset
diabetes_dataset.head()

Unnamed: 0,Patient Id,Gender,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,m,148,72,35,0,33.6,0.627,50,1
1,2,f,85,66,29,0,26.6,0.351,31,0
2,3,m,183,64,0,0,23.3,0.672,32,1
3,4,f,89,66,23,94,28.1,0.167,21,0
4,5,m,137,40,35,168,43.1,2.288,33,1


In [5]:
# number of rows and Columns in this dataset
diabetes_dataset.shape

(830, 10)

In [6]:
# getting the statistical measures of the data
diabetes_dataset.describe()

Unnamed: 0,Patient Id,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0
mean,415.5,121.471084,68.863855,20.46747,82.060241,31.988072,0.4703,33.254217,0.354217
std,239.744656,31.659695,19.867887,15.881521,117.412951,7.934926,0.327504,11.623743,0.478564
min,1.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,208.25,100.0,62.0,0.0,0.0,27.325,0.24325,24.0,0.0
50%,415.5,118.0,72.0,23.0,27.0,32.0,0.3705,29.0,0.0
75%,622.75,141.0,80.0,32.0,130.0,36.6,0.62675,41.0,1.0
max,830.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
diabetes_dataset['Outcome'].value_counts()

Outcome
0    536
1    294
Name: count, dtype: int64

0 --> Non-Diabetic

1 --> Diabetic

In [8]:
print(diabetes_dataset.dtypes)


Patient Id                    int64
Gender                       object
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object


In [9]:
diabetes_dataset = diabetes_dataset.apply(pd.to_numeric, errors='coerce')


In [10]:
diabetes_dataset = diabetes_dataset.fillna(0)  # Replace NaNs with 0
# Or use another strategy, e.g., diabetes_dataset = diabetes_dataset.dropna()


In [11]:
result = diabetes_dataset.groupby('Outcome').mean()
print(result)


         Patient Id  Gender     Glucose  BloodPressure  SkinThickness  \
Outcome                                                                 
0        419.188433     0.0  110.395522      67.768657      19.350746   
1        408.775510     0.0  141.663265      70.860544      22.503401   

            Insulin        BMI  DiabetesPedigreeFunction        Age  
Outcome                                                              
0         70.757463  30.359888                  0.429815  31.061567  
1        102.666667  34.956463                  0.544109  37.251701  


In [12]:
diabetes_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Patient Id,Gender,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,419.188433,0.0,110.395522,67.768657,19.350746,70.757463,30.359888,0.429815,31.061567
1,408.77551,0.0,141.663265,70.860544,22.503401,102.666667,34.956463,0.544109,37.251701


In [13]:
# separating the data and labels
X = diabetes_dataset.drop(columns = 'Outcome', axis=1)
Y = diabetes_dataset['Outcome']

In [14]:
print(X)

     Patient Id  Gender  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0             1     0.0      148             72             35        0  33.6   
1             2     0.0       85             66             29        0  26.6   
2             3     0.0      183             64              0        0  23.3   
3             4     0.0       89             66             23       94  28.1   
4             5     0.0      137             40             35      168  43.1   
..          ...     ...      ...            ...            ...      ...   ...   
825         826     0.0      134             58             20      291  26.4   
826         827     0.0      102             74              0        0  29.5   
827         828     0.0      187             50             33      392  33.9   
828         829     0.0      173             78             39      185  33.8   
829         830     0.0       94             72             18        0  23.1   

     DiabetesPedigreeFuncti

In [15]:
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
825    0
826    0
827    1
828    1
829    0
Name: Outcome, Length: 830, dtype: int64


Train Test Split

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)

In [17]:
print(X.shape, X_train.shape, X_test.shape)

(830, 9) (664, 9) (166, 9)


Training the Model

In [18]:
classifier = svm.SVC(kernel='linear')

In [19]:
#training the support vector Machine Classifier
classifier.fit(X_train, Y_train)

In [20]:
# number of rows and columns in the dataframe
diabetes_dataset.shape

(830, 10)

In [21]:
diabetes_dataset.isnull()

Unnamed: 0,Patient Id,Gender,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
825,False,False,False,False,False,False,False,False,False,False
826,False,False,False,False,False,False,False,False,False,False
827,False,False,False,False,False,False,False,False,False,False
828,False,False,False,False,False,False,False,False,False,False


In [22]:
diabetes_dataset.isnull()

Unnamed: 0,Patient Id,Gender,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
825,False,False,False,False,False,False,False,False,False,False
826,False,False,False,False,False,False,False,False,False,False
827,False,False,False,False,False,False,False,False,False,False
828,False,False,False,False,False,False,False,False,False,False


In [23]:
import seaborn as sns
sns.heatmap(diabetes_dataset.isnull())

<Axes: >

Model Evaluation

In [24]:
from matplotlib import pyplot as plt


plt.figure(figsize=(10,5))
sns.countplot(x='Glucose',data=diabetes_dataset)

<Axes: xlabel='Glucose', ylabel='count'>

Accuracy Score

In [25]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [26]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7740963855421686


In [27]:
# accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [28]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7771084337349398


In [29]:
import plotly.graph_objects as go

In [30]:
label = diabetes_dataset.Age.sort_values().unique()


In [31]:
# Assuming diabetes_dataset is your DataFrame

# Sort and get unique values from the 'age' column
label = diabetes_dataset['Age'].sort_values().unique()

# Assign the target column to target variable, assuming the target column is 'target'
target = diabetes_dataset['SkinThickness']

# Printing to verify the results
print("Unique ages:", label)
print("Target variable:", target.head())


Unique ages: [21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
 69 70 72 81]
Target variable: 0    35
1    29
2     0
3    23
4    35
Name: SkinThickness, dtype: int64


In [32]:
label = diabetes_dataset.Age.sort_values().unique()       
target = diabetes_dataset['SkinThickness']  # Accessing SkinThickness column


In [33]:
target = diabetes_dataset[diabetes_dataset['SkinThickness'] > 0]['SkinThickness']


In [34]:
fig = go.Figure()
fig.add_trace(go.Bar(x=label,y=target))
fig.update_layout(title = 'Diabetes Patients ',xaxis=dict(title='Age'),yaxis=dict(title='SkinThickness'))
fig.show()

In [35]:
import numpy as np

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import log_loss, f1_score
from sklearn.model_selection import cross_val_score
import numpy as np 
acc_dict = {}
# create the data
X = diabetes_dataset.drop('Age',axis = 1)
y = diabetes_dataset['Glucose']
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [37]:
# from matplotlib import pyplot as plt
# from sklearn.ensemble import  RandomForestClassifier
# # create model
# model = RandomForestClassifier()

# # fit the data in the model
# model.fit(X_train,y_train)

# y_pred_randomF = model.predict(X_test)
# print('Accuracy score : ',accuracy_score(y_test, y_pred_randomF)*100)

# acc_dict['RFC_log_loss'] = log_loss(y_test, y_pred_randomF)
# acc_dict['RFC_F!1_Score'] = f1_score(y_test, y_pred_randomF,average='weighted')
# # prediction visualization
# plt.imshow(np.log(confusion_matrix(y_test,y_pred_randomF)),cmap = 'Blues',interpolation = 'nearest')
# plt.ylabel('True')
# plt.xlabel('Predicted')
# plt.show()

## KMeans Clustering

In [38]:
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import numpy as np

# Assuming X_train, X_test, y_train, y_test are your training and test data
# Replace them with your actual data

# Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Initialize KMeans model
kmeans = KMeans(n_clusters=2)  # Assuming 2 clusters, you can adjust this number

# Fit KMeans model on the imputed training data
kmeans.fit(X_train_imputed)

# Predict clusters for the test data
clusters = kmeans.predict(X_test_imputed)

# Calculate accuracy score (if true labels are available)
accuracy = accuracy_score(y_test, clusters)  # Replace y_test with your actual labels
print('Accuracy score:', accuracy)

import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(X_test_imputed[:, 0], X_test_imputed[:, 1], c=clusters, cmap='viridis')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('KMeans Clustering')
plt.colorbar(label='Cluster')
plt.show()


Accuracy score: 0.0



FigureCanvasAgg is non-interactive, and thus cannot be shown



## Decision Tree Classifier

In [39]:
# from sklearn.tree import DecisionTreeClassifier
# tree_ = DecisionTreeClassifier()
# tree_.fit(X_train,y_train)
# y_pred = tree_.predict(X_test)
# print('Accuracy score : ',accuracy_score(y_test, y_pred)*100)
# acc_dict['Tree_log_loss'] = log_loss(y_test,y_pred)
# acc_dict['Tree_f!1_score'] = f1_score(y_test,y_pred)

# # prediction visualization
# plt.imshow(np.log(confusion_matrix(y_test,y_pred)),cmap = 'Blues',interpolation = 'nearest')
# plt.ylabel('True')
# plt.xlabel('Predicted')
# plt.show()

## Support Vector Classifier

In [40]:
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

model = SVC()
model.fit(X_train_imputed, y_train)

X_test_imputed = imputer.transform(X_test)
y_pred = model.predict(X_test_imputed)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy score :', accuracy * 100)

# prediction visualization
plt.imshow(np.log(confusion_matrix(y_test,y_pred)),cmap = 'Blues',interpolation = 'nearest')
plt.ylabel('True')
plt.xlabel('Predicted')
plt.show()


Accuracy score : 2.8846153846153846



divide by zero encountered in log


FigureCanvasAgg is non-interactive, and thus cannot be shown



Making a Predictive System

In [41]:
import numpy as np
from sklearn.svm import SVC

# Assuming 'classifier' is your trained SVC model

# Example training data shape (replace with actual shape)
# X_train.shape = (n_samples, 10)

# Input data (should have 10 features)
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51, 0)  # Adjust this tuple to have the correct number of features

# Convert the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Make a prediction
prediction = classifier.predict(input_data_reshaped)
print(prediction)

# Output the prediction
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')


[1]
The person is diabetic



X does not have valid feature names, but SVC was fitted with feature names



In [42]:
# input_data = (5,166,72,19,175,25.8,0.587,51)

# # changing the input_data to numpy array
# input_data_as_numpy_array = np.asarray(input_data)

# # reshape the array as we are predicting for one instance
# input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# prediction = classifier.predict(input_data_reshaped)
# print(prediction)

# if (prediction[0] == 0):
#   print('The person is not diabetic')
# else:
#   print('The person is diabetic')

Saving the trained model

In [43]:
import pickle

In [44]:
filename = 'diabetes_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [45]:
# loading the saved model
loaded_model = pickle.load(open('diabetes_model.sav', 'rb'))

In [46]:
input_data = (5,166,72,19,175,25.8,0.587,51,45)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[1]
The person is diabetic



X does not have valid feature names, but SVC was fitted with feature names



In [47]:
for column in X.columns:
  print(column)

Patient Id
Gender
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Outcome


# Accuracy Report

|   Algorithm                  |   Accuracy score  |      
|------------------------------|-------------------|
|          RFC                 |      100.00       |       
|   Decision Tree Classifier   |      100.00       | 
|   Support Vector Classifier  |      70.00        |      
