# Import Libraries

In [162]:
#Importing relevant libraries and modules
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [163]:
#Load data
# The dataset location is kept in one named constant so it can be changed in a
# single place. NOTE(review): the path looks like a Google Drive mount inside
# Colab - confirm, and point DATA_PATH elsewhere when running in another setup.
DATA_PATH = '/content/drive/MyDrive/ML Projects/Diabetes Project_SVM/diabetes.csv'
df = pd.read_csv(DATA_PATH)

In [164]:
# Preview the first five rows to confirm the CSV loaded with the expected columns
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Data Quality Exploration

In [165]:
def explore_data(df, target_col='Outcome'):
  """Print a quick data-quality report for a DataFrame.

  Four sections are printed, in this order: missing values per column
  (sorted, most missing first), column dtypes, descriptive statistics, and
  the value counts of the target column. Nothing is returned.

  Args:
    df: pandas DataFrame to inspect.
    target_col: name of the label column whose class distribution is
      reported. Defaults to 'Outcome', so existing one-argument calls print
      exactly what they printed before.

  Raises:
    KeyError: if target_col is not a column of df. The check runs before any
      printing so a wrong column name does not leave a half-finished report.
  """
  if target_col not in df.columns:
    raise KeyError(f"Target column {target_col!r} not found in DataFrame")

  # Checking for missing values
  print("Missing values per column:")
  print(df.isnull().sum().sort_values(ascending=False))
  # Displaying data types
  print("**********")
  print("\nData types of columns:")
  print(df.dtypes)
  # Displaying descriptive statistics
  print("**********")
  print("\nDescriptive statistics:")
  print(df.describe())
  # Check for even distribution of classes
  print("**********")
  print("\nClass distribution:")
  print(df[target_col].value_counts())

In [166]:
# Print the data-quality report: missing values, dtypes, summary statistics and class balance
explore_data(df)

Missing values per column:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
**********

Data types of columns:
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object
**********

Descriptive statistics:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15

**0 stands for No Diabetes**

**1 stands for Diabetes**

In [167]:
# Getting a rough idea of main contributors to outcome:
# mean of every feature within each class (Outcome 0 = no diabetes, 1 = diabetes).
# Features whose means differ most between the two rows are candidate signals.
df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


# Data Preparation and Model Training

In [168]:
#Separating Features from Labels
# X: every column except the label; Y: the label column on its own
X = df.drop(columns=['Outcome'])
Y = df['Outcome']

In [169]:
#Standardizing the features
# Fit from the raw feature columns of df rather than from X: this cell rebinds X
# to the standardized array, so fitting on X would make a second run of the cell
# refit the scaler on already-standardized data and silently break
# scaler.transform() for raw inputs later in the notebook.
# NOTE: the scaler is fitted on all rows before the train/test split, so test-set
# statistics influence the scaling; fitting on the training rows only would be
# the stricter protocol.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(df.drop(columns='Outcome'))
X = standardized_data

In [170]:
#Split data into training and test sets
# 80/20 split, stratified on Y so both parts keep the class ratio;
# the fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [171]:
# Train a Support Vector Machine classifier with a linear kernel
model = svm.SVC(kernel='linear')
model.fit(X_train, Y_train)

# Evaluate the model
# accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric, so the
# numbers are the same either way, but keeping the documented order avoids a
# silent error if the metric is later swapped for an asymmetric one.
# Accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

# Accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

print('Accuracy on training data:', training_data_accuracy)
print('Accuracy on test data:', test_data_accuracy)

Accuracy on training data: 0.7866449511400652
Accuracy on test data: 0.7727272727272727


# Implementing Prediction

In [176]:
def diabetes_check(input):
  """Print the diabetes prediction for a single patient record.

  The record is standardized with the module-level `scaler` and classified
  with the module-level `model`; both must already be fitted. Nothing is
  returned - the verdict is printed.

  Args:
    input: one record as a flat sequence of feature values, in the same
      column order the scaler was fitted with.

  Raises:
    ValueError: if the number of values does not match the number of
      features the scaler was fitted on.
  """
  #Converting manual input to a numpy array and reshaping it to a single record
  record = np.asarray(input).reshape(1, -1)

  #Fail early with a readable message instead of an error from deep inside sklearn
  expected = getattr(scaler, 'n_features_in_', None)
  if expected is not None and record.shape[1] != expected:
    raise ValueError(
        f"Expected {expected} feature values, got {record.shape[1]}")

  #If the scaler was fitted on a DataFrame, hand it a DataFrame with the same
  #column names so sklearn does not warn about missing feature names
  feature_names = getattr(scaler, 'feature_names_in_', None)
  if feature_names is not None:
    record = pd.DataFrame(record, columns=feature_names)

  #standardize input
  record_scaled = scaler.transform(record)
  #Getting model to predict outcome based on input
  prediction = model.predict(record_scaled)

  if prediction[0] == 0:
    print('No Diabetes Predicted')
  else:
    print('Diabetes Predicted')

In [196]:
# Values are positional and follow the dataset's feature column order:
# Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
# DiabetesPedigreeFunction, Age
diabetes_check([0,131,0,0,0,43.2,0.27,26])

[1]
Diabetes Predicted


