# Model Quality and Improvement Project

## 1. Import the required Libraries and loading the data

In [75]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

diabetes = pd.read_csv('https://bit.ly/DiabetesDS')

## 2. Data cleaning and Preparation:
a. View first few records
b. Check the shape of data
c. Check for and deal with missing values
d. Check for and deal with duplicates

In [76]:
# Read the first 10 records

diabetes.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [77]:
# Check the shape of the data

diabetes.shape

(768, 9)

In [78]:
# Check for any missing values

diabetes.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

There are no missing values.

In [79]:
# Check for duplicates

diabetes.duplicated().sum()

0

There are no duplicates

## 3. Splitting the data into training and validation sets

In [80]:
features = diabetes.drop('Outcome', axis = 1)
target = diabetes['Outcome']

# Split data into training and validation sets

features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345
)

print(features_train.shape)
print(features_valid.shape)
print(target_train.shape)
print(target_valid.shape)

(576, 8)
(192, 8)
(576,)
(192,)


## 4.Data modelling and evaluation using various classification models

In [81]:
# Train model and predict: Decision Tree

model = DecisionTreeClassifier()
model.fit(features_train, target_train)

predicted_valid = model.predict(features_valid)
print(accuracy_score(target_valid, predicted_valid))

0.765625


In [82]:
# Train model and predict: Random Forest

model = RandomForestClassifier()
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
print(accuracy_score(target_valid, predicted_valid))

0.8020833333333334


In [83]:
# Train model and predict: LogisticRegression

model = LogisticRegression(random_state=12345)
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
print(accuracy_score(target_valid, predicted_valid))

0.828125


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 5. Hyperparameter tuning to improve the models.

### a. Decision Trees

In [84]:
# Try out various values of max_depth to see effect on accuracy

for depth in range(1, 11, 1):
  model = DecisionTreeClassifier(random_state=12345, max_depth = depth)
  model.fit(features_train, target_train)

  predicted_valid = model.predict(features_valid)
  print(accuracy_score(target_valid, predicted_valid))


0.7708333333333334
0.7708333333333334
0.7604166666666666
0.75
0.8177083333333334
0.8229166666666666
0.765625
0.7604166666666666
0.7552083333333334
0.734375


The best accuracy of 0.8229 is achieved with max_depth of 6.

### b. Random Forest

In [85]:
# Try out various values of n_estimators to see its effect on accuracy

model = RandomForestClassifier(n_estimators=15, max_depth=10, random_state=12345)
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
print(accuracy_score(target_valid, predicted_valid))

0.8385416666666666


The best accuracy 0f 0.8385 is achieved using 15 estimators of max_depth 10.

### c. Logistic Regression

In [86]:
# LogisticRegression hyperparameters

model = LogisticRegression(random_state=12345, solver='liblinear')
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
print(accuracy_score(target_valid, predicted_valid))

0.7916666666666666


## 6. Findings and Recommendations:

The best accuracy is achieved using the random forest model and this is what is recommended for use in this case.
