### Import the Dependencies

In [49]:
pip install plotly

Defaulting to user installation because normal site-packages is not writeable
Collecting plotly
  Downloading plotly-5.11.0-py2.py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hCollecting tenacity>=6.2.0
  Downloading tenacity-8.1.0-py3-none-any.whl (23 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.11.0 tenacity-8.1.0
Note: you may need to restart the kernel to use updated packages.


In [33]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from xgboost import XGBClassifier 

### Prepare the data

In [2]:
# Load the dataset
data = pd.read_csv('/Users/leslie/Desktop/DATA606/diabetes_B (1).csv')

In [5]:
# Deal with missing values
# Select rows where our target is not NaN
data = data[data['Have_Diabetes'].notna()]

# Fill in missing values of other columns with mean 
col_mean = np.ceil(data.mean())

data.fillna(col_mean, inplace=True)

# Check if it works
data.isnull().sum()

Good Health                              0
Health Coverage                          0
High Blood Pressure                      0
High Cholesterol                         0
CHD-MI                                   0
Asthma Status                            0
Diagnosed Arthritis                      0
Race/Ethnicity                           0
BMI Category                             0
Overweight/Obese                         0
Education Level                          0
Income Category                          0
Smoker Status                            0
Heavy Drinker                            0
Consume Fruit                            0
Consume Veggie                           0
Physical Activity Categories             0
Aerobic Reccomendations                  0
Muscle Strengthening Recommendation      0
Good Physical Health                     0
Good Mental Health                       0
High Cost of Medical                     0
Routine Check-up                         0
Taking BP M

In [65]:
# Check for duplicates
#check for row duplication
dupes= data[data.duplicated()]
print('number of rows = ', len(dupes))
dupes.head()

data.drop_duplicates(inplace= True)

number of rows =  9988


In [68]:
key_features = ['Pre-diabetic', 'Taking Insulin','High Blood Pressure', 'BMI Category', 'Age','Good Health', 'Routine Check-up', 'High Cholesterol', 'Income Category', 'CHD-MI','Kidney Disease', 'Race/Ethnicity','Taking BP Meds', 'Heavy Drinker','Have_Diabetes']

In [69]:
# Select columns of great importance to prediction based on previous work
new_df = data.loc[:, key_features]

In [64]:
for col in new_df.columns:
          print(f"Unique values in column '{col}': {new_df[col].unique()}")

Unique values in column 'Pre-diabetic': [0. 1. 2. 7. 9.]
Unique values in column 'Taking Insulin': [1. 0. 9.]
Unique values in column 'High Blood Pressure': [1. 0.]
Unique values in column 'BMI Category': [4. 3. 2. 1.]
Unique values in column 'Age': [ 9.  7. 11. 13. 10. 12.  8.  4.  6.  2.  3.  5.  1.]
Unique values in column 'Good Health': [0. 1.]
Unique values in column 'Routine Check-up': [1. 4. 3. 2. 0. 7. 9.]
Unique values in column 'High Cholesterol': [1. 0.]
Unique values in column 'Income Category': [2. 1. 5. 4. 3.]
Unique values in column 'CHD-MI': [0. 1.]
Unique values in column 'Kidney Disease': [0. 7. 1. 9.]
Unique values in column 'Race/Ethnicity': [1. 7. 2. 3. 6. 4. 8. 5.]
Unique values in column 'Taking BP Meds': [1. 0. 7. 9.]
Unique values in column 'Heavy Drinker': [0. 9. 1.]
Unique values in column 'Have_Diabetes': [0. 1.]


In [70]:
new_df.to_csv('./data_nonull.csv')

In [6]:
# Define X and Y
X = data.drop('Have_Diabetes', axis=1)
y = data.Have_Diabetes

# Split the train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.3, random_state=1000)

In [7]:
print(X.shape, X_train.shape, X_test.shape)

(325841, 40) (228088, 40) (97753, 40)


In [22]:
class_size = data['Have_Diabetes'].value_counts()
class_size

0.0    283909
1.0     41932
Name: Have_Diabetes, dtype: int64

In [25]:
print(f'No Diabetes:{class_size[0]} and Having Diabtes:{class_size[1]}')
scale_pos_weight = class_size[0]/class_size[1] # total negative examples / total positive examples
print(f'The estimated sale_pos_weight is {scale_pos_weight}')

No Diabetes:283909 and Having Diabtes:41932
The estimated sale_pos_weight is 6.770700181245827


### Train the model

In [26]:
# Fit the classifier again with previously tuned hyperparameters
xgb_tuned = XGBClassifier(min_child_weight = 1,
                             max_depth=1,
                             n_estimators=152, 
                             scale_pos_weight = scale_pos_weight)  # Handle the class imbalance

xgb_tuned.fit(X_train, y_train)


In [27]:
# Apply the threshold we trained before for higher recall
predicted_xgb = xgb_tuned.predict(X_test, ntree_limit=np.argmax(xgb_tuned.predict_proba(X_test)[:,1] >= 0.0935))

In [30]:
# Load the two performance evaluation functions
def get_scores(y_test, y_pred, model):
        """
        Build a data frame containing all classification metrics and confusion matrix results.
        """
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel() # Convert to 1-D array
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        specificity = tn/(tn+fp)

        all_scores = {'Model Name':[model], 'Precision':[precision],
                    'Recall':[recall], 'F1 Score':[f1],
                    'Specificity':[specificity], 'Accuracy':[acc]}
        df_score = pd.DataFrame(data=all_scores)
        return df_score



In [31]:
get_scores(y_test, predicted_xgb, 'XGBoost, optimized t')

Unnamed: 0,Model Name,Precision,Recall,F1 Score,Specificity,Accuracy
0,"XGBoost, optimized t",0.417823,0.811367,0.551596,0.833022,0.830235


### Make a Predictive System

In [40]:
row_data = data.iloc[0].apply(lambda x: str(x)).tolist()

In [43]:
# Retrieve the first row data as an input example
row_data = data.iloc[0,:-1].apply(lambda x: str(x)).tolist()

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(row_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = xgb_tuned.predict(X_test, ntree_limit=np.argmax(xgb_tuned.predict_proba(X_test)[:,1] >= 0.0935))
print(prediction)

if (prediction[0] == 0):
  print('The respondent is not predicted diabetic')
else:
  print('The repondent is predicted diabetic')

[0 0 0 ... 0 1 0]
The respondent is not predicted diabetic


### Save the trained model

In [35]:
filename = 'diabetes_model.sav'
pickle.dump(xgb_tuned, open(filename, 'wb'))

# Load the saved model
loaded_model = pickle.load(open('diabetes_model.sav', 'rb'))

In [46]:
# Retrieve the first row data as an input example
row_data = data.iloc[0,:-1].apply(lambda x: str(x)).tolist()

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(row_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = loaded_model.predict(X_test, ntree_limit=np.argmax(xgb_tuned.predict_proba(X_test)[:,1] >= 0.0935))
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[0 0 0 ... 0 1 0]
The person is not diabetic
