# Diagnostic du diabète

The Pima are a group of Native Americans living in Arizona. A genetic predisposition allowed this group to survive normally on a diet poor in carbohydrates for years. In recent years, a sudden shift from traditional agricultural crops to processed foods, together with a decline in physical activity, has led them to develop the highest prevalence of type 2 diabetes; for this reason they have been the subject of many studies.

# Dataset
The dataset includes data from 768 women with 8 characteristics, in particular:

- Pregnancies: No. of times pregnant
- Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test (mg/dL). A 2-hour value between 140 and 200 mg/dL (7.8 and 11.1 mmol/L) is called impaired glucose tolerance, or "pre-diabetes": it means you are at increased risk of developing diabetes over time. A glucose level of 200 mg/dL (11.1 mmol/L) or higher is used to diagnose diabetes.
- Blood Pressure: Diastolic blood pressure (mmHg). A diastolic B.P. > 90 means high B.P. (high probability of diabetes); a diastolic B.P. < 60 means low B.P. (lower probability of diabetes).
- Skin Thickness: Triceps skin fold thickness (mm), a value used to estimate body fat. Normal triceps skin fold thickness in women is 23 mm. Higher thickness is associated with obesity, and the chances of diabetes increase.
- Insulin: 2-hour serum insulin (mu U/ml). The normal insulin level is 16-166 mIU/L; values above this range can be alarming.
- BMI: Body Mass Index (weight in kg / (height in m)²). A BMI of 18.5 to 25 is within the normal range, a BMI between 25 and 30 falls within the overweight range, and a BMI of 30 or over falls within the obese range.
- Diabetes Pedigree Function: It provides information about diabetes history in relatives and genetic relationship of those relatives with patients. Higher Pedigree Function means patient is more likely to have diabetes.
- Age (years)
- Outcome: Class variable (0 or 1), where ‘0’ denotes that the patient does not have diabetes and ‘1’ denotes that the patient has diabetes. The dependent variable is whether the patient has diabetes or not.

In [None]:
# Jupyter directive: render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Pandas : librairie de manipulation de données
# NumPy : librairie de calcul scientifique
# MatPlotLib : librairie de visualisation et graphiques
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score,auc, accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv("../input/diabete.csv")

In [None]:
df.head(30)

In [None]:
df.diabete.value_counts()

In [None]:
df.columns

In [None]:
# Columns iterated by the distribution-plot loops in this notebook.
cont_features = [
    'glucose',
    'tension',
    'thickness',
    'insulin',
    'bmi',
    'pedigree',
    'age',
]
# Count-valued column, kept in its own list.
discrete_features = ['n_pregnant']

In [None]:
df1 = df.replace(0,np.nan)
df1.n_pregnant=df.n_pregnant
df1.diabete=df.diabete

In [None]:
df1.head(15)

In [None]:
#sns.pairplot(df, hue="diabete")

In [None]:
# One figure per continuous column of the raw frame: histogram plus KDE.
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat="density" and kde=True is its documented replacement (cut=3 keeps
# the KDE extent that distplot used).
for col in cont_features:
    plt.figure(figsize=[10, 5])
    sns.histplot(df[col], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
df1 = df.replace(0,np.nan)

In [None]:
df1.n_pregnant=df.n_pregnant

In [None]:
df1.diabete=df.diabete

In [None]:
# Shell command run from the notebook: install the missingno package
# into the user site-packages.
!pip3 install --user missingno

In [None]:
import missingno as msno

In [None]:
# missingno bar chart for df1
msno.bar(df1)

In [None]:
# missingno matrix view for df1
msno.matrix(df1)

In [None]:
# Non-null count per column of df1
df1.count()

In [None]:
values={'glucose':df1.glucose.mean(), 'tension':df1.tension.mean(), 'thickness':df1.thickness.mean(), 'bmi':df1.bmi.mean()}

In [None]:
df1=df1.fillna(value=values)

In [None]:
df1.info()

In [None]:
df_insulin_nan = df1[np.isnan(df1.insulin)].copy()

In [None]:
df_insulin = df1.drop(df_insulin_nan.index)

In [None]:
X = df_insulin.drop(['insulin'], axis=1)
y = df_insulin.insulin
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [None]:
from sklearn import ensemble
rf = ensemble.RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)
print(rf.score(X_test,y_test))

In [None]:
plt.figure(figsize=(12,12))
plt.scatter(y_test, y_rf)
plt.plot([y_test.min(),y_test.max()],[y_test.min(),y_test.max()], color='red', linewidth=3)

In [None]:
X_nan = df_insulin_nan.drop(['insulin'], axis=1)

In [None]:
y_nan = rf.predict(X_nan)

In [None]:
df_insulin_nan['insulin'] = y_nan

In [None]:
df1 = pd.concat([df_insulin, df_insulin_nan], ignore_index=True, sort=False)

In [None]:
df1.head()

In [None]:
df1.describe()

In [None]:
scaler = preprocessing.StandardScaler()
df1[['glucose', 'tension','thickness','insulin','bmi','pedigree']] = scaler.fit_transform(df1[['glucose', 'tension','thickness','insulin','bmi','pedigree']])

In [None]:
df1.describe()

In [None]:
scaler = preprocessing.MinMaxScaler()
df1[['glucose', 'tension','thickness','insulin','bmi','pedigree']] = scaler.fit_transform(df1[['glucose', 'tension','thickness','insulin','bmi','pedigree']])

In [None]:
df1.describe()

In [None]:
# Target is the `diabete` column; every other column is a feature.
y = df1['diabete']
X = df1.drop(columns=['diabete'])
# Hold out 20 % of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# One figure per continuous column of the processed frame: histogram plus KDE.
# sns.distplot is deprecated and scheduled for removal; histplot with
# stat="density" and kde=True is its documented replacement (cut=3 keeps
# the KDE extent that distplot used).
for col in cont_features:
    plt.figure(figsize=[10, 5])
    sns.histplot(df1[col], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
from sklearn import ensemble

## Régression logistique

In [None]:
# NOTE(review): the section title announces a logistic regression, but the
# estimator fitted here is a random forest (kept as in the original cell).
# Swap in LogisticRegression(solver="lbfgs") to get a baseline comparable
# with the resampling sections below.
# random_state is fixed so the reported metrics are reproducible across runs.
lr = ensemble.RandomForestClassifier(n_estimators=100, random_state=1)
lr.fit(X_train,y_train)
y_lr = lr.predict(X_test)

In [None]:
lr_score = metrics.accuracy_score(y_test, y_lr)
print(lr_score)

In [None]:
print(metrics.classification_report(y_test, y_lr))

In [None]:
cm = metrics.confusion_matrix(y_test, y_lr)
print(cm)

In [None]:
# ROC curve and its area, computed from the second column of predict_proba.
probas = lr.predict_proba(X_test)
positive_scores = probas[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

In [None]:
# Draw the ROC curve on an explicit Axes object.
fig, ax = plt.subplots(figsize=(12, 12))
ax.set_title('Receiver Operating Characteristic')
ax.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')          # worst curve (diagonal)
ax.plot([0, 0, 1], [0, 1, 1], 'g:')     # best curve
ax.set_xlim([-0.1, 1.2])
ax.set_ylim([-0.1, 1.2])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')

## Sous échantillonnage

Pour installer le package Imbalanced learn :
- lancer "Anaconda prompt" (du package Anaconda)
- exécuter "pip install imbalanced-learn"

In [None]:
!pip3 install imblearn

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows of the training set only; the test set
# is left untouched. Seeded so the resampled set is reproducible.
rus = RandomUnderSampler(random_state=1)

# fit_sample was removed from imbalanced-learn; fit_resample replaces it.
X_rus, y_rus = rus.fit_resample(X_train, y_train)

In [None]:
print(X_rus.shape)
print(y_rus.shape)

In [None]:
lr = LogisticRegression(solver="lbfgs")
lr.fit(X_rus,y_rus)
y_lr = lr.predict(X_test)

In [None]:
print(metrics.classification_report(y_test, y_lr))

In [None]:
cm = metrics.confusion_matrix(y_test, y_lr)
print(cm)

In [None]:
# ROC curve and its area, computed from the second column of predict_proba.
probas = lr.predict_proba(X_test)
positive_scores = probas[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

In [None]:
plt.figure(figsize=(12,12))
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')        # plus mauvaise courbe
plt.plot([0,0,1],[0,1,1],'g:')     # meilleure courbe
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

In [None]:
print(metrics.classification_report(y_test,y_lr))

La méthode Tomek consiste à sélectionner les données à éliminer en privilégiant les instances proches de la classe minoritaire

<img src="https://raw.githubusercontent.com/rafjaa/machine_learning_fecib/master/src/static/img/tomek.png?v=2">

In [None]:
from imblearn.under_sampling import TomekLinks

# `ratio` was replaced by `sampling_strategy`, `return_indices` by the
# fitted attribute `sample_indices_`, and `fit_sample` by `fit_resample`;
# the old spellings were removed from imbalanced-learn.
tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl = tl.fit_resample(X_train, y_train)
# Indices of the training rows that were kept.
i_tl = tl.sample_indices_

In [None]:
lr = LogisticRegression(solver="lbfgs")
lr.fit(X_tl,y_tl)
y_lr = lr.predict(X_test)

In [None]:
probas = lr.predict_proba(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,probas[:, 1])
roc_auc = auc(false_positive_rate, true_positive_rate)
print (roc_auc)

In [None]:
print(metrics.classification_report(y_test,y_lr))

## Suréchantillonnage

La méthode SMOTE (Synthetic Minority Oversampling TEchnique) consiste à synthétiser des éléments pour la classe minoritaire, à partir de ceux qui existent déjà. Elle fonctionne en choisissant au hasard un point de la classe minoritaire et en calculant les k-voisins les plus proches pour ce point. Les points synthétiques sont ajoutés entre le point choisi et ses voisins.

<img src="https://raw.githubusercontent.com/rafjaa/machine_learning_fecib/master/src/static/img/smote.png">

<img src="https://miro.medium.com/max/850/1*6UFpLFl59O9e3e38ffTXJQ.png">

In [None]:
from imblearn.over_sampling import SMOTE

# `ratio` was replaced by `sampling_strategy` and `fit_sample` by
# `fit_resample`; the old spellings were removed from imbalanced-learn.
# Seeded so the synthetic samples are reproducible.
smote = SMOTE(sampling_strategy='minority', random_state=1)
X_sm, y_sm = smote.fit_resample(X_train, y_train)

In [None]:
lr = LogisticRegression(solver="lbfgs")
lr.fit(X_sm,y_sm)
y_lr = lr.predict(X_test)

In [None]:
probas = lr.predict_proba(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,probas[:, 1])
roc_auc = auc(false_positive_rate, true_positive_rate)
print (roc_auc)

In [None]:
print(metrics.classification_report(y_test,y_lr))

On peut aussi combiner un suréchantillonnage SMOTE et un sous-échantillonnage Tomek

In [None]:
from imblearn.combine import SMOTETomek

# `ratio` was replaced by `sampling_strategy` and `fit_sample` by
# `fit_resample`; the old spellings were removed from imbalanced-learn.
# Seeded so the resampled set is reproducible.
smt = SMOTETomek(sampling_strategy='auto', random_state=1)
X_smt, y_smt = smt.fit_resample(X_train, y_train)

In [None]:
lr = LogisticRegression(solver="lbfgs")
lr.fit(X_smt,y_smt)
y_lr = lr.predict(X_test)

In [None]:
probas = lr.predict_proba(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,probas[:, 1])
roc_auc = auc(false_positive_rate, true_positive_rate)
print (roc_auc)

In [None]:
from imblearn.over_sampling import ADASYN

# `ratio` was replaced by `sampling_strategy` and `fit_sample` by
# `fit_resample`; the old spellings were removed from imbalanced-learn.
adasyn = ADASYN(sampling_strategy='minority', random_state=1)
# Resample with the ADASYN sampler itself: the original cell called the
# SMOTE sampler here, so ADASYN was never actually evaluated.
X_ad, y_ad = adasyn.fit_resample(X_train, y_train)

# Logistic regression trained on the ADASYN data, evaluated on the
# original test set.
lr = LogisticRegression(solver="lbfgs")
lr.fit(X_ad,y_ad)
y_lr = lr.predict(X_test)

# ROC curve and its area, computed from the second column of predict_proba.
probas = lr.predict_proba(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,probas[:, 1])
roc_auc = auc(false_positive_rate, true_positive_rate)
print (roc_auc)
print(metrics.classification_report(y_test,y_lr))

In [None]:
from sklearn.model_selection import learning_curve
def plot_learning_curve(est, X_train, y_train) :
    """Plot the training and validation scores of ``est`` against training-set size.

    Scores come from ``sklearn.model_selection.learning_curve`` with 5-fold
    cross-validation at ten training sizes spread evenly between 10 % and
    100 % of the data. The mean score is drawn as a line and +/- one
    standard deviation across folds as a shaded band.

    Parameters
    ----------
    est : estimator passed to ``learning_curve`` as ``estimator``.
    X_train : feature matrix passed as ``X``.
    y_train : target values passed as ``y``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=est,
        X=X_train,
        y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    # Aggregate over the fold axis: one mean / std per training size.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.figure(figsize=(8,10))
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean,color='green', linestyle='--',marker='s', markersize=5,label='validation accuracy')
    plt.fill_between(train_sizes,test_mean + test_std,test_mean - test_std,alpha=0.15, color='green')
    # The `b` keyword of grid() was renamed to `visible` and then removed
    # from matplotlib; a positional True works on every version.
    plt.grid(True)
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    # Fixed y-range: scores below 0.6 are not shown.
    plt.ylim([0.6, 1.0])
    plt.show()

In [None]:
# Learning curve of the most recently defined `lr` estimator on the
# original (non-resampled) training split.
plot_learning_curve(est=lr, X_train=X_train, y_train=y_train)