## About data
The dataset consists of several medical predictor (independent) variables and one target (dependent) variable, Outcome. The independent variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

## About Dataset
**Context**
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

**Content**
The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

**Acknowledgements**
Smith, J.W., Everhart, J.E., Dickson, W.C., Knowler, W.C., & Johannes, R.S. (1988). Using the ADAP learning algorithm to forecast the onset of diabetes mellitus. In Proceedings of the Symposium on Computer Applications and Medical Care (pp. 261--265). IEEE Computer Society Press.

**Inspiration**
Can you build a machine learning model to accurately predict whether or not the patients in the dataset have diabetes?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
# Notebook display preferences: never truncate columns or rows, render floats
# with three decimals, and allow wide (500-character) output lines.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.float_format = '{:.3f}'.format
pd.options.display.width = 500

In [None]:
df=pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df.corr()


In [None]:
df.describe().T

 ## Age Group Analysis

In [None]:
# Bucket patients by age: (-1, 30] -> 'Young Adults', (30, 50] -> 'Middle-Aged',
# (50, inf) -> 'Seniors' (pd.cut intervals are right-inclusive by default).
df['Age_Group'] = pd.cut(
    df['Age'],
    bins=[-1, 30, 50, np.inf],
    labels=['Young Adults', 'Middle-Aged', 'Seniors'],
)
df.head()

## Outlier Analysis

In [None]:
# Outlier Analysis
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Return the (lower, upper) outlier fences of a column.

    The fences sit 1.5 inter-quantile ranges below the ``q1`` quantile and
    above the ``q3`` quantile of ``dataframe[col_name]``.

    Parameters:
    - dataframe: Pandas DataFrame holding the column.
    - col_name: Name of the column to analyse.
    - q1: Lower quantile (default 0.25).
    - q3: Upper quantile (default 0.75).

    Returns:
    - Tuple ``(low_limit, up_limit)``.
    """
    series = dataframe[col_name]
    lower_q, upper_q = series.quantile(q1), series.quantile(q3)
    whisker = 1.5 * (upper_q - lower_q)
    return lower_q - whisker, upper_q + whisker

In [None]:
outlier_thresholds(df, 'Insulin')

In [None]:
# Presumably the pasted output of the previous cell, kept for reference:
# outlier_thresholds(df, 'Insulin') -> (-190.875, 318.125)
def check_outlier(dataframe, col_name):
    """Report whether a column holds any value outside its outlier fences.

    Parameters:
    - dataframe: Pandas DataFrame holding the column.
    - col_name: Name of the column to check.

    Returns:
    - True if at least one value lies strictly below the lower fence or
      strictly above the upper fence given by ``outlier_thresholds``,
      otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    return bool(((column > up_limit) | (column < low_limit)).any())

In [None]:
check_outlier(df, 'Insulin')

## Capturing Numerical and Categorical Variables

In [None]:
def num_cat(df, cat_th=10):
    """Split the columns of ``df`` into numerical and categorical lists.

    A numeric column with at least ``cat_th`` distinct values is treated as
    numerical. Object-dtype columns, plus any column with fewer than
    ``cat_th`` distinct values, are treated as categorical. Using ``>=`` /
    ``<`` makes the two rules exhaustive for numeric columns, so a column
    with exactly ``cat_th`` distinct values is no longer dropped from both
    lists.

    Parameters:
    - df: Pandas DataFrame to inspect.
    - cat_th: Distinct-value count below which a column counts as
      categorical (default 10).

    Returns:
    - Tuple ``(num_list, cat_list)`` of column-name lists.
    """
    # Count distinct values once per column instead of once per rule.
    unique_counts = {col: df[col].nunique() for col in df.columns}
    numeric_cols = set(df.select_dtypes(include="number").columns)

    num_list = [col for col in df.columns
                if col in numeric_cols and unique_counts[col] >= cat_th]

    cat_list = df.select_dtypes(include="object").columns.to_list()
    cat_list += [col for col in df.columns
                 if unique_counts[col] < cat_th and col not in cat_list]

    return num_list, cat_list

num_list,cat_list = num_cat(df)

##  Analyzing Target Variable
Average of numerical variables according to the target variable.

Average of the target variable according to categorical variables.

In [None]:
# Analyze the target variable.
# A notebook cell only echoes its final expression, so the first table is
# printed explicitly; otherwise it is computed and silently discarded.
print(df.groupby('Outcome')[num_list].mean())
df.groupby('Outcome')[cat_list].count()

## Correlation Analysis

In [None]:
num_list2 = df.select_dtypes(include=['float64', 'int64'])

def correlation_analysis(dataframe):
    """
    Calculates and visualizes correlations between all numerical variables.

    Parameters
    - dataframe: Pandas DataFrame, the data set to be analyzed for correlation.
      Non-numeric columns are ignored.
    """
    # Work on the argument rather than the global `df`, so the function
    # analyses whatever frame the caller passes in.
    numeric_part = dataframe.select_dtypes(include="number")
    corr_matrix = numeric_part.corr()

    # Hide the upper triangle (and diagonal): the matrix is symmetric, so
    # those cells only repeat the lower triangle.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    # Use heatmap for visualization
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=.5, mask=mask)
    plt.xticks(rotation=20)
    plt.yticks(rotation=20)
    plt.title("Correlation Matrix Between All Variables")
    plt.show()

correlation_analysis(num_list2)

In [None]:
def replace_with_thresholds(dataframe, variable):
    """Cap the outliers of one column at its outlier fences, in place.

    Values below the lower fence are raised to it and values above the upper
    fence are lowered to it; the fences come from ``outlier_thresholds``.

    Parameters:
    - dataframe: Pandas DataFrame to modify (the column is reassigned on it).
    - variable: Name of the column to cap.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # clip() returns a new column with a suitable dtype, which avoids writing
    # float limits element-wise into an integer column through .loc.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)

# Cap outliers in every numerical column and report, per column, whether
# outliers were present before and after capping. One pass replaces the three
# original loops, the last of which only repeated the post-capping check.
for col in num_list:
    had_outliers = check_outlier(df, col)
    replace_with_thresholds(df, col)
    print(col, had_outliers, '->', check_outlier(df, col))

In [None]:
df.head(10)

## Handling Missing Values

In [None]:
def replace_zero_with_nan(dataframe, columns):
    """
    Changes all zero values in certain columns to NaN, modifying the frame in place.

    Parameters:
    - dataframe: Pandas DataFrame, the dataset to be processed.
    - columns: List, the column names where you want to make the zero values NaN.

    Returns:
    - None; ``dataframe`` itself is updated.
    """
    # Assign whole columns (not .loc[:, columns]) so integer columns can be
    # replaced by float columns able to hold NaN, instead of attempting an
    # in-place write of NaN into an integer block.
    dataframe[columns] = dataframe[columns].replace(0, np.nan)

replace_zero_with_nan(df, columns=['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                                   'BMI', 'DiabetesPedigreeFunction', 'Age'])

In [None]:
# We need to fill NaN to be able to build the model.
missing_ones = df.isnull().sum()[df.isnull().sum()>0].index

def fill_based_cat(data, columns, based_cat, metric):
    """Fill missing values with a per-group statistic and return a new frame.

    For every column in ``columns``, NaN entries are replaced by ``metric``
    (e.g. "median") computed over the rows sharing the same ``based_cat``
    value. The input frame is left untouched.

    Parameters:
    - data: Pandas DataFrame containing the columns and the grouping column.
    - columns: Iterable of column names whose NaN values should be filled.
    - based_cat: Name of the column to group by.
    - metric: Aggregation accepted by ``GroupBy.transform`` (e.g. "median").

    Returns:
    - A filled copy of ``data``.
    """
    data = data.copy()
    for col in columns:
        # Group statistics come from the frame passed in, not a global.
        group_stat = data.groupby(based_cat)[col].transform(metric)
        data[col] = data[col].fillna(group_stat)
    return data

df = fill_based_cat(df,missing_ones,based_cat="Age_Group",metric="median")


In [None]:
# Sanity check after imputation: True when no missing Glucose values remain.
# (Comparing a numeric column against the string 'NaN' can never match.)
df['Glucose'].notna().all()

##  Creating New Variables

In [None]:
# Pregnancy category: 0 -> Nulliparous, 1 -> Primiparous, more -> Multiparous.
df['Pregnancy_Category'] = pd.cut(
    df['Pregnancies'],
    bins=[-1, 0, 1, np.inf],
    labels=['Nulliparous', 'Primiparous', 'Multiparous'],
)

# Blood pressure category with cut points at 80 and 90.
df['BloodPressure_Category'] = pd.cut(
    df['BloodPressure'],
    bins=[-1, 80, 90, np.inf],
    labels=['Normal', 'Elevated', 'Hypertensive'],
)

# BMI category with cut points at 18.5, 24.9 and 29.9.
df['BMI_Category'] = pd.cut(
    df['BMI'],
    bins=[-1, 18.5, 24.9, 29.9, np.inf],
    labels=['Underweight', 'Normal Weight', 'Overweight', 'Obese'],
)

# Ratio features built from glucose, insulin and BMI.
df['Insulin_Sensitivity'] = df['Glucose'] / (df['Insulin'] * df['BMI'])
df['Insulin_Resistance_Index'] = df['Insulin'] * df['Glucose'] / df['BMI']

# Flag (1/0) for skin thickness inside the closed interval [20, 30].
df['Triceps_Skin_Fold_Indicator'] = df['SkinThickness'].between(20, 30).astype('int64')


##  Encoding Categorical Variables

In [None]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """Return ``dataframe`` with the given columns replaced by dummy columns.

    Parameters:
    - dataframe: Pandas DataFrame to encode.
    - categorical_cols: List of column names to one-hot encode.
    - drop_first: Whether to drop the first level of each encoded column
      (default True).

    Returns:
    - A new DataFrame produced by ``pd.get_dummies``.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)

df = one_hot_encoder(df, categorical_cols=['Pregnancy_Category', 'BloodPressure_Category',
                                           'BMI_Category' , 'Age_Group'])

## Standardization

In [None]:
# Standardize the numerical columns with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so held-out rows influence the transform; confirm this is intended.
scaler = StandardScaler()
df[num_list] = scaler.fit(df[num_list]).transform(df[num_list])
df[num_list].head()

##  Splitting the Data

In [None]:
# Separate the target from the predictors, then hold out 30% of the rows
# for testing with a fixed seed.
X = df.drop(columns=["Outcome"])
y = df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=17
)

## Random Forest Classifier

In [None]:
# Fit a random forest on the training split and score it on the held-out rows.
rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
# Printed explicitly: only a cell's last expression is echoed, so the score
# would otherwise be hidden behind df.head(). accuracy_score takes
# (y_true, y_pred) in that order.
print(accuracy_score(y_test, y_pred))

df.head()

## Plotting Feature Importance

In [None]:
def plot_importance(model, features, num=None, save=False):
    """Print and plot the feature importances of a fitted model.

    Parameters:
    - model: Fitted estimator exposing ``feature_importances_``.
    - features: Pandas DataFrame whose columns name the features, in the
      order the model was trained on.
    - num: Number of top features to plot; None (default) plots all of them.
    - save: If True, also write the figure to 'importances.png'.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    # Sort once and reuse the ranking for both the printout and the plot.
    ranked = feature_imp.sort_values(by="Value", ascending=False)
    print(ranked)
    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=ranked[0:num])
    plt.title('Features')
    plt.tight_layout()
    # Save before show(): once the figure has been shown and closed, a later
    # savefig typically writes an empty canvas.
    if save:
        plt.savefig('importances.png')
    plt.show()

plot_importance(rf_model, X)

## Define features(x) and target(y)

In [None]:
# Select the target by name. Engineered and one-hot-encoded columns were
# appended after the data was loaded, so the last column is no longer
# 'Outcome'; slicing by position would use a dummy column as the target and
# leave 'Outcome' among the features.
x = df.drop(columns='Outcome')
y = df['Outcome']

# Column vector of shape (n_samples, 1).
y = y.values.reshape(-1, 1)


## Data scaling

In [None]:
y

In [None]:
# Scale the features to the [0, 1] range.
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
# Use a separate scaler for the target: refitting `scaler` on y would
# overwrite the feature fit, making it unusable for transforming new feature
# rows later.
y_scaler = MinMaxScaler()
y = y_scaler.fit_transform(y)

## Data Splitting

In [None]:
len(x)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1, shuffle=True,random_state=42)

In [None]:
x_train.shape

In [None]:
y_test.shape

## Machine Learning model

## Linear Regression

In [None]:
lr_model=LinearRegression()
lr_model.fit(x_train,y_train)

In [None]:
lr_model.score(x_train,y_train)

In [None]:
lr_pred=lr_model.predict(x_test)

In [None]:
print(mean_squared_error(y_test,lr_pred))

In [None]:
print(r2_score(y_test,lr_pred))

## Lasso Regression

In [None]:
lasso=Lasso(alpha=0.0001)
lasso.fit(x_train,y_train)

In [None]:
lasso.score(x_train,y_train)

In [None]:
lasso_pred= lasso.predict(x_test)

In [None]:
print(mean_squared_error(y_test,lasso_pred))

In [None]:
print(r2_score(y_test,lasso_pred))

## Ridge Regression

In [None]:
ridge=Ridge(alpha=0.001)
ridge.fit(x_train,y_train)

In [None]:
ridge.score(x_train,y_train)

In [None]:
ridge_pred=ridge.predict(x_test)

In [None]:
print(mean_squared_error(y_test,ridge_pred))

In [None]:
print(r2_score(y_test,ridge_pred))

In [None]:
# Fix the seed and the number of random restarts so the cluster assignments
# are reproducible between runs, like the other seeded steps in this notebook.
kmeans = KMeans(n_clusters=4, init='random', n_init=10, random_state=42)
kmeans.fit(df)

In [None]:
pred = kmeans.predict(df)
pred

In [None]:
# Compare the three regressors: predicted value against actual test value.
plt.figure(figsize=(10, 10))

plt.scatter(y_test, lr_pred, c='r', alpha=0.6, label='Linear Regression')
plt.scatter(y_test, lasso_pred, c='g', alpha=0.5, label='Lasso')
plt.scatter(y_test, ridge_pred, c='b', alpha=0.3, label='Ridge')

# Label the axes so the plot can be read on its own.
plt.xlabel('Actual value')
plt.ylabel('Predicted value')
plt.legend()
plt.show()


In [None]:
# `quantitative_data` is not defined anywhere above, so plot the histograms of
# the numeric columns of df instead.
df.select_dtypes(include='number').hist(bins=50, figsize=(20, 15))
plt.show()
# variance threshold