In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer

# --- Load --------------------------------------------------------------
# Raw diabetes dataset, read from the mounted Google Drive folder.
df = pd.read_csv("/content/drive/MyDrive/Diabetes Project ML/diabetes_prediction_dataset.csv")

# --- Encode the string-valued columns ------------------------------------
# Series.map leaves NaN wherever a label is not a key of the mapping dict,
# so any such row is removed by the dropna() that follows.
gender_mapping = {'Male': 0, 'Female': 1}
smoking_history_mapping = {'never': 0, 'No Info': 1, 'current': 2, 'former': 3, 'ever': 4, 'not current': 5}
df = df.assign(
    gender=df['gender'].map(gender_mapping),
    smoking_history=df['smoking_history'].map(smoking_history_mapping),
)

# Keep only fully populated rows.
df = df.dropna()

# --- Features / target ---------------------------------------------------
y = df['diabetes']
x = df.drop(columns=['diabetes'])

# --- Hold-out split: 30 % of the rows go to the test set, fixed seed -----
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=1
)

# --- Preprocessing -------------------------------------------------------
# Both transformers learn their statistics from the training fold only and
# are then applied unchanged to the test fold.
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit(x_train).transform(x_train)
x_test_imputed = imputer.transform(x_test)

scaler = StandardScaler()
x_train_scaled = scaler.fit(x_train_imputed).transform(x_train_imputed)
x_test_scaled = scaler.transform(x_test_imputed)

# --- Model: entropy-criterion decision tree ------------------------------
model = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(x_train_scaled, y_train)

# --- Evaluate on the held-out rows ---------------------------------------
predictions = model.predict(x_test_scaled)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.953525587597933


In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# --- Load --------------------------------------------------------------
# Raw diabetes dataset, read from the mounted Google Drive folder.
df = pd.read_csv("/content/drive/MyDrive/Diabetes Project ML/diabetes_prediction_dataset.csv")

# --- Encode the string-valued columns ------------------------------------
# Series.map leaves NaN wherever a label is not a key of the mapping dict,
# so any such row is removed by the dropna() that follows.
gender_mapping = {'Male': 0, 'Female': 1}
smoking_history_mapping = {'never': 0, 'No Info': 1, 'current': 2, 'former': 3, 'ever': 4, 'not current': 5}
df = df.assign(
    gender=df['gender'].map(gender_mapping),
    smoking_history=df['smoking_history'].map(smoking_history_mapping),
)

# Keep only fully populated rows.
df = df.dropna()

# --- Features / target ---------------------------------------------------
y = df['diabetes']
x = df.drop(columns=['diabetes'])

# --- Hold-out split: 30 % of the rows go to the test set, fixed seed -----
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=1
)

# --- Preprocessing -------------------------------------------------------
# Both transformers learn their statistics from the training fold only and
# are then applied unchanged to the test fold.
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit(x_train).transform(x_train)
x_test_imputed = imputer.transform(x_test)

scaler = StandardScaler()
x_train_scaled = scaler.fit(x_train_imputed).transform(x_train_imputed)
x_test_scaled = scaler.transform(x_test_imputed)

# --- Model: random forest with 100 trees, fixed seed ---------------------
rf_model = RandomForestClassifier(n_estimators=100, random_state=0).fit(x_train_scaled, y_train)

# --- Evaluate on the held-out rows ---------------------------------------
predictions_rf = rf_model.predict(x_test_scaled)
accuracy_rf = accuracy_score(y_test, predictions_rf)
print("Random Forest Accuracy:", accuracy_rf)


Random Forest Accuracy: 0.9708618103017169


In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.impute import SimpleImputer

# --- Load --------------------------------------------------------------
# Raw diabetes dataset, read from the mounted Google Drive folder.
df = pd.read_csv("/content/drive/MyDrive/Diabetes Project ML/diabetes_prediction_dataset.csv")

# --- Encode the string-valued columns ------------------------------------
# Series.map leaves NaN wherever a label is not a key of the mapping dict,
# so any such row is removed by the dropna() that follows.
gender_mapping = {'Male': 0, 'Female': 1}
smoking_history_mapping = {'never': 0, 'No Info': 1, 'current': 2, 'former': 3, 'ever': 4, 'not current': 5}
df = df.assign(
    gender=df['gender'].map(gender_mapping),
    smoking_history=df['smoking_history'].map(smoking_history_mapping),
)

# Keep only fully populated rows.
df = df.dropna()

# --- Features / target ---------------------------------------------------
y = df['diabetes']
x = df.drop(columns=['diabetes'])

# --- Hold-out split: 30 % of the rows go to the test set, fixed seed -----
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=1
)

# --- Preprocessing -------------------------------------------------------
# Both transformers learn their statistics from the training fold only and
# are then applied unchanged to the test fold.
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit(x_train).transform(x_train)
x_test_imputed = imputer.transform(x_test)

scaler = StandardScaler()
x_train_scaled = scaler.fit(x_train_imputed).transform(x_train_imputed)
x_test_scaled = scaler.transform(x_test_imputed)

# --- Model: XGBoost classifier with a binary logistic objective ----------
xgb_params = {'objective': 'binary:logistic', 'random_state': 0}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(x_train_scaled, y_train)

# --- Evaluate on the held-out rows ---------------------------------------
predictions_xgb = xgb_model.predict(x_test_scaled)
accuracy_xgb = accuracy_score(y_test, predictions_xgb)
print("XGBoost Accuracy:", accuracy_xgb)


XGBoost Accuracy: 0.9715285880980163


In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
import joblib

# Train an AdaBoost classifier (decision stumps) on the diabetes dataset,
# report hold-out accuracy and persist the fitted classifier to Google Drive.

# Read the CSV file
df = pd.read_csv("/content/drive/MyDrive/Diabetes Project ML/diabetes_prediction_dataset.csv")

# Replace 'Male' with 0 and 'Female' with 1 in the 'gender' column.
# Series.map yields NaN for any label that is not a key of the dict; those
# rows are removed by the dropna() below.
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})

# Mapping string values in 'smoking_history' column to numerical values
smoking_history_mapping = {'never': 0, 'No Info': 1, 'current': 2, 'former': 3, 'ever': 4, 'not current': 5}
df['smoking_history'] = df['smoking_history'].map(smoking_history_mapping)

# Drop rows with missing values (reassign instead of mutating in place)
df = df.dropna()

# Separate features (x) and target (y)
x = df.drop(columns=['diabetes'])
y = df['diabetes']

# Split the data into training and testing sets (30 % held out, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=1)

# Impute missing values with the mean of each column.
# The imputer is fitted on the training fold only and reused for the test fold.
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit_transform(x_train)
x_test_imputed = imputer.transform(x_test)

# Standardize the features (statistics again come from the training fold only)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_imputed)
x_test_scaled = scaler.transform(x_test_imputed)

# Initialize base estimator (Decision Tree of depth 1, i.e. a stump)
base_estimator = DecisionTreeClassifier(max_depth=1, random_state=0)

# Initialize AdaBoost classifier.
# scikit-learn 1.2 renamed the `base_estimator` keyword to `estimator` and
# 1.4 removed the old name, so try the current keyword first and fall back to
# the legacy one only when the installed release does not know `estimator`.
try:
    ada_model = AdaBoostClassifier(estimator=base_estimator, n_estimators=50, random_state=0)
except TypeError:
    ada_model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=50, random_state=0)

# Fit the training dataset into the AdaBoost model
ada_model.fit(x_train_scaled, y_train)

# Make predictions
predictions_ada = ada_model.predict(x_test_scaled)

# Calculate accuracy
accuracy_ada = accuracy_score(y_test, predictions_ada)
print("AdaBoost Accuracy:", accuracy_ada)

# Save the trained model.
# NOTE: only the classifier is persisted. It was fitted on imputed and
# standardized features, and `imputer` / `scaler` are not written here, so any
# consumer of this file must apply equivalent preprocessing before predict().
filename = '/content/drive/MyDrive/Diabetes Project ML/finalizedModel.sav'
joblib.dump(ada_model, filename)






AdaBoost Accuracy: 0.9720620103350558


In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
import joblib

# Train an AdaBoost classifier (decision stumps) on the diabetes dataset,
# persist the fitted classifier to Google Drive and report hold-out accuracy.

# Read the CSV file
df = pd.read_csv("/content/drive/MyDrive/Diabetes Project ML/diabetes_prediction_dataset.csv")

# Replace 'Male' with 0 and 'Female' with 1 in the 'gender' column.
# Series.map yields NaN for any label that is not a key of the dict; those
# rows are removed by the dropna() below.
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})

# Mapping string values in 'smoking_history' column to numerical values
smoking_history_mapping = {'never': 0, 'No Info': 1, 'current': 2, 'former': 3, 'ever': 4, 'not current': 5}
df['smoking_history'] = df['smoking_history'].map(smoking_history_mapping)

# Drop rows with missing values (reassign instead of mutating in place)
df = df.dropna()

# Separate features (x) and target (y)
x = df.drop(columns=['diabetes'])
y = df['diabetes']

# Split the data into training and testing sets (30 % held out, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=1)

# Impute missing values with the mean of each column.
# The imputer is fitted on the training fold only and reused for the test fold.
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit_transform(x_train)
x_test_imputed = imputer.transform(x_test)

# Standardize the features (statistics again come from the training fold only)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_imputed)
x_test_scaled = scaler.transform(x_test_imputed)

# Initialize base estimator (Decision Tree of depth 1, i.e. a stump)
base_estimator = DecisionTreeClassifier(max_depth=1, random_state=0)

# Initialize AdaBoost classifier.
# scikit-learn 1.2 renamed the `base_estimator` keyword to `estimator` and
# 1.4 removed the old name, so try the current keyword first and fall back to
# the legacy one only when the installed release does not know `estimator`.
try:
    ada_model = AdaBoostClassifier(estimator=base_estimator, n_estimators=50, random_state=0)
except TypeError:
    ada_model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=50, random_state=0)

# Fit the training dataset into the AdaBoost model
ada_model.fit(x_train_scaled, y_train)

# Save the trained model.
# NOTE: only the classifier is persisted. It was fitted on imputed and
# standardized features, and `imputer` / `scaler` are not written here, so any
# consumer of this file must apply equivalent preprocessing before predict().
joblib.dump(ada_model, '/content/drive/MyDrive/Diabetes Project ML/ada_model.sav')

# Make predictions
predictions_ada = ada_model.predict(x_test_scaled)

# Calculate accuracy
accuracy_ada = accuracy_score(y_test, predictions_ada)
print("AdaBoost Accuracy:", accuracy_ada)




AdaBoost Accuracy: 0.9720620103350558


In [1]:
# Mount Google Drive so that the dataset and model paths under /content/drive
# resolve. The "In [1]" execution count shows this cell was executed first in
# the session even though it is positioned at the end of the notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive
