In [1]:
import pandas as pd

# Read the training CSV into a DataFrame and preview its first five rows.
file_path = 'train.csv'
train_data = pd.read_csv(filepath_or_buffer=file_path)

train_data.head(n=5)


Unnamed: 0,p_id,no_times_pregnant,glucose_concentration,blood_pressure,skin_fold_thickness,serum_insulin,bmi,diabetes pedigree,age,diabetes
0,316,2,112,68,22,94,34.1,0.315,26,0
1,25,11,143,94,33,146,36.6,0.254,51,1
2,710,2,93,64,32,160,38.0,0.674,23,1
3,658,1,120,80,48,200,38.9,1.162,41,0
4,542,3,128,72,25,190,32.4,0.549,27,1


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Separate the label from the predictors; the patient ID is excluded from the
# feature matrix as well.
non_feature_columns = ['diabetes', 'p_id']
X = train_data.drop(columns=non_feature_columns)
y = train_data['diabetes']

# Hold out 20% of the rows for validation (fixed seed so the split repeats).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same transform to
# both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train a random forest with default hyperparameters and a fixed seed.
clf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Score the held-out split.
y_pred = clf.predict(X_val_scaled)
report = classification_report(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)

accuracy

0.8048780487804879

In [3]:
# Print the report instead of echoing the bare string: the cell's default
# output is the string's repr, which shows the embedded newlines as literal
# "\n" escapes and collapses the table onto a single line.
print(report)

'              precision    recall  f1-score   support\n\n           0       0.79      0.94      0.86        77\n           1       0.84      0.59      0.69        46\n\n    accuracy                           0.80       123\n   macro avg       0.82      0.76      0.77       123\nweighted avg       0.81      0.80      0.80       123\n'

In [4]:
# Read the test CSV into a DataFrame and preview its first five rows.
test_file_path = 'test.csv'
test_data = pd.read_csv(filepath_or_buffer=test_file_path)

test_data.head(n=5)

Unnamed: 0,p_id,no_times_pregnant,glucose_concentration,blood_pressure,skin_fold_thickness,serum_insulin,bmi,diabetes pedigree,age
0,437,12,140,85,33,0,37.4,0.244,41
1,411,6,102,90,39,0,35.7,0.674,28
2,639,7,97,76,32,91,40.9,0.871,32
3,213,7,179,95,31,0,34.2,0.164,60
4,181,6,87,80,0,0,23.2,0.084,32


In [5]:
# Preparing the test data: select exactly the feature columns used for
# training (this leaves out the patient ID).
# Selecting by the training columns, rather than dropping only `p_id`, keeps
# this cell re-runnable: the `predicted_diabetes` column added to `test_data`
# below would otherwise be passed to the scaler on a second run, and the
# scaler was fitted without that column.
X_test = test_data[X.columns]

# Standardizing the test data with the scaler fitted on the training split
X_test_scaled = scaler.transform(X_test)

# Predicting diabetes for the test data
test_predictions = clf.predict(X_test_scaled)

# Adding predictions to the test dataset for review (overwrites the column if
# this cell has already been run)
test_data['predicted_diabetes'] = test_predictions
test_data.head()


Unnamed: 0,p_id,no_times_pregnant,glucose_concentration,blood_pressure,skin_fold_thickness,serum_insulin,bmi,diabetes pedigree,age,predicted_diabetes
0,437,12,140,85,33,0,37.4,0.244,41,1
1,411,6,102,90,39,0,35.7,0.674,28,0
2,639,7,97,76,32,91,40.9,0.871,32,0
3,213,7,179,95,31,0,34.2,0.164,60,1
4,181,6,87,80,0,0,23.2,0.084,32,0


In [6]:
# Keep only the patient ID and the predicted label for the submission frame.
submission_columns = ['p_id', 'predicted_diabetes']
submission = test_data.loc[:, submission_columns]

submission.head(n=5)

Unnamed: 0,p_id,predicted_diabetes
0,437,1
1,411,0
2,639,0
3,213,1
4,181,0


In [7]:
# Write the submission frame to a CSV file, leaving out the DataFrame index.
submission_file_path = 'diabetes_predictions_submission.csv'
submission.to_csv(path_or_buf=submission_file_path, index=False)

submission_file_path

'diabetes_predictions_submission.csv'