In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Location of the diabetes CSV file.
# The original machine-specific path is kept as the default so existing runs
# behave exactly as before; set the DIABETES_CSV environment variable to point
# at the file when running on any other machine.
DIABETES_CSV_PATH = os.environ.get(
    'DIABETES_CSV',
    r'C:\Users\Bithi\Downloads\archive (3)\diabetes.csv',
)
df_diabetes = pd.read_csv(DIABETES_CSV_PATH)

# Predictors are every column except the target; 'Outcome' is the label.
X = df_diabetes.drop(columns='Outcome')
y = df_diabetes['Outcome']

In [2]:
# Shape of the full table. Note that the column count reported below as
# "features" includes the 'Outcome' target column, not only the predictors.
num_records, num_features = df_diabetes.shape
print(f"Number of records: {num_records}")
print(f"Number of features: {num_features}")

# Display the feature names
print("Feature names:")
print(df_diabetes.columns)

# Display the number of records in each category of the target feature 'Outcome'
print("Number of records in each category of 'Outcome':")
print(df_diabetes['Outcome'].value_counts())

# Display information about the dataframe.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Information about the dataframe:")
df_diabetes.info()

# Display the numerical description of the dataframe
print("Numerical description of the dataframe:")
print(df_diabetes.describe())


Number of records: 768
Number of features: 9
Feature names:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
Number of records in each category of 'Outcome':
0    500
1    268
Name: Outcome, dtype: int64
Information about the dataframe:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64

In [3]:
# Per-column count of null (NaN/None) entries in the loaded table.
# NOTE(review): this only detects NaN/None; if the dataset encodes missing
# measurements with a placeholder such as 0, they will not show up here -- confirm.
print("Missing values in each feature:")
missing_values = df_diabetes.isna().sum()
print(missing_values)

Missing values in each feature:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (500 vs 268); consider stratify=y if the reported metrics are to be relied on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest of 100 depth-2 trees using the Gini criterion, seeded for
# repeatable results. fit() returns the estimator itself, so `rf` is the
# fitted model.
rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=2,
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = rf.predict(X_test)


In [5]:
# Side-by-side table of true and predicted labels; the row index is inherited
# from y_test, so each row can be traced back to the original dataframe.
actual_vs_predicted = {'Actual': y_test, 'Predicted': y_pred}
comparison = pd.DataFrame(actual_vs_predicted)
print("Actual vs. Predicted:", comparison, sep="\n")

# Evaluation of the model: per-class precision/recall/F1 ...
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# ... and the raw confusion matrix computed from (y_test, y_pred).
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Actual vs. Predicted:
     Actual  Predicted
668       0          0
324       0          0
624       0          0
690       0          0
473       0          0
..      ...        ...
355       1          1
534       0          0
344       0          0
296       1          0
462       0          0

[154 rows x 2 columns]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.93      0.85        99
           1       0.81      0.55      0.65        55

    accuracy                           0.79       154
   macro avg       0.80      0.74      0.75       154
weighted avg       0.80      0.79      0.78       154

Confusion Matrix:
[[92  7]
 [25 30]]


In [6]:
# A single hand-written record, using the same predictor columns (and column
# order) that the model was trained on.
new_record = pd.DataFrame([
    {
        'Pregnancies': 5,
        'Glucose': 120,
        'BloodPressure': 70,
        'SkinThickness': 30,
        'Insulin': 50,
        'BMI': 25,
        'DiabetesPedigreeFunction': 0.45,
        'Age': 35,
    }
])

# Classify the record with the fitted forest; the result is a one-element array.
new_prediction = rf.predict(new_record)
print("Prediction for a New Unseen Record:", new_prediction, sep="\n")


Prediction for a New Unseen Record:
[0]
