# Import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import warnings
warnings.simplefilter('ignore')

# Download Dataset

# DATA PREPROCESSING

#### Data Preprocessing merupakan proses memahami informasi dalam data dan menentukan kualitas dari data tersebut.



In [2]:
# Load the raw dataset (CSV located one directory above the notebook) into a DataFrame.
df = pd.read_csv('../diabetes_prediction_dataset_raw.csv')

# Summarize column names, dtypes, and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


### Pengecekan data yang terindikasi duplikat

In [3]:
# Count the rows that exactly repeat an earlier row.
df.duplicated().sum()


np.int64(3854)

In [4]:
# Remove exact duplicate rows in place (the first occurrence of each is kept).
df.drop_duplicates(inplace=True)

# Verify the de-duplication. The count used to be computed mid-cell and
# discarded (only a cell's last expression is displayed), so it checked nothing.
assert df.duplicated().sum() == 0, "duplicate rows remain after drop_duplicates"

# Category counts for smoking_history. 'No Info' accounts for roughly a third
# of the remaining rows, so the column is dropped rather than imputed.
df['smoking_history'].value_counts()


smoking_history
never          34398
No Info        32887
former          9299
current         9197
not current     6367
ever            3998
Name: count, dtype: int64

#### Melakukan Drop Column pada kolom smoking_history

In [5]:
# Preprocessing: remove the smoking_history column from df in place.
df.drop(columns=['smoking_history'],inplace=True)


In [6]:
# Display the frame after de-duplication and removal of smoking_history.
df

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,25.19,6.6,140,0
1,Female,54.0,0,0,27.32,6.6,80,0
2,Male,28.0,0,0,27.32,5.7,158,0
3,Female,36.0,0,0,23.45,5.0,155,0
4,Male,76.0,1,1,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...
99994,Female,36.0,0,0,24.60,4.8,145,0
99996,Female,2.0,0,0,17.37,6.5,100,0
99997,Male,66.0,0,0,27.83,5.7,155,0
99998,Female,24.0,0,0,35.42,4.0,100,0


In [7]:
# Re-check the schema and non-null counts after the cleaning steps above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 96146 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               96146 non-null  object 
 1   age                  96146 non-null  float64
 2   hypertension         96146 non-null  int64  
 3   heart_disease        96146 non-null  int64  
 4   bmi                  96146 non-null  float64
 5   HbA1c_level          96146 non-null  float64
 6   blood_glucose_level  96146 non-null  int64  
 7   diabetes             96146 non-null  int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 6.6+ MB


# EDA

Exploratory data analysis merupakan proses investigasi awal pada data untuk menganalisis karakteristik, menemukan pola, anomali, dan memeriksa asumsi pada data. Teknik ini biasanya menggunakan bantuan statistik dan representasi grafis atau visualisasi.

In [8]:
# Column groups: `gender` is the only string-typed column; the rest are numeric.
# NOTE(review): `num_features` also lists the target `diabetes` and what appear to be
# 0/1 indicator columns (hypertension, heart_disease) — confirm that cells using this
# list expect them.
cat_features = ['gender']
num_features = ['age','hypertension','heart_disease','bmi','HbA1c_level','blood_glucose_level','diabetes']

In [9]:
# Replace the string labels in `gender` with integer codes so every column is numeric.
# The fitted encoder is kept in `encoder` so the code-to-label mapping stays recoverable.
encoder = LabelEncoder()
df['gender'] = encoder.fit_transform(df['gender'])

#### Proses untuk mendeteksi dan menghapus outlier (data pencilan) dari fitur numerik.

In [10]:
# All numeric columns (kept for backward compatibility with later cells).
numeric_columns = df.select_dtypes(include=['number']).columns

# Only continuous measurements are screened for outliers. The IQR rule must not
# be applied to 0/1 flags, the encoded `gender`, or the target `diabetes`: for a
# 0/1 column where at most a quarter of the rows are 1, Q1 == Q3 == 0, the IQR
# is 0, and every row holding a 1 is flagged. That would remove every positive
# case from `df_clean`.
continuous_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# Quartiles and interquartile range per numeric column.
Q1 = df[numeric_columns].quantile(0.25)
Q3 = df[numeric_columns].quantile(0.75)
IQR = Q3 - Q1

# Tukey fences (1.5 * IQR beyond the quartiles), restricted to the continuous columns.
lower_fence = (Q1 - 1.5 * IQR)[continuous_columns]
upper_fence = (Q3 + 1.5 * IQR)[continuous_columns]

# A row is an outlier if any continuous value lies outside its column's fences.
continuous_values = df[continuous_columns]
is_outlier = ((continuous_values < lower_fence) | (continuous_values > upper_fence)).any(axis=1)

# Filtered copy; `df` itself is left untouched.
df_clean = df[~is_outlier]


## Split dataset

#### Pembagian dataset menjadi 80% digunakan untuk training model dan 20% untuk mengevaluasi model.

In [11]:
# `train_test_split` is already imported in the notebook's import cell, so it is
# not re-imported here.

# Separate the predictors (X) from the target label (y).
# NOTE(review): this splits `df`, not the outlier-filtered `df_clean` built in the
# previous cell — confirm that is intended.
X = df.drop(columns=["diabetes"])
y = df["diabetes"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the shape of each partition.
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (76916, 7)
Testing set size: (19230, 7)


## Perubahan dataset ke csv masing-masing

In [12]:
# Re-attach the target column to the training features and write them to one CSV
# (row index omitted).
combined_df = pd.concat([X_train, y_train], axis=1)
combined_df.to_csv("diabetes_prediction_dataset_train.csv", index=False)

In [13]:
# Re-attach the target column to the test features and write them to the test CSV
# (row index omitted).
# NOTE(review): despite its name, `combined_df_train` holds the TEST split
# (X_test / y_test); consider renaming it to `combined_df_test`.
combined_df_train = pd.concat([X_test, y_test], axis=1)
combined_df_train.to_csv("diabetes_prediction_dataset_test.csv", index=False)

In [14]:
# ubah y ke dataframe
new_y_train = pd.DataFrame(y_train, columns=["diabetes"])
new_y_test = pd.DataFrame(y_test, columns=["diabetes"])

print(X_train.shape, X_test.shape)
print(new_y_train.shape, new_y_test.shape)

(76916, 7) (19230, 7)
(76916, 1) (19230, 1)


In [15]:
# Display the test labels in their DataFrame form.
new_y_test


Unnamed: 0,diabetes
2547,0
34774,0
71084,1
50584,0
80788,0
...,...
19314,0
17624,0
8673,0
37062,0
