# Data cleaning

In [183]:
import pandas as pd
import seaborn as sns
import numpy as np
from IPython.display import display

In [184]:
data=pd.read_csv("diabetes.csv")

In [185]:
display(data.info(),
        data.describe(),
        data.size,
        data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


None

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


6912

(768, 9)

In [186]:
print("NAN")
nan_counts = data.isna().sum()
print(nan_counts[nan_counts > 0])

print("\nNULL")
null_counts = data.isnull().sum()
print(null_counts[null_counts > 0])

NAN
Series([], dtype: int64)

NULL
Series([], dtype: int64)


In [187]:
for col in data.select_dtypes(include=['int64']).columns:
    data[col]=data[col].astype(np.int8)
for col in data.select_dtypes(include=['float64']).columns:
    data[col]=data[col].astype(np.float32)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int8   
 1   Glucose                   768 non-null    int8   
 2   BloodPressure             768 non-null    int8   
 3   SkinThickness             768 non-null    int8   
 4   Insulin                   768 non-null    int8   
 5   BMI                       768 non-null    float32
 6   DiabetesPedigreeFunction  768 non-null    float32
 7   Age                       768 non-null    int8   
 8   Outcome                   768 non-null    int8   
dtypes: float32(2), int8(7)
memory usage: 11.4 KB


# Data resampling

## Scalling Data

In [188]:
from sklearn.preprocessing import StandardScaler
def scaled_data(data):
    data_copy = data.copy()
    scaler = StandardScaler()
    data_copy[data.columns] = scaler.fit_transform(data_copy[data.columns]) 
    return data_copy
Sdata= scaled_data(data)

## Splitting data

In [189]:
x=Sdata.iloc[:,:-1]
y=data.iloc[:,-1]

In [190]:
y.value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [191]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)

In [192]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(random_state=1)

## Undersampling
- Reduces the majority class by removing samples to balance class distribution.
- Can discard potentially useful information but is efficient for large datasets.

Random Undersampling
- Randomly removes samples from the majority class until a desired balance is achieved. 
- Simple but can discard potentially useful information.

In [193]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
y_resampled.value_counts()

Outcome
0    213
1    213
Name: count, dtype: int64

Tomek Links
- Identifies and removes majority class samples that form Tomek links (pairs of closest instances from different classes). 
- Helps clean decision boundaries.

In [194]:
from imblearn.under_sampling import TomekLinks

# Create a TomekLinks undersampler
tl = TomekLinks(sampling_strategy='auto')
X_resampled, y_resampled = tl.fit_resample(X_train, y_train)
y_resampled.value_counts()

Outcome
0    354
1    213
Name: count, dtype: int64

NearMiss
- Selects samples from the majority class based on distances to minority class instances. 
- Helps retain most informative majority samples.

In [195]:
from imblearn.under_sampling import NearMiss

# NearMiss-1: Selects majority samples whose average distance to 3 closest minority samples is smallest
nm1 = NearMiss(version=1, n_neighbors=3, sampling_strategy='auto')
X_resampled, y_resampled = nm1.fit_resample(X_train, y_train)
y_resampled.value_counts()

Outcome
0    213
1    213
Name: count, dtype: int64

## Oversampling
- Increases the minority class by duplicating or creating synthetic samples. 
- Preserves all majority information but may lead to overfitting.

SMOTE (Synthetic Minority Oversampling Technique)
- Creates synthetic minority samples by interpolating between existing minority instances. 
- Helps address imbalance without simple duplication.

In [196]:
from imblearn.over_sampling import SMOTE

# Create a SMOTE oversampler
smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
y_resampled.value_counts()

Outcome
1    401
0    401
Name: count, dtype: int64

ADASYN (Adaptive Synthetic Sampling)
- Similar to SMOTE but generates more synthetic samples for minority instances that are harder to learn. 
- Focuses on difficult boundary regions.

In [204]:
from imblearn.over_sampling import ADASYN

# Create an ADASYN oversampler
adasyn = ADASYN(sampling_strategy='auto', n_neighbors=5, random_state=42)

X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)
y_resampled.value_counts()

Outcome
0    401
1    392
Name: count, dtype: int64

Borderline-SMOTE
- Focuses on minority samples near the decision boundary to create synthetic samples in harder-to-learn regions. 
- More targeted than standard SMOTE.

In [198]:
from imblearn.over_sampling import BorderlineSMOTE

# Create a Borderline-SMOTE oversampler
b_smote = BorderlineSMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)
X_resampled, y_resampled = b_smote.fit_resample(X_train, y_train)
y_resampled.value_counts()

Outcome
1    401
0    401
Name: count, dtype: int64

## Combination
Apply both oversampling and undersampling sequentially to balance classes while cleaning noisy samples and decision boundaries.

SMOTETomek
- First applies SMOTE oversampling, then cleans the resulting data using Tomek links. 
- Provides balanced data with cleaner boundaries.

In [199]:
from imblearn.combine import SMOTETomek

# Create a SMOTE+Tomek Links combined resampler
smote_tomek = SMOTETomek(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)
y_resampled.value_counts()

Outcome
1    378
0    378
Name: count, dtype: int64

SMOTEENN
- Combines SMOTE oversampling with Edited Nearest Neighbors cleaning. 
- More aggressive cleaning than SMOTETomek, often produces better boundaries.

In [200]:
from imblearn.combine import SMOTEENN

# Create a SMOTE+ENN combined resampler
smote_enn = SMOTEENN(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
y_resampled.value_counts()

Outcome
1    250
0    191
Name: count, dtype: int64

## Cross Validation
- Cross-validation involves partitioning data into subsets, training models on some subsets, and validating on others. 
- This helps assess how well a model generalizes to independent data.

In [201]:
X_train_arr=X_train.values
y_train_arr=y_train.values

Stratified K-Fold Cross-Validation
- Maintains the same class distribution in each fold as in the complete dataset. 
- Essential for imbalanced classification evaluation.

In [202]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import numpy as np


# Create stratified k-fold cross-validator
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Initialize scores list
scores = []

# Perform cross-validation
for train_idx, val_idx in skf.split(X_train_arr, y_train_arr):
    # Split data for this fold
    X_fold_train, X_fold_val = X_train_arr[train_idx], X_train_arr[val_idx]
    y_fold_train, y_fold_val = y_train_arr[train_idx], y_train_arr[val_idx]
    
    # Train model
    model = RandomForestClassifier(random_state=1)
    model.fit(X_fold_train, y_fold_train)
    
    # Evaluate and store score
    score = model.score(X_fold_val, y_fold_val)
    scores.append(score)

# Calculate average performance
mean_score = np.mean(scores)
std_score = np.std(scores)
print(f"Cross-validation accuracy: {mean_score:.4f} ± {std_score:.4f}")

Cross-validation accuracy: 0.7411 ± 0.0237


TimeSeriesSplit
- Respects temporal order in time series data by using past observations for training and future observations for validation. 
- Critical for time-dependent data

In [203]:
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# Create time series cross-validator
tscv = TimeSeriesSplit(n_splits=5)

# Initialize scores list
scores = []

# Perform time series cross-validation
for train_idx, val_idx in tscv.split(X_train_arr):
    # Split data for this fold
    X_fold_train, X_fold_val = X_train_arr[train_idx], X_train_arr[val_idx]
    y_fold_train, y_fold_val = y_train_arr[train_idx], y_train_arr[val_idx]
    
    # Train model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_fold_train, y_fold_train)
    
    # Evaluate and store score
    score = model.score(X_fold_val, y_fold_val)
    scores.append(score)

# Calculate average performance
mean_score = np.mean(scores)
std_score = np.std(scores)
print(f"Time series CV accuracy: {mean_score:.4f} ± {std_score:.4f}")

Time series CV accuracy: 0.7353 ± 0.0206
