# Feature Engineering

# Handling Missing Values

# Data Standardization

In [None]:
# Data manipulation and numerical computing libraries
import numpy as np          # NumPy: fundamental package for scientific computing with Python
import pandas as pd         # Pandas: data manipulation and analysis library for structured data

# Machine learning library and utilities
import sklearn.datasets     # Scikit-learn datasets module: provides access to built-in datasets
from sklearn.preprocessing import StandardScaler  # StandardScaler: standardizes features by removing mean and scaling to unit variance
from sklearn.model_selection import train_test_split  # train_test_split: splits datasets into training and testing subsets

In [None]:
# Load scikit-learn's bundled breast cancer dataset.
# The returned object exposes the feature matrix as `.data`, the labels as
# `.target` and the column names as `.feature_names`.
# Binary classification problem with 569 samples and 30 features.
# Target encoding: 0 = malignant, 1 = benign
dataset = sklearn.datasets.load_breast_cancer()

In [None]:
# Dump the raw dataset object (feature array, target array, ...) for a first look
print(dataset)

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
 

In [None]:
# Wrap the raw feature matrix (569 x 30) in a pandas DataFrame so that each
# of the 30 columns can be addressed by its descriptive feature name.
df = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
)

In [None]:
# Preview the top five rows as a sanity check that the data loaded
# correctly and to get a feel for the feature values
df.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
# Report the DataFrame dimensions as (rows, columns)
# Expected output: (569, 30), i.e. 569 samples with 30 features each
df.shape

(569, 30)

In [None]:
# Define the model inputs and outputs:
#   X - feature matrix: the full 30-column DataFrame (independent variables)
#   Y - target vector: binary labels (dependent variable),
#       0 = malignant, 1 = benign
X, Y = df, dataset.target

In [None]:
# Show the feature matrix (pandas truncates long/wide frames in the printout)
print(X)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

In [None]:
# Show the target labels (array of 0s and 1s, one entry per sample)
print(Y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 0 1 0 1 1 0 

Splitting the data into Training Data and Test data

In [None]:
# Hold out 20% of the samples for testing and keep the remaining 80% for training.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Sanity-check the split sizes: full set first, then training part, then test part
# Expected: (569, 30) (455, 30) (114, 30)
print(*(part.shape for part in (X, X_train, X_test)))

(569, 30) (455, 30) (114, 30)


Standardising the data

In [None]:
# Measure the spread of the raw (unscaled) values in the original dataset.
# NOTE: .std() without an axis argument collapses the whole feature matrix
# into ONE number, so this is the standard deviation of all values pooled
# together, not a per-feature figure (use .std(axis=0) for per-feature values).
# A large pooled value hints that the features live on very different scales,
# which is the motivation for standardizing them.
print(dataset.data.std())

228.29740508276657


In [None]:
# Create the StandardScaler used for feature standardization.
# StandardScaler rescales each feature to mean=0 and standard deviation=1,
# which keeps features with larger numeric ranges from dominating the model.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training data only.
# fit() learns the per-feature mean and standard deviation from X_train;
# it does not modify the data itself.
# Important: the test data is deliberately left out to avoid data leakage.
scaler.fit(X_train)

In [None]:
# Standardize the training data with the statistics learned by fit().
# Each value becomes (value - mean) / std of its own feature column,
# so every training feature ends up with mean=0 and std=1.
X_train_standardised = scaler.transform(X_train)

In [None]:
# Display the standardized training data (a NumPy array).
# Values should now be centered around 0, with most of them between -3 and +3.
print(X_train_standardised)

[[ 1.40381088  1.79283426  1.37960065 ...  1.044121    0.52295995
   0.64990763]
 [ 1.16565505 -0.14461158  1.07121375 ...  0.5940779   0.44153782
  -0.85281516]
 [-0.0307278  -0.77271123 -0.09822185 ... -0.64047556 -0.31161687
  -0.69292805]
 ...
 [ 1.06478904  0.20084323  0.89267396 ...  0.01694621  3.06583565
  -1.29952679]
 [ 1.51308238  2.3170559   1.67987211 ...  1.14728703 -0.16599653
   0.82816016]
 [-0.73678981 -1.02636686 -0.74380549 ... -0.31826862 -0.40713129
  -0.38233653]]


In [None]:
# Standardize the test data with the scaler that was fitted on the training data.
# Critical: only transform() is called here (no refitting), so the test set is
# scaled with the SAME training means and standard deviations.
# This keeps the scaling consistent and prevents data leakage.
X_test_standardised = scaler.transform(X_test)

In [None]:
# Verify that standardization worked on the training data.
# .std() without an axis returns ONE pooled value over the whole array;
# because every column was scaled to mean 0 / std 1, that value is ≈ 1.0.
# (Use .std(axis=0) to inspect each feature separately.)
print(X_train_standardised.std())

1.0


In [None]:
# Check the pooled standard deviation of the transformed test data
# (a single value over the whole array, as no axis is given).
# It need not be exactly 1.0 because the scaler was fitted on the training
# data only; deviations from 1.0 are normal and expected.
print(X_test_standardised.std())

0.8654541077212674


# Label Encoding
Label Encoding:




*   Converting the labels into numeric form

In [None]:
# Importing the dependencies

# Import the pandas library and assign it the alias 'pd'.
# Pandas is a powerful data manipulation and analysis library, essential for working with structured data like CSV files.
import pandas as pd

# From the scikit-learn library, import the LabelEncoder class.
# Scikit-learn is a popular machine learning library.
# LabelEncoder is used to convert categorical text data into a numerical format that machine learning models can understand.
from sklearn.preprocessing import LabelEncoder

Label encoding of Breast Cancer Dataset

In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]

# Import the kagglehub package and the dataset adapter selector
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file to load from within the Kaggle dataset
file_path = "breast-cancer.csv"

# Load the dataset from Kaggle using kagglehub.
# kagglehub.dataset_load() is the replacement for kagglehub.load_dataset(),
# which is deprecated and emits a DeprecationWarning on every call.
# - KaggleDatasetAdapter.PANDAS returns the data as a pandas DataFrame
# - "yasserh/breast-cancer-dataset" is the Kaggle dataset slug
# - file_path selects the particular file to load from that dataset
cancer_data = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "yasserh/breast-cancer-dataset",
    file_path,
    # Optional: provide additional arguments like sql_query or pandas_kwargs
    # For more customization, see:
    # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

# Display the first 5 rows of the dataset to confirm successful loading.
# The label goes on its own line (sep="\n") so that the DataFrame's column
# headers stay aligned with the values beneath them.
print("First 5 records:", cancer_data.head(), sep="\n")


  cancer_data = kagglehub.load_dataset(


First 5 records:          id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_worst  perimete

In [None]:
# Tally how often each distinct value appears in the 'diagnosis' column,
# i.e. how many benign (B) and malignant (M) cases the dataset contains
cancer_data.diagnosis.value_counts()

Unnamed: 0_level_0,count
diagnosis,Unnamed: 1_level_1
B,357
M,212


In [None]:
# Create a LabelEncoder instance; it will be used to convert the categorical
# diagnosis labels into numeric form
label_encode = LabelEncoder()  # not fitted yet - fitting happens in fit_transform()

In [None]:
# Fit the encoder on the 'diagnosis' column and encode it in one step:
# the categorical labels ('M' and 'B') become numeric values (e.g., 1 and 0)
labels = label_encode.fit_transform(cancer_data['diagnosis'])

In [None]:
# Store the encoded diagnosis labels in a new 'target' column of cancer_data.
# The original 'diagnosis' column is kept; 'target' holds the numeric form
# (0 / 1) that machine learning models can consume instead of 'M' / 'B'.
cancer_data['target'] = labels

In [None]:
# Preview the top five rows to confirm the new 'target' column sits alongside the original data
cancer_data.head(n=5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,target
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


0 --> Benign


1 --> Malignant

In [None]:
# Tally the encoded labels in the 'target' column (0 = benign, 1 = malignant);
# the counts should match the B / M counts of the 'diagnosis' column
cancer_data.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,357
1,212


# Train Test Split

In [None]:
# Import necessary libraries
import numpy as np  # Used for numerical operations, especially with arrays
import pandas as pd # Used for data manipulation and analysis, particularly with DataFrames

# Import modules for data preprocessing and model selection
from sklearn.preprocessing import StandardScaler # Used for standardizing features by removing the mean and scaling to unit variance
from sklearn.model_selection import train_test_split # Used for splitting data into training and testing sets

# Import the Support Vector Machine (SVM) classifier
from sklearn import svm # Used for creating and training Support Vector Machine models

# Import metrics for evaluating model performance
from sklearn.metrics import accuracy_score # Used to calculate the accuracy of the model's predictions

Data Collection and Analysis


PIMA Diabetes Dataset

In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file to load from within the Kaggle dataset
file_path = "diabetes.csv"

# Load the latest version of the PIMA diabetes dataset as a pandas DataFrame.
# kagglehub.dataset_load() is the replacement for kagglehub.load_dataset(),
# which is deprecated and emits a DeprecationWarning on every call.
diabetes_dataset = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "uciml/pima-indians-diabetes-database",
    file_path,
    # Provide any additional arguments like
    # sql_query or pandas_kwargs. See the
    # documentation for more information:
    # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

# Keep a DataFrame handle named `df` for the inspection cells that follow
df = pd.DataFrame(diabetes_dataset)
# print("First 5 records:", df.head())

  diabetes_dataset = kagglehub.load_dataset(


In [None]:
# Preview the top five rows for a quick first look at
# the structure and content of the data
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Report the dimensions of the diabetes_dataset DataFrame.
# `shape` is a (rows, columns) tuple; here it is (768, 9).
diabetes_dataset.shape

(768, 9)

In [None]:
# Generate descriptive statistics for every numerical column of diabetes_dataset.
# The summary covers central tendency, dispersion and the shape of each distribution:
# - count: number of non-null observations
# - mean:  average of the values
# - std:   standard deviation (spread of the data)
# - min:   minimum value
# - 25%:   first quartile (25th percentile)
# - 50%:   median (50th percentile / second quartile)
# - 75%:   third quartile (75th percentile)
# - max:   maximum value
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Tally the distinct values of the 'Outcome' column (the target variable:
# 0 for non-diabetic, 1 for diabetic). value_counts() returns a Series of
# counts per unique value, which shows at a glance whether the classes are
# balanced or imbalanced.
diabetes_dataset.Outcome.value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


0 --> Non-Diabetic

1 --> Diabetic

In [None]:
# Split the rows by their 'Outcome' value and average every remaining
# numerical column within each group. The result contrasts the mean feature
# values of Outcome 0 (non-diabetic) with those of Outcome 1 (diabetic),
# hinting at which features might be more indicative of diabetes.
diabetes_dataset.groupby(by='Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [None]:
# Separate the features (X) from the target variable (Y).

# X: every column of 'diabetes_dataset' except 'Outcome'.
# drop(columns=...) already identifies the labels as column names, so no
# axis argument is needed. X is the input to the machine learning model.
X = diabetes_dataset.drop(columns='Outcome')

# Y: only the 'Outcome' column, i.e. the labels (diabetic or not)
# that the machine learning model will try to predict.
Y = diabetes_dataset.Outcome

In [None]:
# Show the feature columns (everything except 'Outcome')
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [None]:
# Show the target labels (the 'Outcome' Series)
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


Data Standardisation

In [None]:
# Create a fresh StandardScaler instance (this rebinds the name `scaler`).
# It standardizes features by removing the mean and scaling to unit
# variance, a common preprocessing step for many machine learning algorithms.
scaler = StandardScaler()

In [None]:
# Fit the StandardScaler to the feature data (X).
# fit() computes the mean and standard deviation of every feature (column)
# in X and stores them inside the 'scaler' object for the later transform().
# CAUTION: X is the ENTIRE dataset here and the train/test split only happens
# afterwards, so test-set statistics leak into the scaling. This is acceptable
# for a demonstration; in a real project, split first and fit the scaler on
# the training data only.
scaler.fit(X)

In [None]:
# Standardize the feature data (X) with the mean and standard deviation
# that the 'scaler' object learned during the 'fit()' step.
# Afterwards every feature has a mean of 0 and a standard deviation of 1.
# The result, 'standardized_data', is a NumPy array of the scaled features.
standardized_data = scaler.transform(X)

In [None]:
# Inspect the scaled feature array
print(standardized_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [None]:
# Rebind the modelling variables:
#   X - now the scaled feature array (mean=0, std=1) from 'standardized_data',
#       ready for model training
#   Y - the 'Outcome' column again, i.e. the original, unscaled labels
#       (0 for non-diabetic, 1 for diabetic); targets are not scaled
X, Y = standardized_data, diabetes_dataset['Outcome']

In [None]:
# Show the scaled features followed by the labels, each starting on its own line
print(X, Y, sep="\n")

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


SPLITTING THE DATA INTO TRAINING DATA & TEST DATA

In [None]:
# Divide the data into a training part and a testing part.
#   X              - features (independent variables)
#   Y              - labels / target (dependent variable)
#   test_size=0.2  - 20% of the rows go to the test set, 80% to the training set
#   random_state=2 - fixes the shuffle so the split is the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Print the (rows, columns) shape of, in order:
#   X       - the full feature set
#   X_train - the training features (80% of the data)
#   X_test  - the testing features (20% of the data)
print(*(part.shape for part in (X, X_train, X_test)))

(768, 8) (614, 8) (154, 8)


# Handling Imbalanced dataset

Imbalanced Dataset:
A dataset with an unequal class distribution

In [None]:
# Importing the dependencies

# NumPy is used for numerical operations and handling arrays
import numpy as np

# pandas is used for data manipulation and analysis (especially with tabular data)
import pandas as pd


In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]

# Importing the required module from kagglehub
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file to load from the Kaggle dataset
file_path = "creditcard.csv"

# Load the dataset through kagglehub with the pandas adapter.
# kagglehub.dataset_load() is the replacement for kagglehub.load_dataset(),
# which is deprecated and emits a DeprecationWarning on every call.
credit_card_data = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,       # Adapter to load the data as a pandas DataFrame
    "mlg-ulb/creditcardfraud",         # Kaggle dataset path (owner/dataset-name)
    file_path,                         # Name of the CSV file within the dataset
    # Additional arguments like sql_query or pandas_kwargs can be passed if needed
)

# Keep a short DataFrame handle named `ccd` for the cells that follow
ccd = pd.DataFrame(credit_card_data)

# Optional: Print the first 5 rows to verify data loading
# print("First 5 records:", ccd.head())


  credit_card_data = kagglehub.load_dataset(


In [None]:
# Preview the top five rows of the credit card dataset for an overview of
# its structure, column names and sample values
ccd.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# Preview the bottom five rows of the credit card dataset to check the
# structure and values at the end of the DataFrame
ccd.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [None]:
# Count how many transactions fall into each class.
# 'Class' column: 0 marks a normal transaction, 1 marks a fraudulent one
ccd.Class.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


This is a highly imbalanced dataset


0 --> Legit Transactions


1 --> Fraudulent Transaction

In [None]:
# Split the transactions into two DataFrames by class label

# Rows whose Class is 0: legitimate (non-fraudulent) transactions
legit = ccd.loc[ccd['Class'] == 0]

# Rows whose Class is 1: fraudulent transactions
fraud = ccd.loc[ccd['Class'] == 1]


In [None]:
# Print the (rows, columns) shape of each subset, one per line:
#   first line  - legitimate transactions
#   second line - fraudulent transactions
print(legit.shape, fraud.shape, sep="\n")


(284315, 31)
(492, 31)


Under-Sampling

Build a sample dataset containing a similar number of legitimate and fraudulent transactions

Number of Fraudulent transactions --> 492

In [None]:
# Under-sample the majority class: draw as many legitimate transactions at
# random as there are fraudulent ones (492 in this dataset), so that both
# classes end up the same size in the balanced training set.
# - n=len(fraud) derives the sample size from the data instead of hard-coding it
# - random_state fixes the random draw so the balanced dataset is reproducible
legit_sample = legit.sample(n=len(fraud), random_state=1)

In [None]:
# Print the shape of the sampled legitimate transactions.
# It should be (492, 31): as many rows as there are fraud cases.
print(legit_sample.shape)

(492, 31)


Concatenate the Two Dataframes

In [None]:
# Stack the sampled legitimate transactions on top of all fraudulent ones.
# Concatenating along the index (axis 0) appends rows, not columns.
# The result is a balanced dataset with equally many legit and fraud rows.
new_dataset = pd.concat(objs=[legit_sample, fraud], axis="index")

In [None]:
# Preview the top five rows of the balanced dataset (these come from the legit sample)
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
56584,47494.0,1.133395,0.851144,0.040117,2.632105,0.334482,-0.365822,0.219204,-0.002403,-0.831407,...,-0.221097,-0.66922,0.07596,-0.081205,0.305565,-0.130429,0.016176,0.051531,7.6,0
114049,73312.0,0.924869,-1.348931,1.106491,-0.337189,-1.540627,0.419971,-1.006455,0.22246,-0.315256,...,0.205882,0.445216,-0.1078,0.289704,0.22701,-0.265287,0.042672,0.044393,173.66,0
105734,69663.0,-0.43805,0.892205,0.708236,-0.036183,-0.310063,-0.38805,0.310806,0.435376,-0.781458,...,0.101397,0.046972,0.114492,0.009897,-0.385494,0.155793,-0.120602,-0.034398,51.5,0
85343,60733.0,-1.106538,0.873061,1.232502,-1.398614,-0.217888,-0.647132,0.627004,-0.064404,0.986856,...,-0.260829,-0.272806,-0.02034,0.083117,-0.235689,0.744166,0.376375,0.030825,38.42,0
246433,153172.0,-0.465614,0.765416,-0.650185,-0.42834,1.159533,-0.715584,1.486294,-0.69902,0.36131,...,0.097363,1.132984,-0.172297,-0.537649,-0.24293,-0.191516,0.011596,-0.225874,69.3,0


In [None]:
# Preview the bottom five rows of the balanced dataset (these come from the fraud rows)
new_dataset.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.88285,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.29268,0.147968,390.0,1
280143,169347.0,1.378559,1.289381,-5.004247,1.41185,0.442581,-1.326536,-1.41317,0.248525,-1.127396,...,0.370612,0.028234,-0.14564,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
280149,169351.0,-0.676143,1.126366,-2.2137,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.65225,...,0.751826,0.834108,0.190944,0.03207,-0.739695,0.471111,0.385107,0.194361,77.89,1
281144,169966.0,-3.113832,0.585864,-5.39973,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.2537,245.0,1
281674,170348.0,1.991976,0.158476,-2.583441,0.40867,1.151147,-0.096695,0.22305,-0.068384,0.577829,...,-0.16435,-0.295135,-0.072173,-0.450261,0.313267,-0.289617,0.002988,-0.015309,42.53,1


In [None]:
# Confirm the class balance of the new dataset:
# 492 legitimate (0) and 492 fraudulent (1) transactions are expected
new_dataset.Class.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,492
1,492


# Feature extraction of Text data using Tf-idf Vectorizer

About the Dataset:


1.   id: Unique id for a news article
2.   title: the title of a news article

3.   author: author of the news article
4.   text: the text of the article; could be incomplete

5.   Label: a label that marks whether the news article is real or fake



          1: Fake News
          0: Real News




In [None]:
# Importing the dependencies

# NumPy: Used for numerical operations and working with arrays
import numpy as np

# pandas: Used for data manipulation and analysis (especially for structured/tabular data)
import pandas as pd

# re: Python's built-in module for working with regular expressions (useful for text cleaning)
import re

# stopwords: Common words (like "the", "and", "is") that are usually removed in text preprocessing
from nltk.corpus import stopwords

# PorterStemmer: Reduces words to their root form (e.g., "running" -> "run")
from nltk.stem.porter import PorterStemmer

# TfidfVectorizer: Converts a collection of raw documents into a matrix of TF-IDF features
# Useful for transforming text data into numerical format for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Import the nltk library
import nltk

# Download the NLTK stopword lists (common words removed during text preprocessing)
# The value returned by the call (True in the recorded run) is shown as the cell output
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Install gdown if not already installed
# !pip install -U gdown

# gdown: a Python tool for downloading files from Google Drive
import gdown

# ID of the shared Google Drive file that holds the dataset
file_id = "1NfymoNrkpfWuZhpwxzoNk4bhVVlU5qiN"

# Direct-download URL built from the file ID
url = f"https://drive.google.com/uc?id={file_id}"

# Fetch the file into the current directory as 'train.csv' (rename if needed)
# quiet=False keeps the download progress visible
gdown.download(url=url, output="train.csv", quiet=False)


Downloading...
From (original): https://drive.google.com/uc?id=1NfymoNrkpfWuZhpwxzoNk4bhVVlU5qiN
From (redirected): https://drive.google.com/uc?id=1NfymoNrkpfWuZhpwxzoNk4bhVVlU5qiN&confirm=t&uuid=9f8adf75-283b-40f9-84f7-914bcfeffbf1
To: /content/train.csv
100%|██████████| 98.6M/98.6M [00:02<00:00, 48.3MB/s]


'train.csv'

In [None]:
# Print the English stopword list provided by NLTK
# Stopwords are common words like "the", "is", "and" that are usually removed in text preprocessing
# Requires the 'stopwords' corpus downloaded earlier with nltk.download
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Data Pre-Processing

In [None]:
# Read the downloaded CSV file into a pandas DataFrame
# "train.csv" must be in the current working directory (it is saved there by the gdown cell)
news_dataset = pd.read_csv("train.csv")

In [None]:
# Show the dataset dimensions as a (rows, columns) tuple
news_dataset.shape

(20800, 5)

In [None]:
# Preview the first five rows of the news dataset
# A quick way to check the column names and a few sample records
news_dataset.head(n=5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
# Count the missing (null) values in each column of the dataset
# isnull() flags every missing cell and sum() totals the flags per column
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,558
author,1957
text,39
label,0


In [None]:
# Substitute an empty string for every missing (NaN) value in the dataset
# Missing 'title', 'author' or 'text' entries are then simply treated as blank text
news_dataset = news_dataset.fillna(value='')

In [None]:
# Merge the 'author' and 'title' columns into a new column called 'content'
# The two strings are joined with a single space; this combined text is what gets cleaned and vectorized later
news_dataset['content'] = news_dataset['author'] + ' ' + news_dataset['title']

In [None]:
# Print the 'content' column, which holds the combined author and title text
# For a long Series pandas prints only the first and last rows, with '...' in between
print(news_dataset['content'])

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


In [None]:
# Split the dataset into input features and target label

# Y: the 'label' column only — the target variable (0 for real news, 1 for fake news)
Y = news_dataset['label']

# X: every remaining column once 'label' is dropped — the input features
X = news_dataset.drop(columns='label')

In [None]:
# Inspect the feature DataFrame (all columns except 'label') and the label Series
print(X)
print(Y)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

In [None]:
# Create a single PorterStemmer instance from NLTK
# It is reused by the stemming() function to reduce words to their root form
port_stem = PorterStemmer()

In [None]:
# Function to clean and stem text data

# Cache for the English stopword set. Calling stopwords.words('english') for
# every word repeats the same list construction and makes each membership
# test a linear scan of a list, so the words are loaded once into a frozenset.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stemming(content):
    """Clean, lowercase, stopword-filter and stem a piece of text.

    Args:
        content: The raw text string to process.

    Returns:
        A single string of stemmed words separated by spaces, with
        non-alphabetic characters and English stopwords removed.
    """
    # Replace every non-alphabetic character with a space
    letters_only = re.sub(r'[^a-zA-Z]', ' ', content)

    # Lowercase the text and split it into individual words
    words = letters_only.lower().split()

    # Drop stopwords and stem the remaining words
    stop_words = _english_stopwords()
    stemmed_words = [
        port_stem.stem(word)
        for word in words
        if word not in stop_words
    ]

    # Join the processed words back into a single string
    return ' '.join(stemmed_words)

In [None]:
# Apply the 'stemming' function to every entry in the 'content' column
# Each entry is cleaned, stripped of stopwords and stemmed; the result overwrites the column
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [None]:
# Print a preview of the cleaned and stemmed 'content' column
print(news_dataset['content'])

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object


In [None]:
# Separating the final processed text data (features) and labels (target)
# These assignments replace the earlier X and Y built from the raw DataFrame

# X will contain the cleaned & stemmed 'content' column as a NumPy array
X = news_dataset['content'].values

# Y will contain the corresponding labels (0 for real news, 1 for fake news) as a NumPy array
Y = news_dataset['label'].values

In [None]:
# Print the array of cleaned and stemmed text strings
print(X)

['darrel lucu hous dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exercis balkan'
 'david swanson keep f aliv']


In [None]:
# Print the array of labels
print(Y)

[1 0 1 ... 0 1 1]


In [None]:
# Show the shape of the label array
Y.shape

(20800,)

Tf-idf

In [None]:
# Create the TF-IDF vectorizer (default settings) that will turn the cleaned text into numerical feature vectors
# Nothing is converted yet; fitting and transforming happen in the next cell
vectorizer = TfidfVectorizer()


In [None]:
# Learn the vocabulary and IDF (inverse document frequency) weights from the
# entire text data and convert it to TF-IDF feature vectors in a single step
X = vectorizer.fit_transform(X)

In [None]:
# Print the TF-IDF feature matrix produced by the vectorizer
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 210687 stored elements and shape (20800, 17128)>
  Coords	Values
  (0, 267)	0.2701012497770876
  (0, 2483)	0.36765196867972083
  (0, 2959)	0.24684501285337127
  (0, 3600)	0.3598939188262558
  (0, 3792)	0.27053324808454915
  (0, 4973)	0.23331696690935097
  (0, 7005)	0.2187416908935914
  (0, 7692)	0.24785219520671598
  (0, 8630)	0.2921251408704368
  (0, 8909)	0.36359638063260746
  (0, 13473)	0.2565896679337956
  (0, 15686)	0.2848506356272864
  (1, 1497)	0.2939891562094648
  (1, 1894)	0.15521974226349364
  (1, 2223)	0.3827320386859759
  (1, 2813)	0.19094574062359204
  (1, 3568)	0.26373768806048464
  (1, 5503)	0.7143299355715573
  (1, 6816)	0.1904660198296849
  (1, 16799)	0.30071745655510157
  (2, 2943)	0.3179886800654691
  (2, 3103)	0.46097489583229645
  (2, 5389)	0.3866530551182615
  (2, 5968)	0.3474613386728292
  (2, 9620)	0.49351492943649944
  :	:
  (20797, 3643)	0.2115550061362374
  (20797, 7042)	0.21799048897828685
  (2079

In [None]:
# Print the label array again; the vectorization step did not modify Y
print(Y)

[1 0 1 ... 0 1 1]


# Numerical Dataset Pre-processing

In [None]:
# Importing the dependencies

# NumPy: Used for numerical operations, especially with arrays
import numpy as np

# pandas: Used for handling tabular data, such as loading and manipulating DataFrames
import pandas as pd

# StandardScaler: Used to standardize numeric features by removing the mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler

# train_test_split: Used to split the dataset into training and testing sets
from sklearn.model_selection import train_test_split


Data Collection & Pre-processing

In [None]:
# Install kagglehub if not already installed
# pip install kagglehub[pandas-datasets]

# Import necessary modules from kagglehub
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the file inside the Kaggle dataset to load
file_path = "diabetes.csv"

# Load the dataset from Kaggle using kagglehub
# Dataset: "mathchi/diabetes-data-set"
# Adapter: KaggleDatasetAdapter.PANDAS returns the data as a Pandas DataFrame
# dataset_load is used because load_dataset is deprecated and emits a warning
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "mathchi/diabetes-data-set",
    file_path,
)

# Wrap the loaded data in a pandas DataFrame under the name used by the rest of the notebook
diabetes_data = pd.DataFrame(df)

# Optional: print the first 5 records to quickly inspect the dataset
# print("First 5 records:", diabetes_data.head())

  df = kagglehub.load_dataset(


In [None]:
# Preview the first five rows of the diabetes dataset to check its columns and sample values
diabetes_data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Show the diabetes dataset dimensions as a (rows, columns) tuple
diabetes_data.shape

(768, 9)

In [None]:
# Get a statistical summary of the numeric columns in the diabetes dataset
# (count, mean, std, min, quartiles and max per column)
diabetes_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


Separating Features & Targets

In [None]:
# Split the dataset into input variables and the target label

# Y: only the target column 'Outcome' (0 = non-diabetic, 1 = diabetic)
Y = diabetes_data['Outcome']

# X: every column **except** the target column 'Outcome'
X = diabetes_data.drop(columns='Outcome')

In [None]:
# Print the feature DataFrame (all columns except 'Outcome')
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [None]:
# Print the 'Outcome' target Series
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


0 --> Non-Diabetic


1 --> Diabetic

Data Standardization

In [None]:
# Create a StandardScaler, which standardizes features by removing the mean and scaling to unit variance
# The scaler is only constructed here; it is fitted in the next cell
scaler = StandardScaler()

In [None]:
# Fit the StandardScaler on the entire feature set X and transform it
# This scales all features to have zero mean and unit variance
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later in this notebook, so test rows influence the scaling
# statistics; consider fitting on the training split only — TODO confirm intent
standardized_data = scaler.fit_transform(X)

In [None]:
# Print the standardized feature array
print(standardized_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [None]:
# Use the standardized array as the feature matrix X from here on
X = standardized_data

In [None]:
# Print X, which now refers to the standardized feature array
print(X)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


Splitting the data into Training data & Testing Data

In [None]:
# Partition the features and labels into training and testing subsets
# test_size=0.2 holds out 20% of the rows for testing (80% remain for training)
# random_state=2 fixes the shuffle so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Print the shapes of the full feature set, the test feature set and the test label set
# (the training-set shapes are not printed here)
print(X.shape, X_test.shape, Y_test.shape)

(768, 8) (154, 8) (154,)
