In [21]:
import numpy as np
import pandas as pd


In [22]:
df = pd.read_csv('diabetes.csv')
df.head(15)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [25]:
df.drop(columns=['Pregnancies'])

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,32,1
3,89,66,23,94,28.1,0.167,21,0
4,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [3]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [4]:
#Finding Mean Glucose
def mn(data):
    return (sum(data) / len(data))

In [None]:
a = mn(df.Glucose)
print(a)

120.89453125


In [6]:
#Finding Variance
def variance(data):
    mean = mn(data)
    squared_diff_sum = sum((x - mean) ** 2 for x in data)
    return squared_diff_sum / len(data)

v = variance(df.Glucose)
print(v)

1020.9172617594401


In [7]:
#Finding Standard Deviation

def stddev(data):
    return variance(data) ** 0.5
s = stddev(df.Glucose)
print(s)

31.95179590820272


In [8]:
nc = df["Glucose"].corr(df["Insulin"])
print(nc)

0.3313571099202097


In [9]:
#Finding Covariance

def covariance(x, y):
    x_mean = mn(x)
    y_mean = mn(y)
    covar_sum = sum((x[i] - x_mean) * (y[i] - y_mean) for i in range(len(x)))
    return covar_sum / len(x)
c = covariance(df.Glucose, df.Insulin)
print(c)

1219.3460388183578


In [10]:
#corelation Coeeficient

def correlation(x, y):
    x_std = stddev(x)
    y_std = stddev(y)
    if x_std == 0 or y_std == 0:
        return 0  # Avoid division by zero
    return covariance(x, y) / (x_std * y_std)

print(correlation(df.Glucose, df.Insulin))

0.33135710992020867


In [11]:
#Custom Normalisation

#Min-Max Normalisation
def min_max_normalization(data):
    min_val = min(data)
    max_val = max(data)
    return [(x - min_val) / (max_val - min_val) for x in data]

ndf = min_max_normalization(df.Glucose)
print(ndf)

[0.7437185929648241, 0.4271356783919598, 0.9195979899497487, 0.4472361809045226, 0.6884422110552764, 0.5829145728643216, 0.39195979899497485, 0.5778894472361809, 0.9899497487437185, 0.628140703517588, 0.5527638190954773, 0.8442211055276382, 0.6984924623115578, 0.949748743718593, 0.8341708542713567, 0.5025125628140703, 0.592964824120603, 0.5376884422110553, 0.5175879396984925, 0.5778894472361809, 0.6331658291457286, 0.49748743718592964, 0.9849246231155779, 0.5979899497487438, 0.7185929648241206, 0.628140703517588, 0.7386934673366834, 0.48743718592964824, 0.7286432160804021, 0.5879396984924623, 0.5477386934673367, 0.7939698492462312, 0.44221105527638194, 0.4623115577889447, 0.6130653266331658, 0.5175879396984925, 0.6934673366834171, 0.5125628140703518, 0.45226130653266333, 0.5577889447236181, 0.9045226130653267, 0.6683417085427136, 0.5326633165829145, 0.8592964824120602, 0.7989949748743719, 0.9045226130653267, 0.7336683417085427, 0.35678391959798994, 0.5175879396984925, 0.527638190954773

In [12]:
import pandas as pd
pd.DataFrame(ndf).head(10)

Unnamed: 0,0
0,0.743719
1,0.427136
2,0.919598
3,0.447236
4,0.688442
5,0.582915
6,0.39196
7,0.577889
8,0.98995
9,0.628141


In [13]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [14]:
def z_score_normalization(data):
    mean = mn(data)
    std = stddev(data)
    return [(x - mean) / std for x in data]

ndf2 = z_score_normalization(df.Glucose)
pd.DataFrame(ndf2).head()

Unnamed: 0,0
0,0.848324
1,-1.123396
2,1.943724
3,-0.998208
4,0.504055


In [15]:
def decimal_scaling_normalization(data):
    max_abs = max(abs(x) for x in data)
    j = len(str(int(max_abs)))
    return [x / (10 ** j) for x in data]

ndf3 = decimal_scaling_normalization(df.Glucose)
pd.DataFrame(ndf3).head()

Unnamed: 0,0
0,0.148
1,0.085
2,0.183
3,0.089
4,0.137


In [16]:
print("\nc) Independent Features Analysis:")
print(f"Total number of features: {df.shape[1] - 1}")  


c) Independent Features Analysis:
Total number of features: 8


In [17]:
covariance_matrix = covariance(df.Glucose, df.Insulin)

In [18]:
print(covariance_matrix)

1219.3460388183578


In [19]:
correlation_matrix = correlation(df.Glucose, df.Insulin)
print(correlation_matrix)

0.33135710992020867


In [20]:
num_bins = 4
df['Age_Binned'] = pd.qcut(df['Age'], q=num_bins, labels=['Young', 'Young Adult', 'Middle Aged', 'Senior'])

# Display the bin frequencies
age_bin_counts = df['Age_Binned'].value_counts().sort_index()
print("Age bins and their frequencies:")
print(age_bin_counts)

Age bins and their frequencies:
Age_Binned
Young          219
Young Adult    177
Middle Aged    200
Senior         172
Name: count, dtype: int64
