In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

# Read the diabetes CSV (relative path, resolved against the working directory)
# into a DataFrame and echo it.
csv_path = "diabetes.csv"
df = pd.read_csv(csv_path)
print(df)

# Frequency table (value_counts) for each column of interest.
# One loop replaces nine copy-pasted print pairs that differed only by column
# name. The columns are listed explicitly so the report order and headings are
# fixed regardless of what else the CSV may contain.
frequency_columns = (
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
)
for column in frequency_columns:
    # Heading is surrounded by blank lines, exactly as in the per-column prints.
    print(f"\nFrequency of {column}:\n")
    print(df[column].value_counts())

# Summary statistics for the Pregnancies column
print("\nMean, Median, Mode, Standard deviation, Skewness, and Kurtosis\n")
pregnancies = df["Pregnancies"]
pregnancy_stats = {
    "Mean": pregnancies.mean(),
    "Median": pregnancies.median(),
    # mode() returns a Series of modal values; [0] selects the entry labelled 0
    "Mode": pregnancies.mode()[0],
    "Standard Deviation": pregnancies.std(),
    "Skewness": pregnancies.skew(),
    "Kurtosis": pregnancies.kurt(),
}
for stat_name, stat_value in pregnancy_stats.items():
    print(f"{stat_name} of Pregnancies:", stat_value)

# Bivariate analysis: least-squares line of BMI (y) on Age (x), computed by hand
print("\nBivariate Analysis: Linear Regression\n")
x, y = df['Age'], df['BMI']
n = len(x)
x_mean, y_mean = x.mean(), y.mean()

# Corrected sums of cross-products and squares (shortcut form: raw sum minus
# n times the product of the means)
Sxy = (x * y).sum() - n * x_mean * y_mean
Sxx = (x * x).sum() - n * x_mean * x_mean

# Slope and intercept of the fitted line y = b0 + b1 * x
b1 = Sxy / Sxx
b0 = y_mean - b1 * x_mean

print('Slope (b1):', b1)
print('Intercept (b0):', b0)

# Fitted BMI value for every observed Age
y_pred = b0 + b1 * x
print("Mean of Predicted Values (Linear Regression):", y_pred.mean())

# Logistic regression: classify Outcome from Age and Pregnancies
print("\nLogistic Regression:\n")
y = df['Outcome']
X = df[['Age', 'Pregnancies']]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier on the standardized training data
model = LogisticRegression().fit(X_train_scaled, y_train)

# Class predictions for the held-out rows
y_pred = model.predict(X_test_scaled)

# Mean accuracy on the held-out rows
accuracy = model.score(X_test_scaled, y_test)
print(f"Accuracy: {accuracy:.2f}")

# Multiple regression with the statsmodels formula interface:
# Pregnancies as the response, Age and Outcome as the regressors
print("\nMultiple Regression:\n")
ols_formula = 'Pregnancies ~ Age + Outcome'
ols_spec = sm.OLS.from_formula(ols_formula, df)
model1 = ols_spec.fit()
print(model1.summary())


     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  