In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

# Load dataset
# Read diabetes_data_upload.csv into df1 and echo the frame to stdout,
# followed by one empty line.
df1 = pd.read_csv("diabetes_data_upload.csv")
print(df1, end="\n\n")

# Univariate Analysis
# Value counts for selected columns. Each entry pairs the label used in the
# printed heading with the exact column name in df1.
frequency_columns = (
    ("Age", "Age"),
    ("Gender", "Gender"),
    ("Polyuria", "Polyuria"),
    ("Delayed Healing", "delayed healing"),
    ("Class", "class"),
)
for heading, column_name in frequency_columns:
    print(f"\nFrequency of {heading}:")
    print(df1[column_name].value_counts())

# Mean, Median, Mode, Standard Deviation, Skewness, Kurtosis for Age.
# Each statistic is evaluated lazily, right before it is printed.
print("\nStatistics for Age:")
age = df1["Age"]
age_statistics = (
    ("Mean", age.mean),
    ("Median", age.median),
    ("Mode", lambda: age.mode()[0]),  # mode() may return several values; report the first
    ("Standard Deviation", age.std),
    ("Skewness", age.skew),
    ("Kurtosis", age.kurt),
)
for statistic, compute in age_statistics:
    print(f"{statistic} of Age:", compute())

# Bivariate Analysis: Linear and Logistic Regression Modelling
# Simple least-squares line for Polyuria as a function of Age, computed with
# the closed-form formulas b1 = Sxy / Sxx and b0 = mean(y) - b1 * mean(x).
print("\nLinear Regression:")
x, y = df1['Age'], df1['Polyuria']

n = len(x)
x_mean, y_mean = x.mean(), y.mean()

# Raw sums of products, corrected by n times the product of the means.
Sxy = (x * y).sum() - n * x_mean * y_mean
Sxx = (x * x).sum() - n * x_mean * x_mean

# Slope first, because the intercept is derived from it.
b1 = Sxy / Sxx
b0 = y_mean - b1 * x_mean
print('Slope (b1):', b1)
print('Intercept (b0):', b0)

# Fitted value for every row of x; only the mean of the fitted values is reported.
y_pred = b1 * x + b0
print("Predicted Mean (Linear Regression):", y_pred.mean())

print("\nLogistic Regression:")
# Features are the Age and weakness columns; the target is Polyuria.
feature_columns = ['Age', 'weakness']
X = df1[feature_columns]
y = df1['Polyuria']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same scaling
# to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression with default settings on the scaled training data.
model = LogisticRegression().fit(X_train_scaled, y_train)

# Class predictions for the held-out rows.
y_pred = model.predict(X_test_scaled)

# Mean accuracy on the held-out rows, shown with two decimals.
accuracy = model.score(X_test_scaled, y_test)
print(f"Accuracy: {accuracy:.2f}")

# Multiple Regression
# Ordinary least squares with Age as the response and Polyuria, weakness and
# Polydipsia as predictors, specified through a formula string.
print("\nMultiple Regression:")
ols_formula = 'Age ~ Polyuria + weakness + Polydipsia'
model2 = sm.OLS.from_formula(ols_formula, df1).fit()
print(model2.summary())

# Comparing models
# Print the fitted parameters of the two models under their own headings.
#
# `model1` (the model for the Pima heading) is not created in the code above,
# so it is looked up in the notebook namespace: its parameters are printed
# when it exists, otherwise an explicit notice is printed so the heading is
# never left without content.
print("\nPima Dataset Parameters:")
if "model1" in globals():
    print(model1.params)
else:
    print("model1 is not defined; fit the Pima model before comparing.")

# `model2` is the OLS fit produced in the Multiple Regression step.
print("\nUCI Dataset Parameters:")
print(model2.params)


     Age  Gender  Polyuria  Polydipsia  sudden weight loss  weakness  \
0     40    Male         0           1                   0         1   
1     58    Male         0           0                   0         1   
2     41    Male         1           0                   0         1   
3     45    Male         0           0                   1         1   
4     60    Male         1           1                   1         1   
..   ...     ...       ...         ...                 ...       ...   
515   39  Female         1           1                   1         0   
516   48  Female         1           1                   1         1   
517   58  Female         1           1                   1         1   
518   32  Female         0           0                   0         1   
519   42    Male         0           0                   0         0   

     Polyphagia  Genital thrush  visual blurring  Itching  Irritability  \
0             0               0                0        1   