In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

# Load dataset
df1 = pd.read_csv("diabetes_data_upload.csv")
print(df1)
print()

# Univariate analysis
# ---> frequency of age
print("\nFrequency of Age:")
print(df1["Age"].value_counts())

# ---> frequency of gender
print("\nFrequency of Gender:")
print(df1["Gender"].value_counts())

# ---> frequency of polyuria
print("\nFrequency of Polyuria:")
print(df1["Polyuria"].value_counts())

# ---> frequency of delayed healing
print("\nFrequency of delayed healing:")
print(df1["delayed healing"].value_counts())  # Corrected: Added parentheses for function call

# ---> frequency of class
print("\nFrequency of class:")
print(df1["class"].value_counts())

# Descriptive statistics (Mean, median, mode, standard deviation, skewness, kurtosis)
print("\nMean, Median, Mode, Standard Deviation, Skewness, Kurtosis:")
print("Mean of Age:", df1["Age"].mean())
print("Median of Age:", df1['Age'].median())
print("Mode of Age:", df1["Age"].mode()[0])  # Corrected to get the first mode value
print("Standard Deviation of Age:", df1["Age"].std())
print("Skewness of Age:", df1["Age"].skew())
print("Kurtosis of Age:", df1["Age"].kurt())

# Bivariate analysis: Linear regression modelling
print("\nLinear Regression:")
x = df1['Age']
y = df1['Polyuria'].apply(lambda x: 1 if x == 'Yes' else 0)  # Converting categorical to numerical for regression

n = np.size(x)
x_mean = np.mean(x)
y_mean = np.mean(y)

Sxy = np.sum(x * y) - n * x_mean * y_mean
Sxx = np.sum(x * x) - n * x_mean * x_mean
b1 = Sxy / Sxx
b0 = y_mean - b1 * x_mean

print('Slope (b1):', b1)
print('Intercept (b0):', b0)

y_pred = b1 * x + b0
print("Predicted values (mean):", y_pred.mean())

# Logistic regression
print("\nLogistic Regression:")
X = df1[['Age', 'weakness'].apply(lambda x: 1 if x == 'Yes' else 0)]  # Converting categorical to numerical
y = df1['Polyuria'].apply(lambda x: 1 if x == 'Yes' else 0)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Creating a logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Making predictions on the test data
y_pred = model.predict(X_test_scaled)

# Evaluating the model
accuracy = model.score(X_test_scaled, y_test)
print(f"Accuracy: {accuracy:.2f}")

# Multiple regression using statsmodels
print("\nMultiple Regression:")
df1['Polyuria_numeric'] = df1['Polyuria'].apply(lambda x: 1 if x == 'Yes' else 0)
df1['weakness_numeric'] = df1['weakness'].apply(lambda x: 1 if x == 'Yes' else 0)
df1['Polydipsia_numeric'] = df1['Polydipsia'].apply(lambda x: 1 if x == 'Yes' else 0)

model2 = sm.OLS.from_formula('Age ~ Polyuria_numeric + weakness_numeric + Polydipsia_numeric', df1).fit()
print(model2.summary())

# Comparison with another dataset (e.g., Pima)
print("\nPima Dataset:")
# Assuming model1 is from a different dataset, otherwise remove this part.
# print(model1.params)

print("\nUCI Dataset:")
print(model2.params)


     Age  Gender  Polyuria  Polydipsia  sudden weight loss  weakness  \
0     40    Male         0           1                   0         1   
1     58    Male         0           0                   0         1   
2     41    Male         1           0                   0         1   
3     45    Male         0           0                   1         1   
4     60    Male         1           1                   1         1   
..   ...     ...       ...         ...                 ...       ...   
515   39  Female         1           1                   1         0   
516   48  Female         1           1                   1         1   
517   58  Female         1           1                   1         1   
518   32  Female         0           0                   0         1   
519   42    Male         0           0                   0         0   

     Polyphagia  Genital thrush  visual blurring  Itching  Irritability  \
0             0               0                0        1   

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0