# Importing Libraries

In [51]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sn
from scipy.stats import pearsonr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# importing the Dataset

In [19]:
df = pd.read_csv('C:/Users/etulyon1/Desktop/Data-Science/Meriskill/Project2/data/Project 2 MeriSKILL.zip', compression='zip')
pd.set_option("display.max_columns", None)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Exploratory Analysis

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [27]:
for column in df.columns:
    null = df.isnull().sum()
    print (f'{column} : {null}')

Pregnancies : Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
Glucose : Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
BloodPressure : Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
SkinThickness : Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinT

## Calculating the correlation between the variables

Determining the correlation and the significance

In [35]:
rho = df.corr()
pval = df.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
p = pval.applymap(lambda x: ''.join(['*' for t in [.05, .01, .001] if x<=t]))
rho.round(2).astype(str) + p


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0***,0.13***,0.14***,-0.08*,-0.07*,0.02,-0.03,0.54***,0.22***
Glucose,0.13***,1.0***,0.15***,0.06,0.33***,0.22***,0.14***,0.26***,0.47***
BloodPressure,0.14***,0.15***,1.0***,0.21***,0.09*,0.28***,0.04,0.24***,0.07
SkinThickness,-0.08*,0.06,0.21***,1.0***,0.44***,0.39***,0.18***,-0.11**,0.07*
Insulin,-0.07*,0.33***,0.09*,0.44***,1.0***,0.2***,0.19***,-0.04,0.13***
BMI,0.02,0.22***,0.28***,0.39***,0.2***,1.0***,0.14***,0.04,0.29***
DiabetesPedigreeFunction,-0.03,0.14***,0.04,0.18***,0.19***,0.14***,1.0***,0.03,0.17***
Age,0.54***,0.26***,0.24***,-0.11**,-0.04,0.04,0.03,1.0***,0.24***
Outcome,0.22***,0.47***,0.07,0.07*,0.13***,0.29***,0.17***,0.24***,1.0***


In [30]:
fig = px.imshow(df.corr())
fig.show()

# Econometric Analysis

Running Logistic Regression

In [50]:
# Assuming df is your DataFrame
y = df['Outcome']
X = df[['Age', 'Insulin', 'SkinThickness', 'Pregnancies', 'Glucose', 'BloodPressure']]

# Correcting the order of arguments in train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

# Creating an instance of Logistic Regression
lg = LogisticRegression()

# Fitting the model with training data
lg.fit(X_train, y_train)

# Making predictions on the test set
y_pred = lg.predict(X_test)

# Now you can evaluate the performance of your model, for example, by checking accuracy

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# If you want more detailed metrics, you can use classification_report


print(classification_report(y_test, y_pred))


Accuracy: 0.7337662337662337
              precision    recall  f1-score   support

           0       0.77      0.83      0.80        99
           1       0.65      0.56      0.60        55

    accuracy                           0.73       154
   macro avg       0.71      0.70      0.70       154
weighted avg       0.73      0.73      0.73       154



In [62]:
for i in range(0,len(lg.coef_)):
    print (f"{lg.coef_[i]} \n")

[ 0.02944224 -0.00179026  0.02010312  0.06127981  0.03728382 -0.00714101] 



Estimation using the binary logit model

In [58]:
# Add a constant to the features matrix for the intercept
X_train = sm.add_constant(X_train)

# Fit logistic regression model
model = sm.Logit(y_train, X_train)
result = model.fit()

# Print summary
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.506263
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  614
Model:                          Logit   Df Residuals:                      607
Method:                           MLE   Df Model:                            6
Date:                Fri, 10 Nov 2023   Pseudo R-squ.:                  0.2157
Time:                        16:34:17   Log-Likelihood:                -310.85
converged:                       True   LL-Null:                       -396.34
Covariance Type:            nonrobust   LLR p-value:                 2.767e-34
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -6.2756      0.623    -10.072      0.000      -7.497      -5.054
Age               0.

Calculating marginal effects

In [57]:
print("\nMarginal Effects:")
marginal_effects = result.get_margeff()
marginal_effects_table = marginal_effects.summary_frame()
marginal_effects_table


Marginal Effects:


Unnamed: 0,dy/dx,Std. Err.,z,Pr(>|z|),Conf. Int. Low,Cont. Int. Hi.
Age,0.004905,0.001735,2.827456,0.004691943,0.001505,0.008304
Insulin,-0.000298,0.000161,-1.857935,0.0631783,-0.000613,1.6e-05
SkinThickness,0.00335,0.001202,2.78661,0.005326256,0.000994,0.005706
Pregnancies,0.010223,0.005785,1.767282,0.07718093,-0.001115,0.021561
Glucose,0.006213,0.000506,12.289718,1.02864e-34,0.005222,0.007204
BloodPressure,-0.00119,0.000926,-1.285602,0.198582,-0.003005,0.000624
