

---


#**Logistic Regression using CSV file - StandardScaler**


---



In [35]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
df = pd.read_csv('diabetes.csv', header=None, names=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Diabetes', 'Age', 'Class'])

In [36]:
# Display first few records
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Diabetes,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [37]:
# Display random sample records
df.sample(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Diabetes,Age,Class
631,0,102,78,40,90,34.5,0.238,24,0
528,0,117,66,31,188,30.8,0.493,22,0


In [38]:
# Dataset shape and information
df.shape

(768, 9)

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pregnancies    768 non-null    int64  
 1   Glucose        768 non-null    int64  
 2   BloodPressure  768 non-null    int64  
 3   SkinThickness  768 non-null    int64  
 4   Insulin        768 non-null    int64  
 5   BMI            768 non-null    float64
 6   Diabetes       768 non-null    float64
 7   Age            768 non-null    int64  
 8   Class          768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [40]:
# Separate features and target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [41]:
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     Diabetes  Age  
0       0.627   50  
1       0.351   31  
2       0.672   32  
3       0.167   21  
4     

In [42]:
print(y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Class, Length: 768, dtype: int64


In [43]:
# Standardize the features
sc = StandardScaler()
Xscaled = sc.fit_transform(X)

In [44]:
# Split the data into training and testing sets
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1, test_size=0.25)
xstrain, xstest, ystrain, ystest = train_test_split(Xscaled, y, random_state=1, test_size=0.25)

In [45]:
# Shape of the training and testing sets
xtrain.shape

(576, 8)

In [46]:
xtest.shape

(192, 8)

In [47]:
ytrain.shape

(576,)

In [48]:
ytest.shape

(192,)

In [49]:
# Initialize and train the logistic regression models
model = LogisticRegression()
model2 = LogisticRegression()

In [50]:
model.fit(xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
model2.fit(xstrain, ystrain)

In [52]:
# Predictions
predictions = model.predict(xtest)
predictions2 = model2.predict(xstest)

In [53]:
# Prediction probabilities
predictions_probability = model.predict_proba(xtest)
print(predictions_probability[0:10])

[[0.5782672  0.4217328 ]
 [0.6767115  0.3232885 ]
 [0.84994822 0.15005178]
 [0.95110289 0.04889711]
 [0.79044472 0.20955528]
 [0.71895071 0.28104929]
 [0.62547503 0.37452497]
 [0.89725634 0.10274366]
 [0.84148183 0.15851817]
 [0.79090089 0.20909911]]


In [54]:
predictions_probability2 = model2.predict_proba(xstest)
print(predictions_probability2[0:10])

[[0.57004482 0.42995518]
 [0.68357332 0.31642668]
 [0.85033377 0.14966623]
 [0.94956885 0.05043115]
 [0.78815841 0.21184159]
 [0.72510302 0.27489698]
 [0.62765231 0.37234769]
 [0.89681747 0.10318253]
 [0.84357836 0.15642164]
 [0.79701314 0.20298686]]


In [55]:
# Probability of class 0
predictions_probability_zero = model.predict_proba(xtest)[:, 0]
print(predictions_probability_zero[0:10])

[0.5782672  0.6767115  0.84994822 0.95110289 0.79044472 0.71895071
 0.62547503 0.89725634 0.84148183 0.79090089]


In [56]:
predictions_probability_zero2 = model2.predict_proba(xstest)[:, 0]
print(predictions_probability_zero2[0:10])

[0.57004482 0.68357332 0.85033377 0.94956885 0.78815841 0.72510302
 0.62765231 0.89681747 0.84357836 0.79701314]


In [57]:
# Probability of class 1
predictions_probability_one = model.predict_proba(xtest)[:, 1]
print(predictions_probability_one[0:10])

[0.4217328  0.3232885  0.15005178 0.04889711 0.20955528 0.28104929
 0.37452497 0.10274366 0.15851817 0.20909911]


In [58]:
predictions_probability_one2 = model2.predict_proba(xstest)[:, 1]
print(predictions_probability_one2[0:10])

[0.42995518 0.31642668 0.14966623 0.05043115 0.21184159 0.27489698
 0.37234769 0.10318253 0.15642164 0.20298686]


In [59]:
# Model evaluation
print("Accuracy (without scaling):", accuracy_score(ytest, predictions))
print("Accuracy (with scaling):", accuracy_score(ystest, predictions2))

Accuracy (without scaling): 0.7760416666666666
Accuracy (with scaling): 0.7760416666666666


In [60]:
print("Confusion Matrix (without scaling):\n", confusion_matrix(ytest, predictions))
print("Confusion Matrix (with scaling):\n", confusion_matrix(ystest, predictions2))

Confusion Matrix (without scaling):
 [[109  14]
 [ 29  40]]
Confusion Matrix (with scaling):
 [[109  14]
 [ 29  40]]


In [61]:
print("Classification Report (without scaling):\n", classification_report(ytest, predictions))
print("Classification Report (with scaling):\n", classification_report(ystest, predictions2))


Classification Report (without scaling):
               precision    recall  f1-score   support

           0       0.79      0.89      0.84       123
           1       0.74      0.58      0.65        69

    accuracy                           0.78       192
   macro avg       0.77      0.73      0.74       192
weighted avg       0.77      0.78      0.77       192

Classification Report (with scaling):
               precision    recall  f1-score   support

           0       0.79      0.89      0.84       123
           1       0.74      0.58      0.65        69

    accuracy                           0.78       192
   macro avg       0.77      0.73      0.74       192
weighted avg       0.77      0.78      0.77       192



---

### **Conclusion**

In this project, we implemented a Logistic Regression model to classify individuals as diabetic or non-diabetic using the Diabetes dataset. We tested the model's performance with and without feature scaling using `StandardScaler`.


1.   
The classification reports for both models (with and without scaling) were identical, indicating that feature scaling did not significantly impact the model's performance.
2.   Both models achieved an accuracy of 78%, with similar precision, recall, and f1-scores for both classes.

This result suggests that, for this particular dataset and model, Logistic Regression is robust to the scale of the features.

However, it's important to note that this might not be the case for all datasets or models, and feature scaling should generally be considered as part of the preprocessing pipeline.

---

