<a href="https://colab.research.google.com/github/itzsandip007/data-science-learning/blob/main/Day_7_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# Load the scikit-learn breast-cancer dataset and wrap it in pandas objects:
# `x` is the feature table with named columns, `y` the target labels.
data = load_breast_cancer()
x = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = pd.Series(data["target"])

In [3]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Baseline: logistic regression fitted directly on the raw (unscaled) features.
# NOTE(review): the large max_iter is presumably there so the solver can
# converge on unscaled inputs -- confirm.
model1 = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Score the baseline on the held-out test rows.
pred1 = model1.predict(X_test)
acc1 = accuracy_score(y_true=y_test, y_pred=pred1)

print("Accuracy without scaling:", acc1)


Accuracy without scaling: 0.956140350877193


In [5]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits, so the test rows never influence the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same classifier settings as the unscaled baseline, trained on scaled features.
model2 = LogisticRegression(max_iter=10000).fit(X_train_scaled, y_train)

pred2 = model2.predict(X_test_scaled)
acc2 = accuracy_score(y_true=y_test, y_pred=pred2)

print("Accuracy with scaling:", acc2)



Accuracy with scaling: 0.9736842105263158


In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

model = LogisticRegression(max_iter=10000)

# Cross-validate scaler + classifier as ONE pipeline on the *unscaled*
# training data. X_train_scaled was standardised with statistics computed over
# all training rows, so feeding it to cross_val_score would let every
# validation fold influence its own preprocessing (data leakage). Inside the
# pipeline the scaler is re-fitted on the training part of each fold only.
cv_pipeline = make_pipeline(StandardScaler(), model)

# 5-fold cross-validation; `scores` holds one score per fold.
scores = cross_val_score(
    cv_pipeline,
    X_train,
    y_train,
    cv=5
)

print("CV scores:", scores)
print("Average CV score:", scores.mean())



CV scores: [0.97802198 0.96703297 1.         0.97802198 0.94505495]
Average CV score: 0.9736263736263737


In [10]:
# Two logistic-regression variants that differ in the regularisation penalty.
# The L1 variant names the liblinear solver explicitly.
model_l2 = LogisticRegression(
    penalty="l2",
    max_iter=10000,
)
model_l1 = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=10000,
)


In [11]:
# Train both regularised models on the standardised training features.
model_l2.fit(X=X_train_scaled, y=y_train)
model_l1.fit(X=X_train_scaled, y=y_train)


In [13]:
# accuracy_score is already imported in the notebook's import cell.

# Predict on the scaled test rows with each regularised model ...
pred_l2, pred_l1 = (
    clf.predict(X_test_scaled) for clf in (model_l2, model_l1)
)

# ... and compare each set of predictions against the true test labels.
acc_l2, acc_l1 = (
    accuracy_score(y_test, pred) for pred in (pred_l2, pred_l1)
)

print("L2 Accuracy:", acc_l2)
print("L1 Accuracy:", acc_l1)


L2 Accuracy: 0.9736842105263158
L1 Accuracy: 0.9736842105263158


In [14]:
import numpy as np

# Inspect the fitted L1 model's coefficient matrix: count the entries that are
# exactly zero and report how many feature columns it has.
l1_coefs = model_l1.coef_
n_zero_coefs = np.sum(l1_coefs == 0)

print("L1 zero coefficients:", n_zero_coefs)
print("Total features:", l1_coefs.shape[1])


L1 zero coefficients: 16
Total features: 30
