# **Logistic Regression with scikit-learn and Pandas**

In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Fetch the breast-cancer data bundled with scikit-learn.
# Both flags are written out at their default values: the loader returns a single
# object whose .data, .feature_names and .target attributes are read in later cells.
dataset = load_breast_cancer(return_X_y=False, as_frame=False)

In [3]:
# Build the working frame in one expression: one column per feature,
# then the label appended as the last column under the name 'class'.
# (`class` is a Python keyword, hence the dict-unpacking form of assign.)
df = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
).assign(**{'class': dataset.target})

In [5]:
# Display basic information: the frame's (rows, columns) tuple.
# The column count includes the 'class' label column added to the feature columns.
print("Dataset Shape:", df.shape)

Dataset Shape: (569, 31)


In [6]:
# Preview two randomly chosen rows. The seed is fixed (same value as the
# train/test split) so the preview shows the same rows on every Restart & Run All
# instead of a different pair each time.
print("Sample Data:\n", df.sample(2, random_state=1))

Sample Data:
      mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
403        12.94         16.17           83.18      507.6          0.09879   
484        15.73         11.28          102.80      747.2          0.10430   

     mean compactness  mean concavity  mean concave points  mean symmetry  \
403           0.08836         0.03296              0.02390         0.1735   
484           0.12990         0.11910              0.06211         0.1784   

     mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
403                 0.06200  ...          23.02            89.69       580.9   
484                 0.06259  ...          14.20           112.50       854.3   

     worst smoothness  worst compactness  worst concavity  \
403            0.1172             0.1958           0.1810   
484            0.1541             0.2979           0.4004   

     worst concave points  worst symmetry  worst fractal dimension  class  
403               

In [7]:
# Define features and target.
# Columns are selected by name rather than by position, so the feature/target
# separation stays correct even if columns are later appended to or reordered in `df`.
X = df.drop(columns='class')  # every column except the label
y = df['class']               # the label column

In [8]:
# Inspect the feature matrix (pandas abbreviates the middle rows/columns in the printout)
print(X)

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mea

In [9]:
# Inspect the target vector: one integer label (0 or 1) per row
print(y)

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: class, Length: 569, dtype: int64


In [10]:
# Hold out 30% of the rows for testing and train on the remainder.
# The fixed seed makes the partition identical on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [11]:
# Fit a logistic-regression classifier on the training partition.
# The iteration cap is set to 10,000; raise it further if the solver
# still reports that it did not converge.
classifier = LogisticRegression(max_iter=10_000)
classifier.fit(X=xtrain, y=ytrain)

In [12]:
# Score the held-out rows two ways:
#   predictions_probability - per-class probabilities, one row per test sample
#   predictions             - the hard class label chosen for each test sample
predictions_probability = classifier.predict_proba(xtest)
predictions = classifier.predict(xtest)

In [13]:
# Show the probabilities for the first ten test rows:
# the full two-column table first, then each column on its own.
first_ten = predictions_probability[:10]
print("Predictions Probability (first 10):\n", first_ten)
print("Predictions Probability for class 0 (first 10):\n", first_ten[:, 0])
print("Predictions Probability for class 1 (first 10):\n", first_ten[:, 1])

Predictions Probability (first 10):
 [[6.17471519e-01 3.82528481e-01]
 [6.40740990e-01 3.59259010e-01]
 [9.87740233e-04 9.99012260e-01]
 [9.93182070e-01 6.81792999e-03]
 [8.63438299e-01 1.36561701e-01]
 [9.99417170e-01 5.82830094e-04]
 [9.99714875e-01 2.85124843e-04]
 [9.49306511e-01 5.06934888e-02]
 [2.47727832e-04 9.99752272e-01]
 [1.00624225e-02 9.89937577e-01]]
Predictions Probability for class 0 (first 10):
 [6.17471519e-01 6.40740990e-01 9.87740233e-04 9.93182070e-01
 8.63438299e-01 9.99417170e-01 9.99714875e-01 9.49306511e-01
 2.47727832e-04 1.00624225e-02]
Predictions Probability for class 1 (first 10):
 [3.82528481e-01 3.59259010e-01 9.99012260e-01 6.81792999e-03
 1.36561701e-01 5.82830094e-04 2.85124843e-04 5.06934888e-02
 9.99752272e-01 9.89937577e-01]


In [14]:
# Count how many rows of the full dataset carry each label
class_counts = df['class'].value_counts()
print("Class Distribution:\n", class_counts)

Class Distribution:
 class
1    357
0    212
Name: count, dtype: int64


In [15]:
# Compare the held-out labels with the model's predictions.
# Arguments are passed by keyword so the true/predicted order is explicit;
# in the resulting matrix, rows are true labels and columns are predicted labels.
accuracy = accuracy_score(y_true=ytest, y_pred=predictions)
conf_matrix = confusion_matrix(y_true=ytest, y_pred=predictions)

In [16]:
# Report the test-set metrics computed in the previous cell
print("Accuracy Score:", accuracy)
print("Confusion Matrix:\n", conf_matrix)

Accuracy Score: 0.9473684210526315
Confusion Matrix:
 [[ 57   6]
 [  3 105]]


---

## **Conclusion:**

The Logistic Regression model achieved an accuracy score of approximately 94.7% on the held-out test set (171 samples, 30% of the data), indicating strong performance in predicting the target variable.

The confusion matrix shows that the model correctly classified 105 instances of class 1 and 57 instances of class 0, with a few misclassifications:

- 6 instances of class 0 were incorrectly classified as class 1.
- 3 instances of class 1 were incorrectly classified as class 0.

Overall, the model demonstrates high accuracy and reliability in distinguishing between the two classes.

---