In [2]:
import pandas as pd
import matplotlib as plt
import seaborn as sns

df = pd.read_csv('breastcancerdf.csv')

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

x = df.iloc[:, :-2]
y = df['Class']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

lr = LogisticRegression()
lr.fit(x_train, y_train)

y_predict = lr.predict(x_test)
print(classification_report(y_test, y_predict))

from sklearn.metrics import accuracy_score

print(f"Model Accuracy: {accuracy_score(y_test, y_predict) * 100:.3f} %")
print(confusion_matrix(y_test, y_predict))

svm = SVC()
svm.fit(x_train, y_train)
y2_predict = svm.predict(x_test)

print(classification_report(y_test, y2_predict))
print(confusion_matrix(y_test, y2_predict))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

Model Accuracy: 98.246 %
[[41  1]
 [ 1 71]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

[[41  1]
 [ 1 71]]


In [5]:
lr = LogisticRegression(penalty = 'l1', solver = 'liblinear', C = 1, max_iter = 1000, tol = 0.0001)
lr.fit(x_train, y_train)
y_predict = lr.predict(x_test)
print(classification_report(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        42
           1       0.99      1.00      0.99        72

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

[[41  1]
 [ 0 72]]


In [6]:
svm = SVC(kernel = 'linear')
svm.fit(x_train, y_train)
y2_predict = svm.predict(x_test)

print(classification_report(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        42
           1       0.99      1.00      0.99        72

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

[[41  1]
 [ 0 72]]


##
### Hyperparameter Tuning
##

* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
* Hyperparameter tuning is a process of finding the best hyperparameters for the model.
* Hyperparameter are the parameters that are set before the model is trained.
* Important hyperparameter for logistic regression

    - C
    * penalty
    * solver
    * max_iter
    * tol
    
- Important hyperparameters for SVM

   * C
   * kernel
   * degree
   * gamma
   * max_iter

###
## Grid Search
It takes the set of hyperparameter values as input and return the best hyperparameter values.

In [74]:
from sklearn.model_selection import GridSearchCV

params = {'C' : [0.1, 1, 10], 'penalty' : ['l1', 'l2'], 'solver': ['liblinear', 'saga', 'lbfgs'],
          'max_iter':[100, 1000, 10000], 'tol' : [0.001, 0.0001]}

grid = GridSearchCV(LogisticRegression(), param_grid = params, cv = 5, scoring = 'accuracy')
grid.fit(x_train, y_train)




90 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HP\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HP\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [75]:
best_model = grid.best_estimator_
best_params = grid.best_params_
best_score = grid.best_score_

In [76]:
best_model

In [77]:
best_params

{'C': 0.1,
 'max_iter': 100,
 'penalty': 'l2',
 'solver': 'liblinear',
 'tol': 0.001}

In [79]:
model = LogisticRegression(**best_params)
model1 = LogisticRegression(C = 1, max_iter = 1000, penalty = 'l1', solver = 'liblinear', tol = 0.0001)
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
print(classification_report(y_test, y_predict))
print(confusion_matrix(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

[[41  1]
 [ 1 71]]
