# Logistic Regression

## Importing the libraries

In [38]:
import pandas as pd
import numpy as np
from pprint import pprint

## Importing the dataset

In [39]:
dataset = pd.read_csv('../Dataset/breast_cancer_wisconsin.csv')

In [40]:
print("Dataset Information");
print("__________________________")
pprint(dataset.info());
print("\n")
print("Number of null data:\n")
print("__________________________")
pprint(pd.isnull(dataset).sum())

Dataset Information
__________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Clump_thickness              699 non-null    int64  
 1   Uniformity_of_cell_size      699 non-null    int64  
 2   Uniformity_of_cell_shape     699 non-null    int64  
 3   Marginal_adhesion            699 non-null    int64  
 4   Single_epithelial_cell_size  699 non-null    int64  
 5   Bare_nuclei                  683 non-null    float64
 6   Bland_chromatin              699 non-null    int64  
 7   Normal_nucleoli              699 non-null    int64  
 8   Mitoses                      699 non-null    int64  
 9   Class                        699 non-null    int64  
dtypes: float64(1), int64(9)
memory usage: 54.7 KB
None


Number of null data:

__________________________
Clump_thickness                 0
Uniformity_

Proccess null value:

In [47]:
threshold = 0.05;

if dataset.isna().sum().sum() / dataset.size < threshold:
    dataset = dataset.dropna()
else:
    for col in dataset.columns:
        if dataset[col].dtype in ['float64', 'int64']:
            # Điền NaN bằng mean cho dữ liệu số
            dataset[col] = dataset[col].fillna(dataset[col].mean())
        else:
            # Điền NaN bằng giá trị phổ biến nhất cho dữ liệu dạng object/categorical
            dataset[col] = dataset[col].fillna(dataset[col].mode()[0])

print("Number of null data:\n")
print("__________________________")
pprint(pd.isnull(dataset).sum())

Number of null data:

__________________________
Clump_thickness                0
Uniformity_of_cell_size        0
Uniformity_of_cell_shape       0
Marginal_adhesion              0
Single_epithelial_cell_size    0
Bare_nuclei                    0
Bland_chromatin                0
Normal_nucleoli                0
Mitoses                        0
Class                          0
dtype: int64


## Declare features and dependant variables

On the features, remove the "Sample code number" because it is not relevant to the prediction

In [46]:
X = dataset.iloc[:,1:-1].values;
pprint(X)


array([[ 1.,  1.,  1., ...,  3.,  1.,  1.],
       [ 4.,  4.,  5., ...,  3.,  2.,  1.],
       [ 1.,  1.,  1., ...,  3.,  1.,  1.],
       ...,
       [10., 10.,  3., ...,  8., 10.,  2.],
       [ 8.,  6.,  4., ..., 10.,  6.,  1.],
       [ 8.,  8.,  5., ..., 10.,  4.,  1.]])


In [45]:
y = dataset.iloc[:, -1].values;
pprint(y);

array([2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 4, 2, 4, 4,
       2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 4, 2, 4, 4, 4,
       4, 2, 4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4,
       2, 4, 4, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4,
       2, 4, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 4, 2, 2, 2,
       4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 4, 4, 2,
       2, 4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 4,
       2, 4, 2, 4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 2, 4, 2,
       2, 4, 4, 4, 4, 2, 4, 4, 2, 4, 4, 4, 2, 4, 2, 2, 4, 4, 4, 4, 2, 2,
       2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4,
       4, 2, 4, 4, 4, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 4, 4,
       4, 2, 4, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 4, 4,

## Splitting the dataset into the Training set and Test set

In [44]:
from sklearn.model_selection import train_test_split;

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0);

print("DataSet Splitting:\n");
print("_______________________________")
print("X_train: ", np.size(X_train));
print("X_test: ", np.size(X_test));
print("y_train:", np.size(y_train));
print("y_test", np.size(y_test));


DataSet Splitting:

_______________________________
X_train:  4368
X_test:  1096
y_train: 546
y_test 137


## Training the Logistic Regression model on the Training set

In [34]:
from sklearn.linear_model import LogisticRegression;
classifier = LogisticRegression(random_state=0);
classifier.fit(X_train, y_train);

## Predicting the Test set results

In [42]:
y_pred = classifier.predict(X_test);
pprint(y_pred)

array([2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 2, 4, 4, 4, 4, 2, 2, 2,
       4, 2, 4, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2,
       2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 2, 2, 4, 2, 4,
       4, 2, 4, 2, 4, 4, 2, 4, 4, 4, 4, 2, 2, 2, 4, 4, 2, 2, 4, 2, 2, 2,
       4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 2, 4, 2, 4, 2, 2,
       4, 2, 2, 4, 2], dtype=int64)


## Making the Confusion Matrix

In [41]:
from sklearn.metrics import confusion_matrix;

cm = confusion_matrix(y_test, y_pred);
pprint(cm)

array([[83,  4],
       [ 3, 47]], dtype=int64)


## Computing the accuracy with k-Fold Cross Validation

In [56]:
from sklearn.model_selection import cross_val_score;

accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10);
print("Accuracies:  ", format(accuracies.mean() * 100), " %")
print("Standard Deviation:  ", format(accuracies.std() * 100), " %")

Accuracies:   96.6969696969697  %
Standard Deviation:   2.4347159649944254  %
