# Classification with Scikit-Learn

### Classification on a randomly generated dataset

#### Import Libraries

In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

#### Create Classification dataset

In [2]:
# Synthetic 3-class problem: 1000 samples with 10 features, 8 of them informative.
# The fixed random_state keeps the generated data identical across runs.
X, y = datasets.make_classification(
    n_samples=1000, n_features=10, n_informative=8,
    n_classes=3, random_state=43,
)

In [3]:
# Wrap the feature matrix in a DataFrame whose columns are labelled X1 .. X10.
X = pd.DataFrame(X, columns=[f'X{i}' for i in range(1, 11)])
X

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10
0,-1.914406,0.233732,-1.807928,-0.512153,1.398262,0.381182,-2.767646,2.444344,-0.630516,1.636387
1,0.759247,1.397979,0.415965,0.856484,0.546020,1.820276,1.580028,-2.299039,2.975507,-3.888826
2,-2.369630,-0.567535,-0.004316,-2.409063,1.322688,-1.510873,1.193471,3.222147,2.079678,-2.971841
3,-1.133787,0.335217,0.737784,0.773974,1.880210,-2.974308,5.142760,-1.441096,-0.614864,-1.855610
4,3.232324,1.110994,-0.547249,-1.860551,-2.314942,4.292691,-6.929045,0.860951,0.699968,1.157313
...,...,...,...,...,...,...,...,...,...,...
995,-0.529220,0.905847,0.919983,1.431735,-0.543803,0.850856,1.433223,-0.577727,-0.267308,0.604294
996,1.461870,-1.853597,1.611229,-1.259840,0.257795,-0.869855,-0.077028,0.229175,1.979296,-0.221801
997,-0.564122,2.729894,1.829650,0.437344,1.450382,1.640369,1.895507,1.108648,3.767926,-1.765008
998,1.824179,-2.367409,0.445785,-0.176667,0.124945,-0.963625,0.825621,-2.904529,0.956421,-1.970945


In [4]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X1      1000 non-null   float64
 1   X2      1000 non-null   float64
 2   X3      1000 non-null   float64
 3   X4      1000 non-null   float64
 4   X5      1000 non-null   float64
 5   X6      1000 non-null   float64
 6   X7      1000 non-null   float64
 7   X8      1000 non-null   float64
 8   X9      1000 non-null   float64
 9   X10     1000 non-null   float64
dtypes: float64(10)
memory usage: 78.2 KB


#### Split dataset into training dataset and testing dataset

In [5]:
# Hold out a test split (size left at the library default); the fixed seed
# makes the partition repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    random_state=1,
)

#### Classification Models

In [6]:
# One untuned baseline per model family.
# The estimators that draw random numbers while fitting (boosting, forest,
# single tree) get a fixed seed so the reported metrics are reproducible on a
# fresh kernel; the other estimators are left at their defaults.
lr = LogisticRegression()
ada = AdaBoostClassifier(random_state=1)
rfc = RandomForestClassifier(random_state=1)
gnb = GaussianNB()
svc = SVC()
dtc = DecisionTreeClassifier(random_state=1)

#### Train Model on training dataset

In [7]:
# Gather the estimators in one list so they can be trained and evaluated uniformly.
models = [lr, ada, rfc, gnb, svc, dtc]
for model in models:
    # Each estimator is fitted in place on the training split.
    model.fit(X=train_X, y=train_y)

#### Evaluate predictions on the testing dataset

In [8]:
def predict(X, y, model):
    """Print a classification report for ``model`` evaluated on ``X``.

    Args:
        X: Feature matrix handed to ``model.predict``.
        y: True labels the predictions are compared against.
        model: Estimator exposing a ``predict`` method; it is expected to
            have been fitted already.

    Returns:
        None. The report is written to standard output.
    """
    report = classification_report(y, model.predict(X))
    print(report)

In [9]:
# For every estimator, print its repr followed by its report on the held-out split.
for model in models:
    print(model)
    predict(X=test_X, y=test_y, model=model)

LogisticRegression()
              precision    recall  f1-score   support

           0       0.74      0.70      0.72        87
           1       0.69      0.77      0.73        75
           2       0.87      0.83      0.85        88

    accuracy                           0.77       250
   macro avg       0.77      0.77      0.77       250
weighted avg       0.77      0.77      0.77       250

AdaBoostClassifier()
              precision    recall  f1-score   support

           0       0.68      0.78      0.73        87
           1       0.69      0.71      0.70        75
           2       0.89      0.74      0.81        88

    accuracy                           0.74       250
   macro avg       0.75      0.74      0.74       250
weighted avg       0.76      0.74      0.75       250

RandomForestClassifier()
              precision    recall  f1-score   support

           0       0.85      0.87      0.86        87
           1       0.88      0.85      0.86        75
        

### Classification on Breast Cancer Dataset

#### Load dataset 

In [10]:
# Load the breast cancer dataset bundled with scikit-learn; the cells below read
# its DESCR, data, feature_names and target attributes.
data = datasets.load_breast_cancer()

In [11]:
# Show the dataset's built-in description (instances, attributes, feature meanings).
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [12]:
# Build the feature table, labelling each column with the dataset's feature names.
X = pd.DataFrame(
    data=data.data,
    columns=data.feature_names,
)

In [13]:
# Class labels for each row of X, taken from the dataset's target attribute.
y = data.target

In [14]:
# Count missing values per column (isna is the same check as isnull).
X.isna().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64

In [15]:
# (rows, columns) of the feature table.
X.shape

(569, 30)

#### Split dataset

In [16]:
# Fix the seed so the train/test partition, and every metric computed from it,
# is the same on each Restart & Run All. Without random_state the split is
# reshuffled on every run.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=1)

#### Classification Models

In [17]:
# One baseline per model family for the breast cancer data.
# The estimators that draw random numbers while fitting (boosting, forest,
# single tree) get a fixed seed so the reported metrics are reproducible on a
# fresh kernel.
# NOTE(review): max_iter is raised to 3000, presumably so the solver converges
# on these unscaled features; confirm, or scale the inputs instead.
lr = LogisticRegression(max_iter=3000)
ada = AdaBoostClassifier(random_state=1)
rfc = RandomForestClassifier(random_state=1)
gnb = GaussianNB()
svc = SVC()
dtc = DecisionTreeClassifier(random_state=1)

#### Train Model on training dataset

In [18]:
# Gather the estimators in one list so they can be trained and evaluated uniformly.
models = [lr, ada, rfc, gnb, svc, dtc]
for model in models:
    # Each estimator is fitted in place on the training split.
    model.fit(X=train_X, y=train_y)

#### Evaluate predictions on the testing dataset

In [19]:
# For every estimator, print its repr followed by its report on the held-out split.
for model in models:
    print(model)
    predict(X=test_X, y=test_y, model=model)

LogisticRegression(max_iter=3000)
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        50
           1       0.98      0.98      0.98        93

    accuracy                           0.97       143
   macro avg       0.97      0.97      0.97       143
weighted avg       0.97      0.97      0.97       143

AdaBoostClassifier()
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        50
           1       0.99      0.99      0.99        93

    accuracy                           0.99       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.99      0.99      0.99       143

RandomForestClassifier()
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        50
           1       1.00      0.99      0.99        93

    accuracy                           0.99       143
   macro avg       0.99      0.99      0.99     