### 1. Import modules

In [65]:
# import libraries
import pandas as pd
import numpy as np

# import scikit-learn utilities for splitting the data into train and test sets, and for preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# import models (one regressor and three classifiers)
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# import metrics for evaluating the models
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import mean_squared_error, r2_score

# import sample data provided by sklearn
from sklearn.datasets import load_breast_cancer

### 2. Load, Preprocess and Preview Data

In [41]:
# Pull the breast-cancer dataset bundled with scikit-learn and wrap it in a
# DataFrame: one column per feature, plus the class label in a `target` column.
data_bc = load_breast_cancer()
df = pd.DataFrame(data_bc.data, columns=data_bc.feature_names).assign(
    target=data_bc.target
)

# Preview the first five rows
print(df.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

### 3. Train-Test Split

In [42]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible. The features are every column except `target`.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.25,
    random_state=42,
)
# Show the training features as the cell output
X_train

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
287,12.890,13.12,81.89,515.9,0.06955,0.03729,0.02260,0.01171,0.1337,0.05581,...,13.620,15.54,87.40,577.0,0.09616,0.11470,0.11860,0.05366,0.2309,0.06915
512,13.400,20.52,88.64,556.7,0.11060,0.14690,0.14450,0.08172,0.2116,0.07325,...,16.410,29.66,113.30,844.4,0.15740,0.38560,0.51060,0.20510,0.3585,0.11090
402,12.960,18.29,84.18,525.2,0.07351,0.07899,0.04057,0.01883,0.1874,0.05899,...,14.130,24.61,96.31,621.9,0.09329,0.23180,0.16040,0.06608,0.3207,0.07247
446,17.750,28.03,117.30,981.6,0.09997,0.13140,0.16980,0.08293,0.1713,0.05916,...,21.530,38.54,145.40,1437.0,0.14010,0.37620,0.63990,0.19700,0.2972,0.09075
210,20.580,22.14,134.70,1290.0,0.09090,0.13480,0.16400,0.09561,0.1765,0.05024,...,23.240,27.84,158.30,1656.0,0.11780,0.29200,0.38610,0.19200,0.2909,0.05865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,8.888,14.64,58.79,244.0,0.09783,0.15310,0.08606,0.02872,0.1902,0.08980,...,9.733,15.67,62.56,284.4,0.12070,0.24360,0.14340,0.04786,0.2254,0.10840
106,11.640,18.33,75.17,412.5,0.11420,0.10170,0.07070,0.03485,0.1801,0.06520,...,13.140,29.26,85.51,521.7,0.16880,0.26600,0.28730,0.12180,0.2806,0.09097
270,14.290,16.82,90.30,632.6,0.06429,0.02675,0.00725,0.00625,0.1508,0.05376,...,14.910,20.65,94.44,684.6,0.08567,0.05036,0.03866,0.03333,0.2458,0.06120
435,13.980,19.62,91.12,599.5,0.10600,0.11330,0.11260,0.06463,0.1669,0.06544,...,17.040,30.80,113.90,869.3,0.16130,0.35680,0.40690,0.18270,0.3179,0.10550


### 4. Standardization

In [46]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so the test data never influences the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 5. Initialize, train and evaluate different models

  #### Linear Regression

In [63]:
# Train a linear regression on the standardized training features
# (`fit` returns the estimator itself, so construction and fitting are chained).
lin_reg = LinearRegression().fit(X_train_scaled, y_train)
# Predict on the standardized test features
lin_reg_y_pred = lin_reg.predict(X_test_scaled)

In [66]:
# Report the regression metrics for the linear model on the held-out test set
for label, metric in (
    ('Mean Squared Error:', mean_squared_error),
    ('R^2 Score:', r2_score),
):
    print(label, metric(y_test, lin_reg_y_pred))

Mean Squared Error: 0.06289049307232394
R^2 Score: 0.7324078874665101


#### Logistic Regression


In [52]:
# Train logistic regression on the standardized training features; max_iter is
# raised to 1000 to give the solver more iterations to converge.
log_reg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
# Predict class labels for the standardized test features
log_reg_y_pred = log_reg.predict(X_test_scaled)

In [54]:
# Evaluate the logistic regression on the held-out test set:
# overall accuracy, confusion matrix, and per-class precision/recall/F1.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy is
# symmetric, but the order is kept consistent with the other two calls.
print(accuracy_score(y_test, log_reg_y_pred))
print(confusion_matrix(y_test, log_reg_y_pred))
print(classification_report(y_test, log_reg_y_pred))

0.9790209790209791
[[53  1]
 [ 2 87]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97        54
           1       0.99      0.98      0.98        89

    accuracy                           0.98       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143



#### Decision Tree Classifier

In [57]:
# Fit the decision tree classifier on the unscaled training features
# (this cell deliberately uses X_train / X_test rather than the scaled arrays).
# random_state is fixed so the fitted tree, and therefore the metrics below,
# are reproducible across runs; 42 matches the seed used for the split.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
# Make predictions
tree_clf_y_pred = tree_clf.predict(X_test)

In [58]:
# Evaluate the decision tree on the held-out test set:
# overall accuracy, confusion matrix, and per-class precision/recall/F1.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy is
# symmetric, but the order is kept consistent with the other two calls.
print(accuracy_score(y_test, tree_clf_y_pred))
print(confusion_matrix(y_test, tree_clf_y_pred))
print(classification_report(y_test, tree_clf_y_pred))

0.9370629370629371
[[51  3]
 [ 6 83]]
              precision    recall  f1-score   support

           0       0.89      0.94      0.92        54
           1       0.97      0.93      0.95        89

    accuracy                           0.94       143
   macro avg       0.93      0.94      0.93       143
weighted avg       0.94      0.94      0.94       143



#### Random Forest Classifier

In [59]:
# Fit the random forest classifier on the unscaled training features.
# A random forest is stochastic (bootstrap samples and random feature subsets),
# so random_state is fixed to make the reported metrics reproducible;
# 42 matches the seed used for the train/test split.
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)
# Make predictions
forest_clf_y_pred = forest_clf.predict(X_test)

In [60]:
# Evaluate the random forest on the held-out test set:
# overall accuracy, confusion matrix, and per-class precision/recall/F1.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy is
# symmetric, but the order is kept consistent with the other two calls.
print(accuracy_score(y_test, forest_clf_y_pred))
print(confusion_matrix(y_test, forest_clf_y_pred))
print(classification_report(y_test, forest_clf_y_pred))

0.972027972027972
[[51  3]
 [ 1 88]]
              precision    recall  f1-score   support

           0       0.98      0.94      0.96        54
           1       0.97      0.99      0.98        89

    accuracy                           0.97       143
   macro avg       0.97      0.97      0.97       143
weighted avg       0.97      0.97      0.97       143

