### Name: Weihan Feng
### UID:  605948377

In [1]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.discrete.discrete_model import Logit
from imblearn.over_sampling import SMOTENC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LassoCV, RidgeCV, LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, precision_recall_curve, mean_squared_error
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, zero_one_loss
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [5]:
data = load_breast_cancer()


In [7]:
### (a) Standardize features
X = data.data
Y = data.target
print(X.shape, Y.shape)
### standard
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

(569, 30) (569,)


In [8]:
### (b) Kmeans
### full set of features
kmeans_1 = KMeans(n_clusters = 2, random_state = 42).fit(X_scaled)
y_pred_1 = kmeans_1.labels_
loss_1 = zero_one_loss(Y, y_pred_1)
print(f'Loss on full features: {loss_1}')
### first 2 principal features
pca = PCA(n_components=2)
X_scaled_pca = pca.fit_transform(X_scaled)
kmeans_2 = KMeans(n_clusters = 2, random_state = 42).fit(X_scaled_pca)
y_pred_2 = kmeans_2.labels_
loss_2 = zero_one_loss(Y, y_pred_2)
print(f'Loss on first 2 features: {loss_2}')


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Loss on full features: 0.9050966608084359
Loss on first 2 features: 0.09314586994727592


In [10]:
### (c) Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size = 0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(455, 30) (455,)
(114, 30) (114,)


In [11]:
### (d) Logistic regression
lr_model = LogisticRegression(random_state = 0).fit(X_train, y_train)
y_train_pred = lr_model.predict(X_train)
f1 = f1_score(y_train, y_train_pred)
print(f'F1 score by LR is {f1}')
mis_clf_rate = 1 - accuracy_score(y_train, y_train_pred)
print(f'Misclassification rate is {mis_clf_rate}')

F1 score by LR is 0.9897260273972602
Misclassification rate is 0.01318681318681314


In [12]:
### (e) Fit a Random Forest by GridSearch
depth_list = range(10,60,5)
f1_list = []
mis_list = []
rf = RandomForestClassifier(n_estimators = 100)
searching_space={'max_depth':depth_list}
gc = GridSearchCV(rf, param_grid = searching_space,cv = 5,scoring = 'f1')
model_grid = gc.fit(X_train, y_train)


In [13]:
best_param = model_grid.best_params_
best_model = model_grid.best_estimator_
print(best_param)
print(best_model)


{'max_depth': 40}
RandomForestClassifier(max_depth=40)


In [14]:
y_train_pred = best_model.predict(X_train)
f1_train = f1_score(y_train, y_train_pred)
mis_rate_train = 1 - accuracy_score(y_train, y_train_pred)
print(f'f1 score of train data: {f1_train}')
print(f'misclassification rate of train data: {mis_rate_train}')

y_test_pred = best_model.predict(X_test)
f1_test = f1_score(y_test, y_test_pred)
mis_rate_test = 1 - accuracy_score(y_test, y_test_pred)
print(f'f1 score of test data: {f1_test}')
print(f'misclassification rate of test data: {mis_rate_test}')


f1 score of train data: 1.0
misclassification rate of train data: 0.0
f1 score of test data: 0.9774436090225564
misclassification rate of test data: 0.02631578947368418
