In [2]:
import os
# Project folder on Google Drive. The path is absolute, so the drive has to be
# mounted at /content/drive before this cell runs.
# NOTE(review): presumably a Colab runtime — confirm, and consider making the
# folder configurable so the notebook runs elsewhere.
path = '/content/drive/MyDrive/CIS520 Machine Learning 2020FALL/project'
# Make the project folder the working directory so that relative file names
# used later (e.g. 'data.csv') resolve against it.
os.chdir(path)
# Last expression of the cell: displays the folder contents as a sanity check.
os.listdir(path)

['lr_baseline.ipynb',
 'model_comparison.ipynb',
 'data.csv',
 'data_preview.ipynb',
 'knn_baseline.ipynb']

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

### Prepare datasets

In [4]:
def prepare_data(csv_path='data.csv'):
  """Load the dataset and return a cleaned copy ready for feature selection.

  Args:
    csv_path: Location of the CSV file. Defaults to 'data.csv', resolved
      against the current working directory, so existing `prepare_data()`
      calls behave as before.

  Returns:
    A DataFrame in which the 'diagnosis' column has been replaced by the
    integer codes produced by `LabelEncoder` (codes follow the sorted order
    of the original labels) and every column that contains at least one
    missing value has been removed.

  Raises:
    KeyError: if the file has no 'diagnosis' column.
  """
  raw = pd.read_csv(csv_path)

  # Encode the target before dropping columns, exactly as the rest of the
  # notebook expects; `assign` keeps the column in its original position.
  encoded = raw.assign(diagnosis=LabelEncoder().fit_transform(raw['diagnosis']))

  # Column-wise dropna: a single NaN is enough to remove the whole column.
  # NOTE(review): this would also silently drop a feature column that has
  # only a few gaps — confirm that only empty/unused columns are affected.
  return encoded.dropna(axis='columns')

In [5]:
# Load and clean the table once; later cells read this notebook-level `data`.
data = prepare_data()

In [6]:
def prepare_datasets(df_input):
  """Split a prepared table into the model inputs and the target.

  Args:
    df_input: DataFrame that contains the ten '*_mean' feature columns
      listed below and a 'diagnosis' column.

  Returns:
    A `(X, y)` tuple: `X` holds the ten feature columns in the order listed,
    `y` is the 'diagnosis' column.

  Raises:
    KeyError: if any of the required columns is missing from `df_input`.
  """
  target_column = 'diagnosis'
  mean_feature_columns = [
      'radius_mean',
      'texture_mean',
      'perimeter_mean',
      'area_mean',
      'smoothness_mean',
      'compactness_mean',
      'concavity_mean',
      'concave points_mean',
      'symmetry_mean',
      'fractal_dimension_mean',
  ]

  return df_input[mean_feature_columns], df_input[target_column]

In [7]:
# Select the features/target, then hold out 30% of the rows for testing.
# The seed is fixed so the split is the same on every run.
X, y = prepare_datasets(data)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

# Report the size of each split.
print("training data points: {}".format(y_train.shape[0]))
print("testing data points: {}".format(y_test.shape[0]))

training data points: 398
testing data points: 171


### Cross-validation

In [8]:
# Logistic-regression solvers compared in the cross-validation below; the same
# list also supplies the 'solver' column of the results table.
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


def lr_cross_validation(features=None, target=None, solver_names=None, cv=5, random_state=42):
  """Compare logistic-regression solvers with k-fold cross-validation.

  For every solver an L2-penalised `LogisticRegression` (max_iter=5000) is
  cross-validated once, and accuracy and recall are both collected from that
  single pass.

  Args:
    features: Feature matrix. Defaults to the notebook-level `X`.
    target: Target vector. Defaults to the notebook-level `y`.
    solver_names: Iterable of solver names. Defaults to the notebook-level
      `solvers`.
    cv: Cross-validation setting forwarded to `cross_validate` (default 5).
    random_state: Seed forwarded to `LogisticRegression` so that solvers
      which shuffle the data give repeatable scores.

  Returns:
    Six lists, each with one entry per solver, in this order:
    average, max and min accuracy, then average, max and min recall.

  NOTE(review): the default `X`/`y` are the full dataset, which includes the
  rows held out as `X_test`/`y_test`; pass `X_train, y_train` to keep the
  solver choice independent of the test split.
  NOTE(review): no feature scaling is applied in the visible code; the
  scikit-learn docs say 'sag'/'saga' only converge quickly on similarly
  scaled features — consider a scaling pipeline before comparing solvers.
  """
  # Resolve defaults at call time so a bare `lr_cross_validation()` keeps
  # using the notebook-level data and solver list.
  if features is None:
    features = X
  if target is None:
    target = y
  if solver_names is None:
    solver_names = solvers

  avg_accs, max_accs, min_accs = [], [], []  # accuracy
  avg_recs, max_recs, min_recs = [], [], []  # recall

  for solver_name in solver_names:
    clf = LogisticRegression(penalty='l2', solver=solver_name, max_iter=5000,
                             random_state=random_state)

    # One pass yields both metrics from the same fitted models, instead of
    # refitting every fold once per metric.
    fold_scores = cross_validate(clf, features, target, cv=cv,
                                 scoring=('accuracy', 'recall'))
    accs = fold_scores['test_accuracy']
    recs = fold_scores['test_recall']

    avg_accs.append(np.average(accs))
    max_accs.append(np.max(accs))
    min_accs.append(np.min(accs))
    avg_recs.append(np.average(recs))
    max_recs.append(np.max(recs))
    min_recs.append(np.min(recs))

  return avg_accs, max_accs, min_accs, avg_recs, max_recs, min_recs


# Run the solver comparison once and unpack the six per-solver lists
# (average/max/min accuracy, then average/max/min recall).
avg_accs, max_accs, min_accs, avg_recs, max_recs, min_recs = lr_cross_validation()

In [9]:
# Gather the per-solver cross-validation statistics into one table:
# one row per solver, one column per statistic.
cv_result = {
    'solver': solvers,
    'avg accuracy': avg_accs,
    'max accuracy': max_accs,
    'min accuracy': min_accs,
    'avg recall': avg_recs,
    'max recall': max_recs,
    'min recall': min_recs,
}

result_frame = pd.DataFrame(data=cv_result)
# Last expression of the cell: rendered as the comparison table.
result_frame

Unnamed: 0,solver,avg accuracy,max accuracy,min accuracy,avg recall,max recall,min recall
0,newton-cg,0.906862,0.929825,0.859649,0.849945,0.952381,0.72093
1,lbfgs,0.908632,0.929825,0.859649,0.845183,0.928571,0.72093
2,liblinear,0.903369,0.938596,0.868421,0.812292,0.928571,0.674419
3,sag,0.875283,0.921053,0.833333,0.751163,0.880952,0.604651
4,saga,0.877022,0.929825,0.842105,0.751163,0.880952,0.604651


### Logistic regression baseline

In [None]:
def get_lr_accuracy(X_train, y_train, X_test, y_test):
  """Fit the baseline logistic regression and score it on the test split.

  The model is an L2-penalised `LogisticRegression` using the 'newton-cg'
  solver with at most 2000 iterations.

  Args:
    X_train: Training features.
    y_train: Training labels.
    X_test: Test features.
    y_test: Test labels.

  Returns:
    The value of the fitted model's `score(X_test, y_test)`, i.e. its
    accuracy on the test split.
  """
  baseline = LogisticRegression(penalty='l2', solver='newton-cg', max_iter=2000)
  return baseline.fit(X_train, y_train).score(X_test, y_test)

In [None]:
def get_lr_recall(X_train, y_train, X_test, y_test):
  """Fit the baseline logistic regression and return its recall on the test split.

  The model is an L2-penalised `LogisticRegression` using the 'newton-cg'
  solver with at most 2000 iterations.

  Args:
    X_train: Training features.
    y_train: Training labels.
    X_test: Test features.
    y_test: Test labels.

  Returns:
    `recall_score(y_test, predictions)` for the model's predictions on
    `X_test`, using the scorer's default settings.
  """
  model = LogisticRegression(penalty='l2', solver='newton-cg', max_iter=2000)
  predictions = model.fit(X_train, y_train).predict(X_test)
  return recall_score(y_test, predictions)

In [None]:
# Fit on the training split and report accuracy on the held-out test split.
acc = get_lr_accuracy(X_train, y_train, X_test, y_test)
print('Logistic regression baseline accuracy:', acc)

Logistic regression baseline accuracy: 0.935672514619883


In [None]:
# Fit on the training split and report recall on the held-out test split.
rec = get_lr_recall(X_train, y_train, X_test, y_test)
print('Logistic regression baseline recall:', rec)

Logistic regression baseline recall: 0.9365079365079365


In [None]:
def lr_report():
  """Print a classification report for the baseline logistic regression.

  Reads the notebook-level `X_train`, `y_train`, `X_test` and `y_test`,
  fits an L2-penalised `LogisticRegression` ('newton-cg', max_iter=2000) on
  the training split and prints `classification_report` for its predictions
  on the test split. Returns nothing.
  """
  model = LogisticRegression(penalty='l2', solver='newton-cg', max_iter=2000)
  predictions = model.fit(X_train, y_train).predict(X_test)
  print(classification_report(y_test, predictions))


# Print the per-class precision/recall/F1 report for the baseline model.
lr_report()

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       108
           1       0.89      0.94      0.91        63

    accuracy                           0.94       171
   macro avg       0.93      0.94      0.93       171
weighted avg       0.94      0.94      0.94       171

