In [13]:
from pandas import read_csv
from sklearn.model_selection import train_test_split, cross
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [14]:
def load_dataset(filename):
    """Read a header-less CSV file and split it into features and target.

    Parameters
    ----------
    filename : path or buffer accepted by ``pandas.read_csv``
        The file is read with ``header=None``, so the first row is data.

    Returns
    -------
    X : ndarray
        Every column except the last, with all values cast to ``str``.
    y : ndarray
        The last column, returned as read (no cast is applied).
    """
    table = read_csv(filename, header=None).values
    # The last column is the target; everything before it is an input.
    features = table[:, :-1].astype(str)
    target = table[:, -1]
    return features, target

In [15]:
def prepare_inputs(X_train, X_test):
    """Ordinal-encode the input features of both splits.

    The encoder is fitted on ``X_train`` only and then applied to both
    splits, so the test data does not influence the learned categories.

    Returns
    -------
    (X_train_enc, X_test_enc) : tuple
        The encoded training and test inputs, in that order.

    NOTE(review): the encoder is created with default settings; a category
    present only in ``X_test`` is expected to make ``transform`` raise --
    confirm against the installed scikit-learn version.
    """
    encoder = OrdinalEncoder().fit(X_train)
    return encoder.transform(X_train), encoder.transform(X_test)

In [16]:
def prepare_targets(y_train, y_test):
    """Label-encode the target values of both splits.

    The ``LabelEncoder`` is fitted on ``y_train`` only; the same fitted
    encoder is then used to transform the training and the test targets.

    Returns
    -------
    (y_train_enc, y_test_enc) : tuple
        The encoded training and test targets, in that order.
    """
    encoder = LabelEncoder().fit(y_train)
    # Same fitted encoder for both splits, training split first.
    y_train_enc, y_test_enc = (
        encoder.transform(labels) for labels in (y_train, y_test)
    )
    return y_train_enc, y_test_enc

In [17]:
def select_features(X_train_enc, y_train_enc, X_test_enc, k=4, random_state=None):
    """Keep the ``k`` best features according to mutual information.

    The selector is fitted on the training data only and then applied to
    both splits, so the same columns are kept in each.

    Parameters
    ----------
    X_train_enc, y_train_enc
        Encoded training inputs and targets used to score the features.
    X_test_enc
        Encoded test inputs; only transformed, never used for fitting.
    k : int, default 4
        Number of features to keep (forwarded to ``SelectKBest``).
    random_state : optional
        Seed forwarded to ``mutual_info_classif`` so the scores -- and hence
        the selected columns -- are repeatable. ``None`` (the default) leaves
        the score function exactly as scikit-learn provides it.

    Returns
    -------
    (X_train_fs, X_test_fs) : tuple
        The reduced training and test inputs, in that order.
    """
    if random_state is None:
        # Default path: identical to passing mutual_info_classif directly.
        score_func = mutual_info_classif
    else:
        # SelectKBest calls score_func(X, y); bind the seed via a closure.
        def score_func(X, y):
            return mutual_info_classif(X, y, random_state=random_state)

    # NOTE(review): the inputs look ordinal-encoded categoricals, yet
    # mutual_info_classif's ``discrete_features`` is left at its default --
    # confirm whether it should be set to True.
    fs = SelectKBest(score_func=score_func, k=k)
    fs.fit(X_train_enc, y_train_enc)
    X_train_fs = fs.transform(X_train_enc)
    X_test_fs = fs.transform(X_test_enc)
    return X_train_fs, X_test_fs

In [18]:
# Load the string-typed feature columns (X) and the raw last column (y)
# from the CSV file in the working directory.
X, y = load_dataset('breast-cancer.csv')

In [19]:
# Hold out 33% of the rows as a test set; random_state=1 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [20]:
# Ordinal-encode the inputs; the encoder is fitted on the training split only.
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)

In [21]:
# Label-encode the targets; the encoder is fitted on the training split only.
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)

In [22]:
# Reduce both splits to the features chosen by select_features
# (SelectKBest with mutual information, fitted on the training split).
X_train_fs, X_test_fs = select_features(X_train_enc, y_train_enc, X_test_enc)

In [25]:
# Logistic regression classifier, explicitly using the L-BFGS solver.
model = LogisticRegression(solver='lbfgs')

In [26]:
# Train on the selected training features and the encoded training targets.
model.fit(X_train_fs, y_train_enc)

LogisticRegression()

In [28]:
# Predict encoded class labels for the held-out test features.
yhat = model.predict(X_test_fs)

In [29]:
# Compare the predictions with the encoded test targets.
accuracy = accuracy_score(y_test_enc, yhat)

In [30]:
# Report the accuracy scaled by 100, with two decimals.
print('Accuracy: %.2f' % (accuracy * 100))

Accuracy: 73.68
