In [1]:
import numpy
import urllib
import scipy.optimize
import random
from sklearn import svm # support vector machine (see webpage)
from sklearn import linear_model # logistic regression
import ast

Data from https://archive.ics.uci.edu/dataset/365/polish+companies+bankruptcy+data

In [2]:
f = open('5year.arff', 'r')

In [3]:
while not '@data' in f.readline():
    pass

In [4]:
dataset = []
for l in f:
    if '?' in l: # Missing entry
        continue
    l = l.split(',')
    values = [1] + [float(x) for x in l]
    values[-1] = values[-1] > 0 # Convert to bool
    dataset.append(values)

In [5]:
dataset[0]

[1,
 0.088238,
 0.55472,
 0.01134,
 1.0205,
 -66.52,
 0.34204,
 0.10949,
 0.57752,
 1.0881,
 0.32036,
 0.10949,
 0.1976,
 0.096885,
 0.10949,
 1475.2,
 0.24742,
 1.8027,
 0.10949,
 0.077287,
 50.199,
 1.1574,
 0.13523,
 0.062287,
 0.41949,
 0.32036,
 0.20912,
 1.0387,
 0.026093,
 6.1267,
 0.37788,
 0.077287,
 155.33,
 2.3498,
 0.24377,
 0.13523,
 1.4493,
 571.37,
 0.32101,
 0.095457,
 0.12879,
 0.11189,
 0.095457,
 127.3,
 77.096,
 0.45289,
 0.66883,
 54.621,
 0.10746,
 0.075859,
 1.0193,
 0.55407,
 0.42557,
 0.73717,
 0.73866,
 15182.0,
 0.080955,
 0.27543,
 0.91905,
 0.002024,
 7.2711,
 4.7343,
 142.76,
 2.5568,
 3.2597,
 False]

Data setup

In [6]:
X = [x[:-1] for x in dataset]
y = [x[-1] for x in dataset]

Fit Model

In [7]:
mod = linear_model.LogisticRegression(C=1.0) # C=1.0 tells accuracy and complexity equally important

In [8]:
mod.fit(X,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [9]:
pred = mod.predict(X)

In [10]:
pred

array([False, False, False, ..., False, False, False])

In [11]:
correct = pred == y

In [12]:
correct

array([ True,  True,  True, ..., False, False, False])

In [13]:
acc = sum(correct) / len(correct)

In [14]:
acc

0.9656878917848895

In [15]:
sum(y), len(y) # Nearly everything labeled False

(102, 3031)

In [16]:
1 - 102 / 3031 # predict all false

0.9663477400197954

Balanced model

In [17]:
mod = linear_model.LogisticRegression(C=1.0, class_weight='balanced')

In [18]:
mod.fit(X,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight='balanced')

In [19]:
pred = mod.predict(X)

In [20]:
sum(pred == y) / len(y)

0.6958099637083471

Train/validation/test splits

In [21]:
random.shuffle(dataset)

In [22]:
X = [d[:-1] for d in dataset]

In [23]:
y = [d[-1] for d in dataset]

In [24]:
N = len(y)

In [25]:
N

3031

In [26]:
Ntrain = 1000
Nvalid = 1000
Ntest = 1031

In [27]:
Xtrain = X[:Ntrain]
Xvalid = X[Ntrain:Ntrain+Nvalid]
Xtest = X[Ntrain+Nvalid:]

In [28]:
ytrain = y[:Ntrain]
yvalid = y[Ntrain:Ntrain+Nvalid]
ytest = y[Ntrain+Nvalid:]

In [29]:
mod.fit(Xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight='balanced')

TP, TN, FP, FN, Accuracy, BER

In [30]:
pred = mod.predict(Xtest)

In [31]:
correct = pred == ytest

In [32]:
accuracy = sum(correct) / len(correct)

In [33]:
accuracy

0.7322987390882638

In [34]:
pred = pred.tolist()

In [35]:
pred

[False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 Fals

In [36]:
TP_ = [(a and b) for (a,b) in zip(pred, ytest)]
FP_ = [(a and not b) for (a,b) in zip(pred, ytest)]
TN_ = [(not a and not b) for (a,b) in zip(pred, ytest)]
FN_ = [(not a and b) for (a,b) in zip(pred, ytest)]

In [37]:
TP = sum(TP_)
FP = sum(FP_)
TN = sum(TN_)
FN = sum(FN_)

In [40]:
# accuracy
(TP + TN) / (TP + TN + FP + FN)

0.7322987390882638

In [41]:
# rates
TPR = TP / (TP + FN)
TNR = TN / (TN + FP)

In [42]:
# BER
BER = 1 - 0.5 * (TPR + TNR)

In [44]:
BER # 0.5 is random, it should be smaller than that

0.2662546462918166

Ranking

In [48]:
scores = mod.decision_function(Xtest)

In [49]:
scores

array([-0.52889809,  0.22401715, -0.59998249, ..., -2.07648361,
       -0.34956207, -0.75487542])

In [50]:
scoreslabels = list(zip(scores, ytest))

In [51]:
scoreslabels

[(-0.5288980925607862, False),
 (0.22401715362914243, False),
 (-0.5999824899466915, False),
 (-0.310430361571051, False),
 (-0.40786500136980175, False),
 (-1.177889635947298, False),
 (-0.4875108809808856, False),
 (-0.8880758652727172, False),
 (-0.3153534995673902, False),
 (-7.556237116778027, False),
 (-0.26176955562689086, False),
 (0.398294161250038, False),
 (0.6873459802564487, False),
 (1.2743709280263573, False),
 (-0.4030582279291609, False),
 (0.3993716319620637, False),
 (-0.23640219500313508, False),
 (0.2807129573408438, False),
 (-0.9141096520517504, False),
 (-0.797750345421659, False),
 (-0.28825899122810594, False),
 (-3.3894945083490375, False),
 (0.25223264206124274, False),
 (-0.0172370359788268, False),
 (-0.20496024303382243, False),
 (0.09249596070622627, True),
 (0.07218457239257937, False),
 (-0.8359994656214422, False),
 (-0.8317827027594228, False),
 (-0.6052015641380775, False),
 (-0.7892823629228395, False),
 (-0.8707082563663253, False),
 (-0.028383239

In [52]:
scoreslabels.sort(reverse=True)

In [53]:
scoreslabels

[(86.97689088605874, False),
 (26.018834167577598, False),
 (8.377056362089254, False),
 (7.761503443859105, False),
 (6.49123522981434, False),
 (6.302768521036462, True),
 (6.176168805328418, False),
 (5.8430890923478565, True),
 (5.231646311186491, True),
 (5.193198382762205, True),
 (4.022437823669628, False),
 (3.857152553670157, True),
 (3.241425899949835, False),
 (3.203219590859299, False),
 (3.1653228002725253, False),
 (3.0276539671841323, False),
 (2.8795786429012513, False),
 (2.6503249807901605, False),
 (2.6386030360763195, False),
 (2.618203890444845, True),
 (2.6089758357598187, False),
 (2.509982377119168, False),
 (2.3300736148476746, False),
 (2.317085478377373, False),
 (2.224832578930217, True),
 (2.2023845040892747, False),
 (2.192314632356778, False),
 (2.0293051974533327, True),
 (2.027334114932032, False),
 (2.027126650657382, False),
 (1.9971276327872727, False),
 (1.9770244898987799, False),
 (1.8183656010579585, True),
 (1.792036483305315, False),
 (1.748693

In [54]:
sortedlabels = [x[1] for x in scoreslabels]

In [55]:
sortedlabels

[False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,

In [56]:
# precision
retrieved = sum(pred)
relevant = sum(ytest)
intersection = TP

In [57]:
intersection / retrieved

0.08561643835616438

In [58]:
# recall
intersection / relevant

0.7352941176470589

In [59]:
# precision at 10
sum(sortedlabels[:10]) / 10

0.4