In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('./mushrooms.csv')

In [3]:
df = df.drop(columns=['veil-type'])
df = df.drop(columns=['odor'])
df = df.drop(columns=['cap-shape'])
df = df.drop(columns=['cap-surface'])
df = df.drop(columns=['cap-color'])
df = df.drop(columns=['gill-attachment'])
df = df.drop(columns=['stalk-root'])
df = df.drop(columns=['stalk-surface-above-ring'])
df = df.drop(columns=['stalk-surface-below-ring'])
df = df.drop(columns=['stalk-color-above-ring'])
df = df.drop(columns=['gill-color'])
df = df.drop(columns=['stalk-color-below-ring'])
df = df.drop(columns=['veil-color'])
df = df.drop(columns=['ring-type'])
df = df.drop(columns=['spore-print-color'])
df = df.drop(columns=['population'])
df = df.drop(columns=['habitat'])

In [4]:
cla = {'e': 0, 'p': 1}
bruises = {'t': 0, 'f': 1}
g_spacing = {'c': 0, 'w': 1, 'd': 2}
g_size = {'b': 0, 'n': 1}
s_shape = {'e': 0, 't': 1}
r_number = {'n': 0, 'o': 1, 't': 2}

for df_d in [df]:
    df_d['class'] = df_d['class'].map(cla)
    df_d['bruises'] = df_d['bruises'].map(bruises)
    df_d['gill-spacing'] = df_d['gill-spacing'].map(g_spacing)
    df_d['gill-size'] = df_d['gill-size'].map(g_size)
    df_d['stalk-shape'] = df_d['stalk-shape'].map(s_shape)
    df_d['ring-number'] = df_d['ring-number'].map(r_number)

In [5]:
df

Unnamed: 0,class,bruises,gill-spacing,gill-size,stalk-shape,ring-number
0,1,0,0,1,0,1
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,1,0,0,1,0,1
4,0,1,1,0,1,1
...,...,...,...,...,...,...
8119,0,1,0,0,0,1
8120,0,1,0,0,0,1
8121,0,1,0,0,0,1
8122,1,1,0,1,1,1


In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(df.drop('class', axis=1), df['class'], test_size=0.2, random_state=42)

In [7]:
X_test = X_test.to_numpy()
X_train = X_train.to_numpy()
Y_test = Y_test.to_numpy()
Y_train = Y_train.to_numpy()

### Метрики

In [8]:
def mymetrics(matrix):
    tp = matrix.loc['predicted_1', 'actual_1']
    fp = matrix.loc['predicted_1', 'actual_0']
    fn = matrix.loc['predicted_0', 'actual_1']
    tn = matrix.loc['predicted_0', 'actual_0']

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    return accuracy, precision, recall

def confusion_matrix(y_pred, y_test):
    matrix = pd.DataFrame({'actual_1' : [0, 0], 'actual_0': [0, 0]})
    matrix.index = ['predicted_1', 'predicted_0']

    for i in range(len(y_pred)):
        if y_pred[i] == 1 and y_test[i] == 1:
            matrix.loc['predicted_1', 'actual_1'] += 1
        elif y_pred[i] == 1 and y_test[i] == 0:
            matrix.loc['predicted_1', 'actual_0'] += 1
        elif y_pred[i] == 0 and y_test[i] == 1:
            matrix.loc['predicted_0', 'actual_1'] += 1
        else:
            matrix.loc['predicted_0', 'actual_0'] += 1
    return matrix

### Логистическая регрессия

In [9]:
def sigmoid(z):
  return 1 / (1 + np.exp(-z))

class LogisticRegressor:
  def __init__(self):
    self.loss = []

  def count_loss(self, x, y):
    z = np.dot(x, self.weights)
    predicted_true = y * np.log(sigmoid(z))
    predicted_false = (1 - y) * np.log(1 - sigmoid(z))
    return -np.sum(predicted_true + predicted_false) / x.shape[0]

  def fit(self, x, y, nepoch = 1000, lr = 0.01, log_loss = False):
    self.weights = np.random.rand(x.shape[1])
    for _ in range(nepoch):
      predicted = sigmoid(np.dot(x, self.weights))
      self.weights -= lr * np.dot(x.T, predicted - y) / x.shape[0]
      if log_loss:
        self.loss.append(self.count_loss(x, y))

  def predict(self, x):
    prediction = np.dot(x, self.weights)
    return [1 if predict > 0.5 else 0 for predict in sigmoid(prediction)]

In [10]:
lgr = LogisticRegressor()
lgr.fit(X_train, Y_train)
predicted = lgr.predict(X_test)
accuracy_score(predicted, Y_test)

0.8843076923076924

In [11]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)
predicted = lr.predict(X_test)
accuracy_score(predicted, Y_test)

0.8947692307692308

In [12]:
hyperparameters = [
(0.1, 500), 
(0.1, 1000), 
(0.1, 1500),
(0.01, 5000),
(0.01, 10000),
(0.001, 5000),
(0.001, 10000),
(0.001, 15000)
]

for pair in hyperparameters:
    lr, nepoch = pair
    lgr = LogisticRegressor()
    lgr.fit(X_train, Y_train, nepoch = nepoch, lr = lr)
    predicted_test = lgr.predict(X_test)
    predicted_train = lgr.predict(X_train)
    print(f'Learning rate = {lr}, nepoch = {nepoch}, train accuracy = {accuracy_score(Y_train, predicted_train)}, test accuracy = {accuracy_score(Y_test, predicted_test)}')

Learning rate = 0.1, nepoch = 500, train accuracy = 0.909216802585013, test accuracy = 0.9003076923076923
Learning rate = 0.1, nepoch = 1000, train accuracy = 0.9072164948453608, test accuracy = 0.8984615384615384
Learning rate = 0.1, nepoch = 1500, train accuracy = 0.9072164948453608, test accuracy = 0.8984615384615384
Learning rate = 0.01, nepoch = 5000, train accuracy = 0.909216802585013, test accuracy = 0.9003076923076923
Learning rate = 0.01, nepoch = 10000, train accuracy = 0.909216802585013, test accuracy = 0.9003076923076923
Learning rate = 0.001, nepoch = 5000, train accuracy = 0.7688875211571011, test accuracy = 0.7575384615384615
Learning rate = 0.001, nepoch = 10000, train accuracy = 0.9102938913679027, test accuracy = 0.9009230769230769
Learning rate = 0.001, nepoch = 15000, train accuracy = 0.9102938913679027, test accuracy = 0.9009230769230769


### Самописные метрики

In [13]:
logit = LogisticRegressor()
logit.fit(X_train, Y_train, nepoch = 5000, lr = 0.01)
Y_pred = logit.predict(X_test)

In [14]:
met = mymetrics(confusion_matrix(Y_pred, Y_test))
print(f'Accuracy: {met[0]}\nPrecision: {met[1]}\nRecall: {met[2]}')

Accuracy: 0.9003076923076923
Precision: 0.8904282115869018
Recall: 0.9040920716112532


In [15]:
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,707,87
predicted_0,75,756


### Метрики из sklearn

In [24]:
model = LogisticRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

In [27]:
met = mymetrics(confusion_matrix(Y_pred, Y_test))
print(f'Accuracy: {met[0]}\nPrecision: {met[1]}\nRecall: {met[2]}')

Accuracy: 0.8947692307692308
Precision: 0.8983050847457628
Recall: 0.881074168797954


In [28]:
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,689,78
predicted_0,93,765


### Дерево решений

In [29]:
class Node:
  def __init__(self, index = None, value = None, left = None, right = None, gain = 0, predicted = None):
    self.index = index
    self.value = value
    self.left = left
    self.right = right
    self.gain = gain
    self.predicted = predicted


class ClassifyingDecisionTree:
  def __init__(self, max_depth = 2, min_samples_split = 2):
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split

  def fit(self, x, y):
    data = np.concatenate((x, y), axis=1)
    self.root = self.build_tree(data)

  def predict(self, x):
    predicted = [self.single_prediction(obs, self.root) for obs in x]
    return predicted

  def single_prediction(self, x, node):
    if node.predicted != None:
      return node.predicted
    if x[node.index] <= node.value:
      return self.single_prediction(x, node.left)
    else:
      return self.single_prediction(x, node.right)

  def gini(self, rows):
    unique = np.unique(rows)
    impurity = 1
    for cls in unique:
      prob = len(rows[rows == cls]) / float(len(rows))
      impurity -= prob ** 2
    return impurity

  def gain(self, current_uncertainty, left, right):
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * self.gini(left) - (1 - p) * self.gini(right)

  def split(self, data, index, value):
    left = np.array([item for item in data if item[index] <= value])
    right = np.array([item for item in data if item[index] > value])
    return left, right

  def find_split(self, data, num_features):
    split = Node()
    current_uncertainty = self.gini(data[:, -1])
    for index in range(num_features):
      values = data[:, index]
      for value in np.unique(values):
        left, right = self.split(data, index, value)
        if len(left) == 0 or len(right) == 0:
          continue
        gain = self.gain(current_uncertainty, left[:, -1], right[:, -1])
        if gain > split.gain:
          split.index = index
          split.value = value
          split.left = left
          split.right = right
          split.gain = gain
    return split

  def fit1(self, x, y):
    data = np.concatenate((x, y[:, np.newaxis]), axis=1)
    self.root = self.build_tree(data)

  def build_tree(self, data, current_depth = 0):
    X, Y = data[:, :-1], data[:, -1]
    num_samples, num_features = np.shape(X)
    if num_samples >= self.min_samples_split and current_depth <= self.max_depth:
      split = self.find_split(data, num_features)
      if split.gain > 0:
        left = self.build_tree(split.left, current_depth + 1)
        right = self.build_tree(split.right, current_depth + 1)
        split.left = left
        split.right = right
        return split
    value = max(list(Y), key = list(Y).count)
    return Node(predicted = value)

In [30]:
depths = [1, 2, 3]
for depth in depths:
  crt = ClassifyingDecisionTree(max_depth = depth)
  crt.fit1(X_train, Y_train)
  predicted = crt.predict(X_test)
  print("Depth = {}, accuracy = {}".format(depth, accuracy_score(predicted, Y_test)))

Depth = 1, accuracy = 0.7532307692307693
Depth = 2, accuracy = 0.864
Depth = 3, accuracy = 0.9046153846153846


In [31]:
for depth in depths:
  sk_crt = DecisionTreeClassifier(max_depth = depth)
  sk_crt.fit(X_train, Y_train)
  predicted = sk_crt.predict(X_test)
  print("Depth = {}, accuracy = {}".format(depth, accuracy_score(predicted, Y_test)))

Depth = 1, accuracy = 0.7409230769230769
Depth = 2, accuracy = 0.7532307692307693
Depth = 3, accuracy = 0.864


### Самописные метрики

In [32]:
tree = ClassifyingDecisionTree(min_samples_split=1, max_depth=1)
tree.fit(pd.DataFrame(X_train), pd.DataFrame(Y_train))
Y_pred = tree.predict(X_test)

In [34]:
met = mymetrics(confusion_matrix(Y_pred, Y_test))
print(f'Accuracy: {met[0]}\nPrecision: {met[1]}\nRecall: {met[2]}')

Accuracy: 0.7532307692307693
Precision: 0.9223946784922394
Recall: 0.5319693094629157


In [35]:
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,416,35
predicted_0,366,808


### Метрики из sklearn

In [36]:
model = DecisionTreeClassifier(min_samples_split=1., max_depth=1).fit(X_train, Y_train)
Y_pred = model.predict(X_test)

In [38]:
met = mymetrics(confusion_matrix(Y_pred, Y_test))
print(f'Accuracy: {met[0]}\nPrecision: {met[1]}\nRecall: {met[2]}')

Accuracy: 0.7409230769230769
Precision: 0.8546168958742633
Recall: 0.5562659846547314


In [39]:
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,435,74
predicted_0,347,769


### Random forest

In [40]:
from collections import Counter

class RandomForest:
  def __init__(self, num_trees = 10, max_depth = 2, min_samples_split = 2):
    self.num_trees = num_trees
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split
    self.forest = []

  def get_sample(self, x, y):
    random_rows = np.random.choice(a = x.shape[0], size = x.shape[0])
    return x[random_rows], y[random_rows]

  def fit(self, x, y):
    for _ in range(self.num_trees):
      sample_x, sample_y = self.get_sample(x, y)
      tree = ClassifyingDecisionTree(self.max_depth, self.min_samples_split)
      tree.fit1(sample_x, sample_y)
      self.forest.append(tree)

  def predict(self, x):
    prediction = []
    y = []
    for tree in self.forest:
      y.append(tree.predict(x))
    y = np.swapaxes(a = y, axis1 = 0, axis2 = 1)  
    for obs in y:
      counter = Counter(obs)
      prediction.append(counter.most_common(1)[0][0])
    return prediction

In [41]:
num_trees = [2, 5, 10]
for n in num_trees:
  forest = RandomForest(num_trees = n)
  forest.fit(X_train, Y_train)
  predicted = forest.predict(X_test)
  print("Number of trees = {}, accuracy = {}".format(n, accuracy_score(predicted, Y_test)))

Number of trees = 2, accuracy = 0.8990769230769231
Number of trees = 5, accuracy = 0.8990769230769231
Number of trees = 10, accuracy = 0.864


In [42]:
for n in num_trees:
  forest = RandomForestClassifier(n_estimators = n)
  forest.fit(X_train, Y_train)
  predicted = forest.predict(X_test)
  print("Number of trees = {}, accuracy = {}".format(n, accuracy_score(predicted, Y_test)))

Number of trees = 2, accuracy = 0.9046153846153846
Number of trees = 5, accuracy = 0.9046153846153846
Number of trees = 10, accuracy = 0.9046153846153846


### Самописные метрики

In [43]:
mytree = RandomForest(num_trees = 2)
mytree.fit(X_train, Y_train)
# Y_pred = mytree.predict(X_test)

In [44]:
met = mymetrics(confusion_matrix(mytree.predict(X_test), Y_test))
print(f'Accuracy: {met[0]}\nPrecision: {met[1]}\nRecall: {met[2]}')

Accuracy: 0.8990769230769231
Precision: 0.8872180451127819
Recall: 0.9053708439897699


In [45]:
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,435,74
predicted_0,347,769


### Метрики из sklearn

In [46]:
model = RandomForestClassifier(n_estimators=2)
model.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [48]:
met = mymetrics(confusion_matrix(model.predict(X_test), Y_test))
print(f'Accuracy: {met[0]}\nPrecision: {met[1]}\nRecall: {met[2]}')

Accuracy: 0.9046153846153846
Precision: 0.9288645690834473
Recall: 0.8682864450127877


In [49]:
confusion_matrix(Y_pred, Y_test)

Unnamed: 0,actual_1,actual_0
predicted_1,435,74
predicted_0,347,769
