**Dependancies**

In [41]:
import numpy as np 
import pandas as pd
import random

# Part 1

**Some EDA**

In [42]:
df = pd.read_csv('transfusion.data')
df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [44]:
df['whether he/she donated blood in March 2007'].value_counts()

0    570
1    178
Name: whether he/she donated blood in March 2007, dtype: int64

From the above information, we can confirm that every column in our DataFrame has the numeric type. Also our target value has two classes!

Let's rename our 'whether he/she donated blood in March 2007' to 'donated'

In [45]:
df.rename(columns={'whether he/she donated blood in March 2007': 'donated'}, inplace=True)
df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),donated
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


**Normalizing**

In [46]:
df.var().round(2)

Recency (months)              65.54
Frequency (times)             34.10
Monetary (c.c. blood)    2131094.23
Time (months)                594.22
donated                        0.18
dtype: float64

From the above information, we only need to normalize the values in 'c.c. blood' column, because we have a high variance there. We can take **log**

In [47]:
df['Monetary (c.c. blood)'] = np.log(df['Monetary (c.c. blood)'])
print(df.var().round(2))
df.head()

Recency (months)          65.54
Frequency (times)         34.10
Monetary (c.c. blood)      0.84
Time (months)            594.22
donated                    0.18
dtype: float64


Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),donated
0,2,50,9.433484,98,1
1,0,13,8.08641,28,1
2,1,16,8.29405,35,1
3,2,20,8.517193,45,1
4,1,24,8.699515,77,0


**Train/Test split**

In [48]:
labels = df['donated'].to_numpy()
df.drop(columns='donated', inplace=True)

In [49]:
# randomly choose 0.75 of the dataset size for train indices
train_indices = random.sample(range(0, len(df)), int(0.75 * len(df)))
test_indices  = list(set(range(0, len(df))) - set(train_indices))
df_np = df.to_numpy()
train_x = df_np[train_indices]
train_y = labels[train_indices].reshape(-1, 1)
test_x = df_np[test_indices]
test_y = labels[test_indices].reshape(-1, 1)
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)
print('test_x shape:', test_x.shape)
print('test_y shape:', test_y.shape)

train_x shape: (561, 4)
train_y shape: (561, 1)
test_x shape: (187, 4)
test_y shape: (187, 1)


## Logistic Regression Classifier

In [50]:
def sigmoid(z):
    return 1/ (1 + np.exp(-z))

def cost_function(theta, X, y):
    
    m = len(y)
    preds = sigmoid(np.dot(X, theta))
    error = (-y * np.log(preds)) - ((1-y) * np.log(1-preds))
    cost = 1/m * sum(error)
    gradient = 1/m * np.dot(X.transpose(), (preds - y))
    
    return gradient

def gradient_descent(X, y, theta, lr, iterations):
    
    m = len(y)
    
    for i in range(iterations):
        grad = cost_function(theta, X, y)
        theta = theta - (lr * grad)
    
    return theta 

def predict(theta, X):
    predictions = X.dot(theta)
    return predictions > 0

In [51]:
m1 , n1 = train_x.shape[0], train_x.shape[1]
X_train = np.append(np.ones((m1, 1)), train_x, axis=1)
y_train = train_y.reshape(m1, 1)
initial_theta = np.zeros((n1 + 1, 1))
final_theta = gradient_descent(X_train, y_train, theta=initial_theta, lr=0.001, iterations=10000)

**Accuracy**

In [52]:
# trainning accuracy
train_pred = predict(final_theta, X_train)
print("Train Accuracy:", sum(train_pred == y_train)[0] * 100 / len(y_train),"%")
# testing accuracy
m2  = test_x.shape[0]
X_test = np.append(np.ones((m2, 1)), test_x, axis=1)
y_test = test_y.reshape(m2, 1)
test_pred = predict(final_theta, X_test)
accuracy1 = sum(test_pred == y_test)[0] * 100 / len(y_test)
print("Test Accuracy:", sum(test_pred == y_test)[0] * 100 / len(y_test),"%")

Train Accuracy: 77.18360071301248 %
Test Accuracy: 77.00534759358288 %


**Recall and F1 Score**

In [53]:
tp = sum((y_test == 1) & (test_pred == 1))[0]
tn = sum((y_test == 0) & (test_pred == 0))[0]
fn = sum((y_test == 1) & (test_pred == 0))[0]
fp = sum((y_test == 0) & (test_pred == 1))[0]
precision1 = (tp)/ float( tp + fp)
recall1 = (tp)/ float( tp + fn)
fscore1 = (2 * precision1 * recall1) / (precision1 + recall1)
print("Recall: ", recall1 * 100, "%")
print("F1-score: ", fscore1) 

Recall:  9.523809523809524 %
F1-score:  0.1568627450980392


## Decision Tree

In [54]:
data = pd.read_csv('transfusion.data')
data.rename(columns={'whether he/she donated blood in March 2007': 'donated'}, inplace=True)
labels = data['donated'].to_numpy()
data.drop(columns='donated', inplace=True)
data.nunique()

Recency (months)         31
Frequency (times)        33
Monetary (c.c. blood)    33
Time (months)            78
dtype: int64

**Decision Tree Classifier (Using GINI Impurity)**

In [55]:
max_depth  = 8

In [56]:
class Node:
    def __init__(self, gini, num_samples, num_samples_per_class, predicted_class):
        self.gini = gini
        self.num_samples = num_samples
        self.num_samples_per_class = num_samples_per_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None

In [57]:
def find_best_split(X, y):
    m = y.size
    if m <= 1:
        return None, None

    num_parent = [np.sum(y == c) for c in range(2)]

    best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
    best_index = None
    best_threshold = None

    for idx in range(X.shape[1]):
        thresholds, classes = zip(*sorted(zip(X[:, idx], y)))

        num_left = [0] * 2
        num_right = num_parent.copy()
        for i in range(1, m):  
            c = classes[i - 1]
            num_left[c] += 1
            num_right[c] -= 1
            gini_left = 1.0 - sum((num_left[x] / i) ** 2 for x in range(2))
            gini_right = 1.0 - sum((num_right[x] / (m - i)) ** 2 for x in range(2))

            gini = (i * gini_left + (m - i) * gini_right) / m

            if thresholds[i] == thresholds[i - 1]:
                continue

            if gini < best_gini:
                best_gini = gini
                best_index = idx
                best_threshold = (thresholds[i] + thresholds[i - 1]) / 2 

    return best_index, best_threshold

def gini(y):
  m = y.size
  return 1.0 - sum((np.sum(y == c) / m) ** 2 for c in range(2))

def build_tree(X, y, depth = 0):
  num_samples_per_class = [np.sum(y == i) for i in range(2)]
  predicted_class = np.argmax(num_samples_per_class)
  node = Node(gini=gini(y),num_samples=y.size,num_samples_per_class=num_samples_per_class,predicted_class=predicted_class)

  if depth < max_depth:
      index, threshold = find_best_split(X, y)
      if index is not None:
          indices_left = X[:, index] < threshold
          X_left, y_left = X[indices_left], y[indices_left]
          X_right, y_right = X[~indices_left], y[~indices_left]
          node.feature_index = index
          node.threshold = threshold
          node.left = build_tree(X_left, y_left, depth + 1)
          node.right = build_tree(X_right, y_right, depth + 1)
  return node

def predict_tree(X, tree):
  result = []
  for input in X:
    node = tree 
    while node.left:
      if input[node.feature_index] < node.threshold:
          node = node.left
      else:
          node = node.right
    result.append(node.predicted_class)
  return np.array(result)

**Train/Test Split**

In [58]:
# randomly choose 0.75 of the dataset size for train indices
train_indices = random.sample(range(0, len(data)), int(0.75 * len(data)))
test_indices  = list(set(range(0, len(data))) - set(train_indices))
df_np = data.to_numpy()
train_x_2 = df_np[train_indices]
train_y_2 = labels[train_indices]
test_x_2 = df_np[test_indices]
test_y_2 = labels[test_indices]
print('train_x shape:', train_x_2.shape)
print('train_y shape:', train_y_2.shape)
print('test_x shape:', test_x_2.shape)
print('test_y shape:', test_y_2.shape)

train_x shape: (561, 4)
train_y shape: (561,)
test_x shape: (187, 4)
test_y shape: (187,)


**Accuracy**

In [59]:
final_tree = build_tree(train_x_2, train_y_2)
train_pred_2 =  predict_tree(train_x_2, final_tree)
print("Train Accuracy:", sum(train_pred_2 == train_y_2) * 100 / len(train_y_2),"%")
test_pred_2 =  predict_tree(test_x_2, final_tree)
accuracy2 = sum(test_pred_2 == test_y_2) * 100 / len(test_pred_2)
print("Test Accuracy:", sum(test_pred_2 == test_y_2) * 100 / len(test_pred_2),"%")

Train Accuracy: 88.05704099821747 %
Test Accuracy: 73.2620320855615 %


**Recall and F1 Score**

In [60]:
tp = sum((test_y_2 == 1) & (test_pred_2 == 1))
tn = sum((test_y_2 == 0) & (test_pred_2 == 0))
fn = sum((test_y_2 == 1) & (test_pred_2 == 0))
fp = sum((test_y_2 == 0) & (test_pred_2 == 1))
precision2 = (tp)/ float( tp + fp)
recall2 = (tp)/ float( tp + fn)
fscore2 = (2 * precision2 * recall2) / (precision2 + recall2)
print("Recall: ", recall2 * 100, "%")
print("F1-score: ", fscore2) 

Recall:  20.454545454545457 %
F1-score:  0.2647058823529412


# Part 2

Let's have a recap to our results in part 1:

In [61]:
print('Accuracy of Logistic Regression:', accuracy1.round(2), '%')
print('Recall of Logistic Regression:', recall1.round(2) * 100, '%')
print('F1-score of Logistic Regression:', fscore1.round(2))
print('---------------------')
print('Accuracy of Decision Tree:', accuracy2.round(2), '%')
print('Recall of Decision Tree:', recall2.round(2) * 100, '%')
print('F1-score of Decision Tree:', fscore2.round(2))

Accuracy of Logistic Regression: 77.01 %
Recall of Logistic Regression: 10.0 %
F1-score of Logistic Regression: 0.16
---------------------
Accuracy of Decision Tree: 73.26 %
Recall of Decision Tree: 20.0 %
F1-score of Decision Tree: 0.26


Recall is a measure of how many of the positive cases the classifier correctly predicted, over all the positive cases in the data and F1-score a combination of both recall and precision and when recall is low, then we expect the F1-score to becomes low too.



In [62]:
np.unique(labels, return_counts=True)

(array([0, 1]), array([570, 178]))

As we can see in the above code output, the number of samples from class 0 is almost 3 times more than the number of samples in class 1. Therefore we have a low recall, because recall is all dependent on positive samples (from class 1).

They way to solve this problem, is to make the number of samples from each class equal. Duplicationg the samples in class 1 for 2 times is a good idea, because in this case we would somehow give **weight 3** to the samples in class 


# Part 3

According to what we discussed in part 2, let's first duplicate the samples in class 1 for 2 times in the dataset.

In [63]:
all_data = pd.read_csv('transfusion.data')
all_data.rename(columns={'whether he/she donated blood in March 2007': 'donated'}, inplace=True)
all_data['Monetary (c.c. blood)'] = np.log(all_data['Monetary (c.c. blood)']) # little normalizing
class1 = all_data[(all_data['donated'] == 1)]
all_data = all_data.append(class1)
all_data = all_data.append(class1)
print('New data shape:', all_data.shape)
np.unique(all_data['donated'].to_numpy(), return_counts=True)

New data shape: (1104, 5)


(array([0, 1]), array([570, 534]))

As you can see above, the number of classes are almost equal to eachother

In [64]:
labels = all_data['donated'].to_numpy()
all_data.drop(columns='donated', inplace=True)

**Create train and test splits for logistic regression**

In [65]:
# randomly choose 0.75 of the dataset size for train indices
train_indices = random.sample(range(0, len(all_data)), int(0.75 * len(all_data)))
test_indices  = list(set(range(0, len(all_data))) - set(train_indices))
df_np = all_data.to_numpy()
train_x_3 = df_np[train_indices]
train_y_3 = labels[train_indices].reshape(-1, 1)
test_x_3 = df_np[test_indices]
test_y_3 = labels[test_indices].reshape(-1, 1)
print('train_x shape:', train_x_3.shape)
print('train_y shape:', train_y_3.shape)
print('test_x shape:', test_x_3.shape)
print('test_y shape:', test_y_3.shape)

train_x shape: (828, 4)
train_y shape: (828, 1)
test_x shape: (276, 4)
test_y shape: (276, 1)


**Create train and test splits for Decition Tree**

In [66]:
# randomly choose 0.75 of the dataset size for train indices
train_indices = random.sample(range(0, len(all_data)), int(0.75 * len(all_data)))
test_indices  = list(set(range(0, len(all_data))) - set(train_indices))
df_np = all_data.to_numpy()
train_x_4 = df_np[train_indices]
train_y_4 = labels[train_indices]
test_x_4 = df_np[test_indices]
test_y_4 = labels[test_indices]
print('train_x shape:', train_x_4.shape)
print('train_y shape:', train_y_4.shape)
print('test_x shape:', test_x_4.shape)
print('test_y shape:', test_y_4.shape)

train_x shape: (828, 4)
train_y shape: (828,)
test_x shape: (276, 4)
test_y shape: (276,)


## Logistic Regression Classifier

In [67]:
m1 , n1 = train_x_3.shape[0], train_x_3.shape[1]
X_train_3 = np.append(np.ones((m1, 1)), train_x_3, axis=1)
y_train_3 = train_y_3.reshape(m1, 1)
initial_theta = np.zeros((n1 + 1, 1))
final_theta_new = gradient_descent(X_train_3, y_train_3, theta=initial_theta, lr=0.001, iterations=10000)

**Accuracy**

In [68]:
# trainning accuracy
train_pred_new = predict(final_theta_new, X_train_3)
print("Train Accuracy:", sum(train_pred_new == y_train_3)[0] * 100 / len(y_train_3),"%")
# testing accuracy
m2  = test_x_3.shape[0]
X_test_3 = np.append(np.ones((m2, 1)), test_x_3, axis=1)
y_test_3 = test_y_3.reshape(m2, 1)
test_pred_new = predict(final_theta_new, X_test_3)
accuracy3 = sum(test_pred_new == y_test_3)[0] * 100 / len(y_test_3)
print("Test Accuracy:", sum(test_pred_new == y_test_3)[0] * 100 / len(y_test_3),"%")

Train Accuracy: 70.41062801932367 %
Test Accuracy: 69.56521739130434 %


**Recall and F1 Score**

In [69]:
tp = sum((y_test_3 == 1) & (test_pred_new == 1))[0]
tn = sum((y_test_3 == 0) & (test_pred_new == 0))[0]
fn = sum((y_test_3 == 1) & (test_pred_new == 0))[0]
fp = sum((y_test_3 == 0) & (test_pred_new == 1))[0]
precision3 = (tp)/ float( tp + fp)
recall3 = (tp)/ float( tp + fn)
fscore3 = (2 * precision3 * recall3) / (precision3 + recall3)
print("Recall previous: ", recall1 * 100, "%")
print("Recall now: ", recall3 * 100, "%")
print("F1-score previous: ", fscore1) 
print("F1-score now: ", fscore3) 

Recall previous:  9.523809523809524 %
Recall now:  77.6923076923077 %
F1-score previous:  0.1568627450980392
F1-score now:  0.7062937062937064


We can clearly see the improvement of Recall and F1-score. now let's also check for decition tree.

## Decision Tree

In [70]:
max_depth = 8

**Accuracy**

In [71]:
final_tree_new = build_tree(train_x_4, train_y_4)
train_pred_4 =  predict_tree(train_x_4, final_tree_new)
print("Train Accuracy:", sum(train_pred_4 == train_y_4) * 100 / len(train_y_4),"%")
test_pred_4 =  predict_tree(test_x_4, final_tree)
accuracy4 = sum(test_pred_4 == test_y_4) * 100 / len(test_pred_4)
print("Test Accuracy:", sum(test_pred_4 == test_y_4) * 100 / len(test_pred_4),"%")

Train Accuracy: 82.97101449275362 %
Test Accuracy: 71.73913043478261 %


**Recall and F1 Score**

In [72]:
tp = sum((test_y_4 == 1) & (test_pred_4 == 1))
tn = sum((test_y_4 == 0) & (test_pred_4 == 0))
fn = sum((test_y_4 == 1) & (test_pred_4 == 0))
fp = sum((test_y_4 == 0) & (test_pred_4 == 1))
precision4 = (tp)/ float( tp + fp)
recall4 = (tp)/ float( tp + fn)
fscore4 = (2 * precision4 * recall4) / (precision4 + recall4)
print("Recall previous: ", recall2 * 100, "%")
print("Recall now: ", recall4 * 100, "%")
print("F1-score previous: ", fscore2) 
print("F1-score now: ", fscore4) 

Recall previous:  20.454545454545457 %
Recall now:  47.794117647058826 %
F1-score previous:  0.2647058823529412
F1-score now:  0.625


We can also see the improvement for decision tree

# Part 4

In [73]:
weights = np.full(len(train_x_2), 1 / len(train_x_2))
M = 10
max_depth = 1
models = []
alphas = []
for j in range(M):
  sample_indices = np.random.choice(len(train_x_2), size = len(train_x_2), replace = True, p = weights)
  sample_train_x = train_x_2[sample_indices, :]
  sample_train_y = train_y_2[sample_indices]
  adaboost_tree = build_tree(sample_train_x, sample_train_y)
  sample_train_pred =  predict_tree(sample_train_x, adaboost_tree)
  j_m = np.sum([weights[i] * (sample_train_pred[i] != sample_train_y[i]) for i in range(len(weights))])
  e_m = j_m / np.sum(weights)
  alpha_m = np.log((1 - e_m) / e_m)
  weights = [weights[i] * np.exp(alpha_m * (sample_train_pred[i] != sample_train_y[i])) for i in range(len(weights))] 
  weights = weights / np.sum(weights)
  models.append(adaboost_tree)
  alphas.append(alpha_m)

In [74]:
# prediction for test set
preds = []
for i in range(M):
  adaboost_test_pred =  predict_tree(test_x_2, models[i])
  preds.append(alphas[i] * adaboost_test_pred)
result = np.zeros((test_y_2.shape))
for i in range(len(preds)):
  result += preds[i]
final_result = np.sign(result)

In [75]:
print("Test Accuracy Adaboost:", sum(final_result == test_y_2) * 100 / len(test_y_2),"%")

Test Accuracy Adaboost: 74.33155080213903 %


In [76]:
tp = sum((test_y_2 == 1) & (final_result == 1))
tn = sum((test_y_2 == 0) & (final_result == 0))
fn = sum((test_y_2 == 1) & (final_result == 0))
fp = sum((test_y_2 == 0) & (final_result == 1))
precision = (tp)/ float( tp + fp)
recall5 = (tp)/ float( tp + fn)
fscore5 = (2 * precision * recall5) / (precision + recall5)
print("Recall: ", recall5 * 100, "%")
print("F1-score: ", fscore5) 

Recall:  13.636363636363635 %
F1-score:  0.19999999999999998


**Let's try the idea for part 2 here**

In [77]:
weights = np.full(len(train_x_4), 1 / len(train_x_4))
M = 10
max_depth = 1
models = []
alphas = []
for j in range(M):
  sample_indices = np.random.choice(len(train_x_4), size = len(train_x_4), replace = True, p = weights)
  sample_train_x = train_x_4[sample_indices, :]
  sample_train_y = train_y_4[sample_indices]
  adaboost_tree = build_tree(sample_train_x, sample_train_y)
  sample_train_pred =  predict_tree(sample_train_x, adaboost_tree)
  j_m = np.sum([weights[i] * (sample_train_pred[i] != sample_train_y[i]) for i in range(len(weights))])
  e_m = j_m / np.sum(weights)
  alpha_m = np.log((1 - e_m) / e_m)
  weights = [weights[i] * np.exp(alpha_m * (sample_train_pred[i] != sample_train_y[i])) for i in range(len(weights))] 
  weights = weights / np.sum(weights)
  models.append(adaboost_tree)
  alphas.append(alpha_m)

In [78]:
# prediction for test set
preds = []
for i in range(M):
  adaboost_test_pred =  predict_tree(test_x_4, models[i])
  preds.append(alphas[i] * adaboost_test_pred)
result = np.zeros((test_y_4.shape))
for i in range(len(preds)):
  result += preds[i]
final_result = np.sign(result)

In [79]:
print("Test Accuracy Adaboost:", sum(final_result == test_y_4) * 100 / len(test_y_4),"%")

Test Accuracy Adaboost: 61.594202898550726 %


In [81]:
tp = sum((test_y_4 == 1) & (final_result == 1))
tn = sum((test_y_4 == 0) & (final_result == 0))
fn = sum((test_y_4 == 1) & (final_result == 0))
fp = sum((test_y_4 == 0) & (final_result == 1))
precision = (tp)/ float( tp + fp)
recall6 = (tp)/ float( tp + fn)
fscore6 = (2 * precision * recall6) / (precision + recall6)
print("Recall previous: ", recall5 * 100, "%")
print("Recall now: ", recall6 * 100, "%")
print("F1-score previos: ", fscore5)
print("F1-score now: ", fscore6) 

Recall previous:  13.636363636363635 %
Recall now:  89.70588235294117 %
F1-score previos:  0.19999999999999998
F1-score now:  0.6971428571428571


We can also see the improvement here too!