In [1]:
# Locate the local Spark installation so that `pyspark` can be imported below.
import findspark
findspark.init()

# Create the Spark context, or reuse the one that is already running:
# constructing a second SparkContext in the same kernel raises, which made
# this cell fail whenever it was executed again.
from pyspark import SparkConf, SparkContext
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local[4]").setAppName("first app")
)

import numpy as np

import time

import matplotlib.pyplot as plt

import math

import random

In [2]:
def parse(x):
    """Convert one split line of the dataset into a ``(features, label)`` pair.

    Parameters
    ----------
    x : list of str
        Fields of one ';'-separated line. The first field is skipped and the
        last field is the class label.

    Returns
    -------
    tuple
        ``('column', 'no need')`` when the last field is ``'chd'`` (the header
        row); otherwise ``(np.ndarray, int)`` holding the converted features
        and the integer label.

    Raises
    ------
    ValueError
        If the label is not an integer literal, or if a non-integer field
        containing ``'.'`` is not a valid float literal.
    """
    if x[-1] == 'chd':
        return ('column', 'no need')

    y_class = int(x[-1])
    converted_x = []

    # x[1:-1] drops the leading field and the trailing label.
    for xi in x[1:-1]:
        try:
            # int() instead of str.isdigit(): isdigit() is False for signed
            # values such as '-5', which were silently encoded as 0 before.
            converted_x.append(int(xi))
            continue
        except ValueError:
            pass

        if '.' in xi:
            converted_x.append(float(xi))
        else:
            # Categorical field: 'Present' -> 1, anything else -> 0.
            converted_x.append(1 if xi == 'Present' else 0)

    return (np.array(converted_x), y_class)

def sigmoid(x, w):
    """Return 1 / (1 + exp(-w.x)) for feature vector ``x`` and weights ``w``.

    The exponent ``-w.x`` is clamped to the range [-100, 100] before calling
    ``math.exp`` so that large-magnitude scores cannot overflow.
    """
    exponent = -np.dot(w, x)
    # Positive exponents are capped from above, the rest from below.
    clamped = min(100, exponent) if exponent > 0 else max(-100, exponent)
    return 1 / (1 + math.exp(clamped))

In [3]:
# k-fold cross-validation of logistic regression trained by gradient ascent
# on the log-likelihood, with the per-sample work distributed through Spark.

def _log_likelihood(dataset, weights):
    """Sum of per-sample log-likelihoods of `dataset` under broadcast `weights`.

    dataset : RDD of (feature_vector, label) pairs as returned by `parse`.
    weights : Spark broadcast whose `.value` is the weight vector.
    The 1e-9 added inside each log keeps the argument away from log(0) when
    the sigmoid saturates at 0 or 1.
    """
    def sample_term(p):
        prob = sigmoid(p[0], weights.value)
        return p[1] * np.log(prob + 1e-9) + (1 - p[1]) * np.log(1 - prob + 1e-9)

    return dataset.map(sample_term).sum()


def _log_likelihood_gradient(dataset, weights):
    """Sum over `dataset` of (label - sigmoid(features, weights)) * features."""
    return dataset.map(lambda p: (p[1] - sigmoid(p[0], weights.value)) * p[0]).sum()


# Dedicated generators make the shuffle and the weight initialisation
# reproducible without touching the global `random` / `np.random` state.
SEED = 0
shuffle_rng = random.Random(SEED)
init_rng = np.random.RandomState(SEED)

total_dataset = sc.textFile('cardiovascular.txt').map(lambda line : line.split(";")).map(parse).collect()

# `parse` marks the header row with a string in the feature slot; filter on
# that marker instead of assuming the header is the first collected element.
origin_dataset = [sample for sample in total_dataset if not isinstance(sample[0], str)]

shuffle_rng.shuffle(origin_dataset)

total_size = len(origin_dataset)
k = 10
# Fold size follows k (it was a literal 10 before, so changing k broke the folds).
# NOTE(review): the trailing total_size % k samples are never part of a test fold.
temp_test_size = total_size // k

D = origin_dataset[0][0].size
learning_rate = [1e+3, 1e+2, 1e+1, 1e+0, 1e-1, 1e-2, 1e-3]
# Only the 1e+0 candidate is evaluated in this run.
learning_rate = learning_rate[3:4]
threshold = 1e-9
# Upper bound on gradient steps per fold; the loop used to be `while True`
# and could spin forever if the likelihood oscillated or became NaN.
max_iterations = 10_000
average_accuracies = []

# NOTE(review): the recorded runs show weights growing to ~1e4 with lr = 1 on
# unscaled features and a summed (not averaged) gradient - consider
# standardising the features or averaging the gradient; left unchanged here.
for lr in learning_rate:
    total_accuracy = 0
    for iterate in range(k):
        fold_start = iterate * temp_test_size
        fold_end = fold_start + temp_test_size
        test_samples = origin_dataset[fold_start:fold_end]
        train_samples = origin_dataset[:fold_start] + origin_dataset[fold_end:]

        test_size = len(test_samples)
        train_size = len(train_samples)

        test_dataset = sc.parallelize(test_samples).cache()
        train_dataset = sc.parallelize(train_samples).cache()

        w = sc.broadcast(init_rng.randn(D))
        prev_likelihood = _log_likelihood(train_dataset, w)

        for _ in range(max_iterations):
            gradient = _log_likelihood_gradient(train_dataset, w)
            stale_w = w
            w = sc.broadcast(stale_w.value + lr * np.array(gradient))
            # Release the executor copies of the superseded weights; one
            # broadcast per step otherwise accumulates for the whole run.
            stale_w.unpersist()
            cur_likelihood = _log_likelihood(train_dataset, w)
            if abs(prev_likelihood - cur_likelihood) < threshold:
                break
            prev_likelihood = cur_likelihood
        else:
            print(f"warning: likelihood did not converge within {max_iterations} iterations")

        print(f"{iterate+1}-fold cross validation")
        print(f"train_size : {train_size}, test_size : {test_size}, total_size : {total_size}, test_size + train_size : {test_size + train_size}, test_size + train_size == total_size : {test_size + train_size == total_size}")
        print(f"Final w: {w.value}")

        # A sample counts as correct when the rounded probability equals its label.
        correct_count = test_dataset.map(lambda p: (p[1] == round(sigmoid(p[0], w.value))))\
                       .sum()

        accuracy = correct_count / test_size
        total_accuracy += accuracy

        print(f"correct_count : {correct_count}")
        print(f"test_size : {test_size}")
        print(f"ratio : {correct_count}/{test_size}")
        print(f"accuracy : {accuracy}")
        print()

        # Drop this fold's cached partitions before the next fold is built.
        train_dataset.unpersist()
        test_dataset.unpersist()

    average_accuracy = total_accuracy/k
    average_accuracies.append(average_accuracy)
    print(f"when learning rate is : {lr}")
    print(f"{k}-fold cross validation average accuracy : {average_accuracy}")
    print()
    print("------------------------------------------------------------------------------------------------")
    print()

# plt.plot(learning_rate, average_accuracies)
# plt.title("logistic regression")
# plt.xlabel("learning_rate")
# plt.ylabel("average_accuracy")
# plt.show()

1-fold cross validation
train_size : 416, test_size : 46, total_size : 462, test_size + train_size : 462, test_size + train_size == total_size : True
Final w: [-1.78457025e+04  4.33059743e+01 -4.42376407e+02 -2.82540002e+03
 -6.77116928e+00 -7.00982069e+03 -3.48136262e+03 -1.93636210e+03
 -3.82250603e+03]
correct_count : 24
test_size : 46
ratio : 24/46
accuracy : 0.5217391304347826

2-fold cross validation
train_size : 416, test_size : 46, total_size : 462, test_size + train_size : 462, test_size + train_size == total_size : True
Final w: [-12665.15008573    960.422937      -50.99380914  -1043.61945933
     81.09079373  -5619.19443356  -2695.49190523   -352.09793779
    392.37642488]
correct_count : 28
test_size : 46
ratio : 28/46
accuracy : 0.6086956521739131

3-fold cross validation
train_size : 416, test_size : 46, total_size : 462, test_size + train_size : 462, test_size + train_size == total_size : True
Final w: [-1.13512291e+04  1.00050031e+03  1.12847769e+00 -1.11611665e+03
  9.