## **Part 1:**
### Build a classifier based on KNN (K=4 for testing) using Euclidean distance.

In [3]:
from csv import reader
from math import sqrt

#Parses csv and returns the data.
def load_csv(fileName):
	data = list()
	with open(fileName, 'r') as file:
		csv_reader = reader(file, delimiter = ';')
		for row in csv_reader:
			if not row:
				continue
			data.append(row)
	return data

#Transforms the string val to int value.
#!!This piece of algorithm (function str_to_int) is built up with help of other sources as internet!!
def str_to_val(dataset, column):
	class_values = [row[column] for row in dataset]
	unique = set(class_values)
	lookup = dict()
	for i, value in enumerate(unique):
		lookup[value] = i
	for row in dataset:
		row[column] = lookup[row[column]]
	return lookup

def str_to_int(dataset, column):
  for row in dataset:
    row[column] = int(row[column])

#Calculates distance between two rows.
def calculate_distance(x, y):
  distance = 0.0
  for i in range(len(x) - 2):
    distance += (x[i] - y[i])**2
  return sqrt(distance)  

#-MAIN---------------------------------

strings = [1,2,3,4,6,7,8,10,15,16]
ints = [0,5,9,11,12,13,14]
dataset = load_csv("bank-full.csv")
dataset.pop(0)

for val in strings:
  str_to_val(dataset, val)
for val in ints:
  str_to_int(dataset, val)

#--

In [4]:
confision_matrix = [[0,0],[0,0]]
result_distance = list()

print(f"Processing...")
for i in range(0,1000):

  if i%50 == 0:
    print(f"Calculating, current index= {i} , (1000 is end)")

  #1- calculate distances
  for j in range(int(len(dataset)/5),len(dataset)):
    result_distance.append((calculate_distance(dataset[i], dataset[j]),dataset[j]))

  #2- Sort distances
  result_distance.sort()

  #3- Take best outcome for first 4
  total_no = 0
  total_yes = 0
  for j in range(0,4):
    if result_distance[j][1][len(dataset[j])-1] == 1:
      total_no += 1
    else:
      total_yes +=1  

  #4- Fill confusion matrix
  if dataset[i][len(dataset[i])-1] == 0:
    if total_yes > total_no: #means match
      confision_matrix[0][0] += 1
    else:
      confision_matrix[0][1] += 1
  else:
    if total_yes < total_no: #means match
      confision_matrix[1][1] += 1
    else:
      confision_matrix[1][0] += 1

  result_distance.clear();
print("finished!")


Processing...
Calculating, current index= 0 , (1000 is end)
Calculating, current index= 50 , (1000 is end)
Calculating, current index= 100 , (1000 is end)
Calculating, current index= 150 , (1000 is end)
Calculating, current index= 200 , (1000 is end)
Calculating, current index= 250 , (1000 is end)
Calculating, current index= 300 , (1000 is end)
Calculating, current index= 350 , (1000 is end)
Calculating, current index= 400 , (1000 is end)
Calculating, current index= 450 , (1000 is end)
Calculating, current index= 500 , (1000 is end)
Calculating, current index= 550 , (1000 is end)
Calculating, current index= 600 , (1000 is end)
Calculating, current index= 650 , (1000 is end)
Calculating, current index= 700 , (1000 is end)
Calculating, current index= 750 , (1000 is end)
Calculating, current index= 800 , (1000 is end)
Calculating, current index= 850 , (1000 is end)
Calculating, current index= 900 , (1000 is end)
Calculating, current index= 950 , (1000 is end)
finished!


In [5]:
print(f"Yes--  accurate : {confision_matrix[0][0]} false: {confision_matrix[0][1]} -> accuracy = {confision_matrix[0][0]/(confision_matrix[0][0]+confision_matrix[0][1])}")
print(f" No-- accurate : {confision_matrix[1][1]} false: {confision_matrix[1][0]} -> accuracy = {confision_matrix[1][1]/(confision_matrix[1][0]+confision_matrix[1][1])}")


Yes--  accurate : 7 false: 12 -> accuracy = 0.3684210526315789
 No-- accurate : 909 false: 72 -> accuracy = 0.926605504587156


(1000 runs) 
This looks good enough for "no" class, but "yes" is less accurate. Reason for that is "yes" has less occurences in whole dataset compared to "no".

**It took 5 minutes to calculate!**

# **Part 2:**
## Build a classifier based on KNN (K=4 for testing) using Manhattan distance. 

In [164]:
def calculate_distance_manhattan(x, y):
  distance = 0.0
  for i in range(len(x) - 2):
    distance += abs(x[i] - y[i])
  return distance

#-MAIN---------------------------------
confision_matrix_2 = [[0,0],[0,0]]
result_distance = list()

print(f"Processing...")
for i in range(0,1000):

  if i%50 == 0:
    print(f"Calculating, current index= {i} , (1000 is end)")

  #1- calculate distances
  for j in range(int(len(dataset)/5),len(dataset)):
    result_distance.append((calculate_distance_manhattan(dataset[i], dataset[j]),dataset[j]))

  #2- Sort distances
  result_distance.sort()

  #3- Take best outcome for first 4
  total_no = 0
  total_yes = 0
  for j in range(0,4):
    if result_distance[j][1][len(dataset[j])-1] == 1:
      total_no += 1
    else:
      total_yes +=1  

  #4- Fill confusion matrix
  if dataset[i][len(dataset[i])-1] == 0:
    if total_yes > total_no: #means match
      confision_matrix_2[0][0] += 1
    else:
      confision_matrix_2[0][1] += 1
  else:
    if total_yes < total_no: #means match
      confision_matrix_2[1][1] += 1
    else:
      confision_matrix_2[1][0] += 1

  result_distance.clear();
print("finished!")

Processing...
Calculating, current index= 0 , (1000 is end)
Calculating, current index= 50 , (1000 is end)
Calculating, current index= 100 , (1000 is end)
Calculating, current index= 150 , (1000 is end)
Calculating, current index= 200 , (1000 is end)
Calculating, current index= 250 , (1000 is end)
Calculating, current index= 300 , (1000 is end)
Calculating, current index= 350 , (1000 is end)
Calculating, current index= 400 , (1000 is end)
Calculating, current index= 450 , (1000 is end)
Calculating, current index= 500 , (1000 is end)
Calculating, current index= 550 , (1000 is end)
Calculating, current index= 600 , (1000 is end)
Calculating, current index= 650 , (1000 is end)
Calculating, current index= 700 , (1000 is end)
Calculating, current index= 750 , (1000 is end)
Calculating, current index= 800 , (1000 is end)
Calculating, current index= 850 , (1000 is end)
Calculating, current index= 900 , (1000 is end)
Calculating, current index= 950 , (1000 is end)
finished!


In [165]:
print(f"Yes--  accurate : {confision_matrix_2[0][0]} false: {confision_matrix_2[0][1]} -> accuracy = {confision_matrix_2[0][0]/(confision_matrix_2[0][0]+confision_matrix_2[0][1])}")
print(f" No-- accurate : {confision_matrix_2[1][1]} false: {confision_matrix_2[1][0]} -> accuracy = {confision_matrix_2[1][1]/(confision_matrix_2[1][0]+confision_matrix_2[1][1])}")

Yes--  accurate : 7 false: 12 -> accuracy = 0.3684210526315789
 No-- accurate : 922 false: 59 -> accuracy = 0.9398572884811417


(1000 runs) This looks good enough for "no" class, but "yes" is way less accurate. Reason for that is "yes" has less occurences in whole dataset compared to "no".

**It took 4 minutes to calculate!**

# **Part 3:**
## Build a classifier based on linear SVM

In [66]:
from re import I
import numpy as np
from sklearn import svm

#--
params = np.array(dataset[0:1000])
res = list()
for i in range(0,1000): 
  res.append(dataset[i][len(dataset[i])-1])

clf = svm.SVC(kernel = 'linear', C = 1.0)
clf.fit(params,res)

SVC(kernel='linear')

In [67]:
confision_matrix_3 = [[0,0],[0,0]]

for i in range(1001,2001):
  #Fill confusion matrix
  ress = clf.predict([dataset[i]])
  if dataset[i][len(dataset[i])-1] == 0:
    if  ress == 0: #means match
      confision_matrix_3[0][0] += 1
    else:
      confision_matrix_3[0][1] += 1
  else:
    if ress == 1: #means match
      confision_matrix_3[1][1] += 1
    else:
      confision_matrix_3[1][0] += 1

In [68]:
print(f"Yes--  accurate : {confision_matrix_3[0][0]} false: {confision_matrix_3[0][1]} -> accuracy = {confision_matrix_3[0][0]/(confision_matrix_3[0][0]+confision_matrix_3[0][1])}")
print(f" No-- accurate : {confision_matrix_3[1][1]} false: {confision_matrix_3[1][0]} -> accuracy = {confision_matrix_3[1][1]/(confision_matrix_3[1][0]+confision_matrix_3[1][1])}")

Yes--  accurate : 27 false: 0 -> accuracy = 1.0
 No-- accurate : 952 false: 21 -> accuracy = 0.9784172661870504


(1000 runs, linear) This looks really good for both yes and no!

**It took 3 minutes to calculate!**

# **Part 4**
## Build a classifier based on polynomial SVM. 

In [69]:
clf2 = svm.SVC(kernel = 'poly', degree = 8)
clf2.fit(params,res)

SVC(degree=8, kernel='poly')

In [70]:
confision_matrix_4 = [[0,0],[0,0]]

for i in range(1001,2001):
  #Fill confusion matrix
  ress = clf2.predict([dataset[i]])
  if dataset[i][len(dataset[i])-1] == 0:
    if  ress == 0: #means match
      confision_matrix_4[0][0] += 1
    else:
      confision_matrix_4[0][1] += 1
  else:
    if ress == 1: #means match
      confision_matrix_4[1][1] += 1
    else:
      confision_matrix_4[1][0] += 1

In [71]:
print(f"Yes--  accurate : {confision_matrix_4[0][0]} false: {confision_matrix_4[0][1]} -> accuracy = {confision_matrix_4[0][0]/(confision_matrix_4[0][0]+confision_matrix_4[0][1])}")
print(f" No-- accurate : {confision_matrix_4[1][1]} false: {confision_matrix_4[1][0]} -> accuracy = {confision_matrix_4[1][1]/(confision_matrix_4[1][0]+confision_matrix_4[1][1])}")

Yes--  accurate : 1 false: 26 -> accuracy = 0.037037037037037035
 No-- accurate : 971 false: 2 -> accuracy = 0.9979445015416238


(1000 runs, polynomial, 8 degree) This looks really good for "no" class, but looks terrible for "yes".

**It took couple of seconds to calculate!**

# **Part 5**
## Build a classifier based on DT (Decision Trees). 

In [73]:
from sklearn import tree

clf3 = tree.DecisionTreeClassifier()
clf3 = clf3.fit(params,res)

In [74]:
confision_matrix_5 = [[0,0],[0,0]]

for i in range(1001,2001):
  #Fill confusion matrix
  ress = clf3.predict([dataset[i]])
  if dataset[i][len(dataset[i])-1] == 0:
    if  ress == 0: #means match
      confision_matrix_5[0][0] += 1
    else:
      confision_matrix_5[0][1] += 1
  else:
    if ress == 1: #means match
      confision_matrix_5[1][1] += 1
    else:
      confision_matrix_5[1][0] += 1

In [75]:
print(f"Yes--  accurate : {confision_matrix_5[0][0]} false: {confision_matrix_5[0][1]} -> accuracy = {confision_matrix_5[0][0]/(confision_matrix_5[0][0]+confision_matrix_5[0][1])}")
print(f" No-- accurate : {confision_matrix_5[1][1]} false: {confision_matrix_5[1][0]} -> accuracy = {confision_matrix_5[1][1]/(confision_matrix_5[1][0]+confision_matrix_5[1][1])}")

Yes--  accurate : 27 false: 0 -> accuracy = 1.0
 No-- accurate : 973 false: 0 -> accuracy = 1.0


(1000 runs, desicion tree) This is fully correct for both classes!

**It took couple of seconds to calculate!**