In [1]:
matrix = []

with open("spambase.data", "r") as raw_data:
    for raw_line in raw_data:
        line = [float(x) for x in raw_line.split(",")]
        matrix.append(line)

In [2]:
import pandas as pd

In [3]:
data = pd.DataFrame(matrix)

In [4]:
row, col = data.shape

In [5]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [6]:
X, y = data.iloc[:,:col - 1], data[col - 1]

In [7]:
X.shape, y.shape

((4601, 57), (4601,))

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [10]:
len([x for x in y_train if x == 1.0])/len(y_train)

0.3939130434782609

In [11]:
logReg = LogisticRegression()

In [12]:
logReg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
logReg.predict_proba(X_test)

array([[0.02397271, 0.97602729],
       [0.01535504, 0.98464496],
       [0.82500247, 0.17499753],
       ...,
       [0.02469427, 0.97530573],
       [0.98855791, 0.01144209],
       [0.34863189, 0.65136811]])

In [14]:
logReg.score(X_train, y_train)

0.933623188405797

In [15]:
logReg.score(X_test, y_test)

0.9331016507384883

In [16]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [17]:
confusion_matrix(logReg.predict(X_test), y_test)

array([[663,  43],
       [ 34, 411]])

In [18]:
tn, fp, fn, tp = confusion_matrix(logReg.predict(X_test), y_test).ravel()

In [19]:
tn, fp, fn, tp

(663, 43, 34, 411)

In [20]:
accuracy_score(logReg.predict(X_test), y_test)

0.9331016507384883

In [21]:
precision_score(logReg.predict(X_test), y_test)

0.9052863436123348

In [22]:
recall_score(logReg.predict(X_test), y_test)

0.9235955056179775

In [23]:
f1_score(logReg.predict(X_test), y_test)

0.914349276974416

In [24]:
cvLogReg = LogisticRegressionCV(cv=5, verbose=1)

In [25]:
cvLogReg.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.6s finished


LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=1)

In [26]:
cvLogReg.score(X_test, y_test)

0.9322328410078193

In [27]:
cvLogReg.score(X_train, y_train)

0.9321739130434783

In [28]:
cvLogReg.coef_

array([[-2.45757329e-01, -1.45810518e-01,  1.13767173e-01,
         1.02489481e+00,  5.31016465e-01,  5.89645647e-01,
         2.50780285e+00,  5.74807342e-01,  1.60697585e+00,
         7.44774909e-02, -1.79883566e-01, -1.80949478e-01,
        -2.25410439e-01,  1.02312657e-01,  8.28306520e-01,
         1.05280069e+00,  1.17532594e+00,  1.27500468e-01,
         1.15668815e-01,  1.05409088e+00,  2.76806271e-01,
         3.71610710e-01,  2.45193409e+00,  3.67579381e-01,
        -1.64401085e+00, -1.11091448e+00, -3.94301936e+00,
         4.70512833e-01, -1.33602284e+00, -4.57550825e-01,
        -5.56814686e-01,  1.73943417e-02, -5.43840998e-01,
        -3.51466120e-01, -1.32420481e+00,  1.10329537e+00,
         4.18707766e-02, -5.30305793e-01, -7.08874814e-01,
        -5.15612708e-01, -1.72050315e+00, -2.16649071e+00,
        -8.29641680e-01, -1.44976930e+00, -5.61480749e-01,
        -1.37193695e+00, -9.24359310e-01, -1.67664822e+00,
        -1.52041955e+00, -2.49972699e-01, -4.95515172e-0

In [29]:
import numpy as np

In [30]:
feature_sort = sorted(X_train.columns, key=lambda x: -cvLogReg.coef_.reshape(-1, 1)[x])

In [31]:
feature_sort[0]

52

In [32]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [33]:
LDA = LinearDiscriminantAnalysis()

In [34]:
LDA.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [35]:
LDA.score(X_train, y_train)

0.8985507246376812

In [36]:
LDA.score(X_test, y_test)

0.894005212858384

In [37]:
from sklearn.neighbors import KNeighborsClassifier

In [38]:
neighbors = [3, 5, 7, 9]
for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    print(f"Result for {n}-nearest neighbors")
    print("On train set")
    print(knn.score(X_train, y_train))
    print("On test set")
    print(knn.score(X_test, y_test))
    print("-----------------------------------")

Result for 3-nearest neighbors
On train set
0.9020289855072464
On test set
0.7975673327541268
-----------------------------------
Result for 5-nearest neighbors
On train set
0.8663768115942029
On test set
0.7862728062554301
-----------------------------------
Result for 7-nearest neighbors
On train set
0.8420289855072464
On test set
0.7949609035621199
-----------------------------------
Result for 9-nearest neighbors
On train set
0.8339130434782609
On test set
0.7862728062554301
-----------------------------------


In [39]:
knn.predict_proba(X_test)

array([[0.33333333, 0.66666667],
       [0.11111111, 0.88888889],
       [0.88888889, 0.11111111],
       ...,
       [0.88888889, 0.11111111],
       [0.55555556, 0.44444444],
       [0.55555556, 0.44444444]])

In [40]:
from sklearn.tree import DecisionTreeClassifier

In [41]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [42]:
dtc.score(X_test, y_test)

0.9157254561251086

In [43]:
dtc.score(X_train, y_train)

1.0

In [44]:
from sklearn.model_selection import cross_val_score
cross_val_score(dtc, X_train, y_train, cv=10)

array([0.91618497, 0.91304348, 0.90434783, 0.90724638, 0.92173913,
       0.89565217, 0.89275362, 0.88985507, 0.88405797, 0.90988372])

In [45]:
def euclideanDistance(p1, p2, length):
    dist = 0
    for i in range(length):
        dist += np.square(p1.iloc[i] - p2.iloc[i])
        
    return np.sqrt(dist)

In [46]:
def get_neighbors(training_set, labels, input_instance, k):
    distances = []
    for idx in range(training_set.shape[0]):
        instance = training_set.iloc[idx]
        dist = euclideanDistance(instance, input_instance, len(input_instance))
        distances.append((instance, dist, labels.iloc[idx]))
    distances.sort(key=lambda x: x[1])
    return distances[:k]

In [47]:
def get_class(training_set, labels, input_instance, k):
    neighbors = get_neighbors(training_set, labels, input_instance, k)
    classes = {}
    for n in neighbors:
        label = n[2]
        if label not in classes:
            classes[label] = 0
        classes[label] += 1
    
    for label in classes:
        classes[label] = (classes[label] * 1.0)/k
    
    return classes

In [48]:
print(get_class(X_train, y_train, X_test.iloc[0], 3))

{0.0: 0.6666666666666666, 1.0: 0.3333333333333333}


In [49]:
y_test.iloc[0]

1.0

In [50]:
1.0 == 1

True

In [51]:
import heapq

class BinaryKNNClassifier:
    def __init__(self, labels=[1,0], n_neighbors=5):
        self.labels = labels
        self.n_neighbors = n_neighbors
    
    def fit(self, X, y):
        self.X = X
        self.y = y
    
    def euclideanDistance(self, p1, p2):
        return np.linalg.norm(p1 - p2)
    
    def predict(self, test_set):
        result = []
        for i, test in test_set.iterrows():
            distances = []
            heapq.heapify(distances)
            
            for idx, train in X.iterrows():
                dist = self.euclideanDistance(test, train)
                if len(distances) < self.n_neighbors:
                    heapq.heappush(distances, (-dist, y.loc[idx]))
                elif distances[0][0] <= -dist:
                    heapq.heapreplace(distances, (-dist, y.loc[idx]))
                
            
            r = [0] * len(self.labels)
            for n in distances:
                r[int(n[1])] += 1
            result.append([x * 1.0/self.n_neighbors for x in r])
        return result

In [52]:
knnBinary = BinaryKNNClassifier(n_neighbors=9)

In [53]:
knnBinary.fit(X_train, y_train)

In [54]:
r = knnBinary.predict(X_test[:5])

In [55]:
r

[[0.2222222222222222, 0.7777777777777778],
 [0.1111111111111111, 0.8888888888888888],
 [0.8888888888888888, 0.1111111111111111],
 [0.7777777777777778, 0.2222222222222222],
 [1.0, 0.0]]

In [56]:
knn.predict_proba(X_test[:10])

array([[0.33333333, 0.66666667],
       [0.11111111, 0.88888889],
       [0.88888889, 0.11111111],
       [0.77777778, 0.22222222],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.44444444, 0.55555556],
       [0.66666667, 0.33333333],
       [0.66666667, 0.33333333]])

In [57]:
knn.classes_

array([0., 1.])

In [58]:
# r = knnBinary.predict(X_test)
X_test.shape

(1151, 57)

In [59]:
X_train.shape

(3450, 57)

In [60]:
r

[[0.2222222222222222, 0.7777777777777778],
 [0.1111111111111111, 0.8888888888888888],
 [0.8888888888888888, 0.1111111111111111],
 [0.7777777777777778, 0.2222222222222222],
 [1.0, 0.0]]

In [163]:
## K fold cross validation
class KFoldCrossValidation:
    def __init__(self, model, labels=[0, 1], k=5):
        self.model = model
        self.labels = labels
        self.k = k
        
    def fit(self, X, y):
        self.X = X
        self.y = y
        
    def cross_validation(self, data_folds):
        score = []

        for i in range(self.k):
            train_test, label_test = data_folds[i]
            
            for j in range(self.k):
                if j != i:
                    train_data, label = data_folds[j]
                    self.model.fit(train_data, label)
                    s = self.model.score(train_test, label_test)

                    score.append(s)
        
        return score
    
    def kfold(self):
        size = int(self.X.shape[0] / self.k)

        i = 0
        result = []
        data_folds = []
        
        while i <= self.X.shape[0] - size:
            fold_i = self.X[i : i + size]
            label_i = self.y[i : i + size]
            data_folds.append((fold_i, label_i))
            i += size
            
        score = self.cross_validation(data_folds)
        return np.average(score)

In [164]:
knn = KNeighborsClassifier()
kfc = KFoldCrossValidation(model=knn, k=5)

In [165]:
kfc.fit(X_train, y_train)

In [166]:
r = kfc.kfold()

In [167]:
r

0.7430434782608695

In [168]:
cross_val_score(cv=3, X=X_train, y=y_train, estimator=knn)

array([0.77478261, 0.79130435, 0.80782609])

In [169]:
np.average(cross_val_score(cv=5, X=X_train, y=y_train, estimator=KNeighborsClassifier()))

0.7913071148793771