In [1]:
matrix = []

with open("spambase.data", "r") as raw_data:
    for raw_line in raw_data:
        line = [float(x) for x in raw_line.split(",")]
        matrix.append(line)

In [2]:
import pandas as pd

In [3]:
data = pd.DataFrame(matrix)

In [4]:
row, col = data.shape

In [5]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [6]:
X, y = data.iloc[:,:col - 1], data[col - 1]

In [7]:
X.shape, y.shape

((4601, 57), (4601,))

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [10]:
len([x for x in y_train if x == 1.0])/len(y_train)

0.3939130434782609

In [11]:
logReg = LogisticRegression()

In [12]:
logReg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
logReg.predict_proba(X_test)

array([[9.95611182e-01, 4.38881828e-03],
       [1.94124391e-01, 8.05875609e-01],
       [9.99924066e-01, 7.59340033e-05],
       ...,
       [9.69319195e-01, 3.06808052e-02],
       [4.95738664e-03, 9.95042613e-01],
       [9.99992551e-01, 7.44907763e-06]])

In [14]:
logReg.score(X_train, y_train)

0.9330434782608695

In [15]:
logReg.score(X_test, y_test)

0.9296264118158123

In [16]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [17]:
confusion_matrix(logReg.predict(X_test), y_test)

array([[665,  49],
       [ 32, 405]])

In [18]:
tn, fp, fn, tp = confusion_matrix(logReg.predict(X_test), y_test).ravel()

In [19]:
tn, fp, fn, tp

(665, 49, 32, 405)

In [20]:
accuracy_score(logReg.predict(X_test), y_test)

0.9296264118158123

In [21]:
precision_score(logReg.predict(X_test), y_test)

0.8920704845814978

In [22]:
recall_score(logReg.predict(X_test), y_test)

0.9267734553775744

In [23]:
f1_score(logReg.predict(X_test), y_test)

0.9090909090909092

In [24]:
cvLogReg = LogisticRegressionCV(cv=5, verbose=1)

In [25]:
cvLogReg.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.4s finished


LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=1)

In [26]:
cvLogReg.score(X_test, y_test)

0.9339704604691572

In [27]:
cvLogReg.score(X_train, y_train)

0.9339130434782609

In [28]:
cvLogReg.coef_

array([[-2.84329963e-01, -1.32543195e-01,  1.54793963e-01,
         9.73060841e-01,  5.52730808e-01,  6.53861382e-01,
         2.86881616e+00,  7.05707129e-01,  5.98716677e-01,
         7.86341201e-02, -2.28702327e-01, -1.99099718e-01,
        -3.39517399e-02,  8.82179469e-02,  9.77552782e-01,
         9.39536966e-01,  9.40480136e-01,  5.73673116e-02,
         7.11815942e-02,  1.37971246e+00,  2.56369142e-01,
         3.26607824e-01,  2.73823980e+00,  2.80226751e-01,
        -1.93979080e+00, -9.56276170e-01, -4.20679305e+00,
         2.07121913e-01, -1.50221311e+00, -1.38775152e-01,
        -5.14573716e-01,  4.50349751e-02, -1.02271716e+00,
        -9.18995465e-03, -1.20041276e+00,  7.79275820e-01,
         1.48837021e-01, -4.70715256e-01, -8.99771693e-01,
        -4.33913436e-01, -1.47058787e+00, -2.20784306e+00,
        -9.59133924e-01, -1.53949152e+00, -6.61298365e-01,
        -1.51627475e+00, -9.58887783e-01, -1.73346749e+00,
        -1.41680725e+00,  3.86008164e-03, -4.64000707e-0

In [29]:
import numpy as np

In [30]:
feature_sort = sorted(X_train.columns, key=lambda x: -cvLogReg.coef_.reshape(-1, 1)[x])

In [31]:
feature_sort[0]

52

In [32]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [33]:
LDA = LinearDiscriminantAnalysis()

In [34]:
LDA.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [35]:
LDA.score(X_train, y_train)

0.8904347826086957

In [36]:
LDA.score(X_test, y_test)

0.8835794960903562

In [37]:
from sklearn.neighbors import KNeighborsClassifier

In [116]:
neighbors = [3, 5, 7, 9]
for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    print(f"Result for {n}-nearest neighbors")
    print("On train set")
    print(knn.score(X_train, y_train))
    print("On test set")
    print(knn.score(X_test, y_test))
    print("-----------------------------------")

Result for 3-nearest neighbors
On train set
0.8950724637681159
On test set
0.8062554300608167
-----------------------------------
Result for 5-nearest neighbors
On train set
0.8628985507246377
On test set
0.7984361424847958
-----------------------------------
Result for 7-nearest neighbors
On train set
0.8414492753623188
On test set
0.8019113814074718
-----------------------------------
Result for 9-nearest neighbors
On train set
0.8307246376811595
On test set
0.8001737619461338
-----------------------------------


In [117]:
knn.predict_proba(X_test)

array([[1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       ...,
       [0.66666667, 0.33333333],
       [0.11111111, 0.88888889],
       [0.66666667, 0.33333333]])

In [39]:
from sklearn.tree import DecisionTreeClassifier

In [40]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [41]:
dtc.score(X_test, y_test)

0.89748045178106

In [42]:
dtc.score(X_train, y_train)

0.9997101449275362

In [43]:
from sklearn.model_selection import cross_val_score
cross_val_score(dtc, X_train, y_train, cv=10)

array([0.87283237, 0.91014493, 0.89275362, 0.91884058, 0.90144928,
       0.92463768, 0.92463768, 0.92173913, 0.89565217, 0.93023256])

In [104]:
def euclideanDistance(p1, p2, length):
    dist = 0
    for i in range(length):
        dist += np.square(p1.iloc[i] - p2.iloc[i])
        
    return np.sqrt(dist)

In [110]:
def get_neighbors(training_set, labels, input_instance, k):
    distances = []
    for idx in range(training_set.shape[0]):
        instance = training_set.iloc[idx]
        dist = euclideanDistance(instance, input_instance, len(input_instance))
        distances.append((instance, dist, labels.iloc[idx]))
    distances.sort(key=lambda x: x[1])
    return distances[:k]

In [111]:
def get_class(training_set, labels, input_instance, k):
    neighbors = get_neighbors(training_set, labels, input_instance, k)
    classes = {}
    for n in neighbors:
        label = n[2]
        if label not in classes:
            classes[label] = 0
        classes[label] += 1
    
    for label in classes:
        classes[label] = (classes[label] * 1.0)/k
    
    return classes

In [114]:
print(get_class(X_train, y_train, X_test.iloc[0], 3))

{0.0: 1.0}


In [118]:
y_test.iloc[0]

0.0

In [119]:
1.0 == 1

True

In [251]:
import heapq

class BinaryKNNClassifier:
    def __init__(self, labels=[1, 0], n_neighbors=5):
        self.labels = labels
        self.n_neighbors = n_neighbors
    
    def fit(self, X, y):
        self.X = X
        self.y = y
    
    def euclideanDistance(self, p1, p2):
        return np.linalg.norm(p1 - p2)
    
    def predict(self, test_set):
        result = []
        for i, test in test_set.iterrows():
            distances = []
            
            for idx, train in X.iterrows():
                dist = self.euclideanDistance(test, train)
                distances.append((dist, y.iloc[idx]))
                
            distances.sort(key=lambda x: x[0])
            nearest = distances[:self.n_neighbors]
            proba = {1: 0, 0: 0}
            for neighbor in nearest:
                proba[int(neighbor[1])] += 1
                
            print(proba)
            
            r = [(proba[x] * 1.0) / self.n_neighbors for x in proba]
#             print(r)
#         return result

In [252]:
knnBinary = BinaryKNNClassifier()

In [253]:
knnBinary.fit(X_train, y_train)

In [254]:
knnBinary.predict(X_test[4:5])

{1: 5, 0: 0}


In [246]:
knn.predict_proba(X_test[4:5])

array([[0.11111111, 0.88888889]])