In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import pairwise_distances

In [2]:
df = pd.read_csv('mushrooms.csv')

In [3]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
X = pd.get_dummies(df,drop_first=False)
X = X.drop('class_e',axis=1)
X = X.rename(columns={"class_p": "class"})

In [5]:
grouped = X.groupby("class")
class1_sample = grouped.get_group(0).sample(n=400)
class2_sample = grouped.get_group(1).sample(n=400)
X = pd.concat([class1_sample, class2_sample])
X = X.sample(frac=1, random_state=42)

In [6]:
X_train1, X_test1 = train_test_split(X, test_size=0.20, random_state=42)
X_train2, X_test2 = train_test_split(X, test_size=0.20, random_state=123)
X_train3, X_test3 = train_test_split(X, test_size=0.20, random_state=7)

In [7]:
X_train1

Unnamed: 0,class,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
6620,1,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4849,1,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
5758,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4577,1,0,0,1,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
60,0,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2301,0,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1784,0,1,0,0,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
1605,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1206,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [8]:
X_train2

Unnamed: 0,class,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
5244,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1913,0,0,0,0,0,0,1,1,0,0,...,1,0,0,0,1,0,0,0,0,0
1386,0,0,0,1,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
1953,0,0,0,0,0,0,1,1,0,0,...,0,1,0,1,0,0,0,0,0,0
5798,1,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840,0,0,0,0,0,0,1,1,0,0,...,0,1,0,1,0,0,0,0,0,0
6020,1,0,0,0,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
3942,0,0,0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,0,0,0
4860,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


## Метрическая близость.

In [9]:
def algrotith1_2(train_df, test_df):
    S = pairwise_distances(test_df.values.tolist(), train_df.values.tolist(), metric = 'hamming')
    S = euclidean_distances(test_df.values.tolist(), train_df.values.tolist())
    F = pd.concat([train_df['class'], pd.DataFrame(S.T)], axis=1)
    not_poisoned = train_df["class"].value_counts()[0]
    poisoned = train_df["class"].value_counts()[1]
    F = F.groupby(["class"]).agg(["sum"]).div({0: not_poisoned, 1: poisoned}, axis='index')
    test_df["prediction"] = np.where(F.T[0] < F.T[1], 0, 1)
    return sum(test_df["prediction"] == test_df["class"]) / len(test_df)

In [10]:
algrotith1_2(X_train1.reset_index(drop=True), X_test1.reset_index(drop=True))

0.90625

In [11]:
algrotith1_2(X_train2.reset_index(drop=True), X_test2.reset_index(drop=True))

0.91875

In [12]:
algrotith1_2(X_train3.reset_index(drop=True), X_test3.reset_index(drop=True))

0.91875

In [13]:
def algrotith1_3(train_df, test_df):
    d = max(np.max(pairwise_distances(train_df.loc[train_df["class"] == 1].values.tolist(), 
                                       train_df.loc[train_df["class"] == 1].values.tolist(), metric = 'hamming')),
            np.max(pairwise_distances(train_df.loc[train_df["class"] == 0].values.tolist(), 
                                       train_df.loc[train_df["class"] == 0].values.tolist(), metric = 'hamming'))
    )
    S = 1 - pairwise_distances(test_df.values.tolist(), train_df.values.tolist(), metric = 'hamming') / d
    S = np.array([[S[i][j] if S[i][j] > 0 else 0 for j in range(len(S[0]))] for i in range(len(S))])

    F = pd.concat([train_df['class'], pd.DataFrame(S.T)], axis=1)
    not_poisoned = train_df["class"].value_counts()[0]
    poisoned = train_df["class"].value_counts()[1]
    F = F.groupby(["class"]).agg(["sum"]).div({0: not_poisoned, 1: poisoned}, axis='index')
    test_df["prediction"] = np.where(F.T[0] > F.T[1], 0, 1)
    return sum(test_df["prediction"] == test_df["class"]) / len(test_df)

In [14]:
algrotith1_3(X_train1.reset_index(drop=True), X_test1.reset_index(drop=True))

0.89375

In [15]:
algrotith1_3(X_train2.reset_index(drop=True), X_test2.reset_index(drop=True))

0.9

In [16]:
algrotith1_3(X_train3.reset_index(drop=True), X_test3.reset_index(drop=True))

0.9

## Метрическая прецедентность

In [17]:
def _m(i, a):
    def metric(x, y):
        return  (((-1) ** (x != y) * a[i]).sum() / a[i].sum())
    return metric
def algrotithm3_1(train_df, test_df):
    temp_test = test_df.drop('class',axis=1)
    temp_train = train_df.drop('class',axis=1)
    classes = np.unique(train_df["class"])
    points_by_classes_ = [temp_train[train_df["class"]==label] for label in np.unique(train_df["class"])]
    bi = np.array([X_class.mean(axis=0) for X_class in points_by_classes_])
    b = bi.mean(axis=0)
    a = np.abs(bi - b)
    S = [pairwise_distances(temp_test, X_class, metric=_m(i, a)) for i, X_class in enumerate(points_by_classes_)]
    F = np.array([np.max(class_dists, axis=1) for class_dists in S])
    ind = np.argmax(F, axis=0)
    test_df["prediction"] = classes[ind]
    return sum(test_df["prediction"] == test_df["class"]) / len(test_df)

In [18]:
algrotithm3_1(X_train1.reset_index(drop=True), X_test1.reset_index(drop=True))

0.99375

In [19]:
algrotithm3_1(X_train2.reset_index(drop=True), X_test2.reset_index(drop=True))

1.0

In [20]:
algrotithm3_1(X_train3.reset_index(drop=True), X_test3.reset_index(drop=True))

0.9875

## Метрическая близость на основе прецедентности

In [21]:
def _m(i, a):
    def metric(x, y):
        return  (((-1) ** (x != y) * a[i]).sum() / a[i].sum())
    return metric
def algrotithm3_2(train_df, test_df):
    temp_test = test_df.drop('class',axis=1)
    temp_train = train_df.drop('class',axis=1)
    classes = np.unique(train_df["class"])
    points_by_classes_ = [temp_train[train_df["class"]==label] for label in np.unique(train_df["class"])]
    bi = np.array([X_class.mean(axis=0) for X_class in points_by_classes_])
    b = bi.mean(axis=0)
    a = np.abs(bi - b)
    S = [(1 - pairwise_distances(temp_test, X_class, metric=_m(i, a))) for i, X_class in enumerate(points_by_classes_)]
    F = np.array([np.min(class_dists, axis=1) for class_dists in S])
    ind = np.argmin(F, axis=0)
    test_df["prediction"] = classes[ind]
    return sum(test_df["prediction"] == test_df["class"]) / len(test_df)

In [22]:
algrotithm3_2(X_train1.reset_index(drop=True), X_test1.reset_index(drop=True))

0.99375

In [23]:
algrotithm3_2(X_train2.reset_index(drop=True), X_test2.reset_index(drop=True))

1.0

In [24]:
algrotithm3_2(X_train3.reset_index(drop=True), X_test3.reset_index(drop=True))

0.9875

## Метрическое сходство на основе прецедентности

In [35]:
def _m(i, a, d):
    def metric(x, y):
        return  (1 - ((1-(((-1) ** (x != y) * a[i]).sum() / a[i].sum()))/d)) if (1 - ((1-(((-1) ** (x != y) * a[i]).sum() / a[i].sum()))/d)) > 0 else 0
    return metric
def _m_prev(i, a):
    def metric(x, y):
        return  1 - (((-1) ** (x != y) * a[i]).sum() / a[i].sum())
    return metric
def algrotithm3_3(train_df, test_df):
    temp_test = test_df.drop('class',axis=1)
    temp_train = train_df.drop('class',axis=1)
    classes = np.unique(train_df["class"])
    points_by_classes_ = [temp_train[train_df["class"]==label] for label in np.unique(train_df["class"])]
    bi = np.array([X_class.mean(axis=0) for X_class in points_by_classes_])
    b = bi.mean(axis=0)
    a = np.abs(bi - b)
    d = np.max([np.max(pairwise_distances(X_class, X_class, metric=_m_prev(i, a))) for i, X_class in enumerate(points_by_classes_)])
    S = [pairwise_distances(temp_test, X_class, metric=_m(i, a, d)) for i, X_class in enumerate(points_by_classes_)]
    
    F = np.array([np.max(class_dists, axis=1) for class_dists in S])
    ind = np.argmax(F, axis=0)
    test_df["prediction"] = classes[ind]
    return sum(test_df["prediction"] == test_df["class"]) / len(test_df)

In [36]:
algrotithm3_3(X_train1.reset_index(drop=True), X_test1.reset_index(drop=True))

0.99375

In [37]:
algrotithm3_3(X_train2.reset_index(drop=True), X_test2.reset_index(drop=True))

1.0

In [38]:
algrotithm3_3(X_train3.reset_index(drop=True), X_test3.reset_index(drop=True))

0.9875