<a href="https://colab.research.google.com/github/jansoe/introNLP/blob/main/LVQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Einführung LVQ


In [1]:
import sklearn.datasets
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from google.colab import data_table
data_table.enable_dataframe_formatter()

## Erstellung eines Beispieldatensatzes

In [2]:
#@title Definition der Beispiele

# Keyword-argument sets that are unpacked into sklearn.datasets.make_blobs.
# Each set lists three 2-D cluster centres, one (std_feature1, std_feature2)
# pair per centre, and a fixed random seed.

# Centres on the main diagonal, each cluster with a different spread.
example_1 = dict(
    cluster_std=((4, 2), (1, 1), (2, 2)),
    centers=[(-6, -6), (0, 0), (6, 6)],
    random_state=3,
)

# Centres in a zig-zag along the second feature axis.
example_2 = dict(
    cluster_std=((1.5, 1), (1, 4), (1, 2)),
    centers=[(-2, -4), (2, 0), (-2, 4)],
    random_state=3,
)

# Three clusters that are narrow in feature 1 and wide in feature 2.
example_3 = dict(
    cluster_std=((1, 5), (1, 5), (1, 5)),
    centers=[(-4, -2), (0, 2), (4, 0)],
    random_state=3,
)

In [3]:
# Names of the two feature columns; reused by later cells to select them.
features = ['feature1', 'feature2']

# Draw 200 labelled points using the parameter set `example_1`.
feature_data, category = sklearn.datasets.make_blobs(n_samples=200, **example_1)

# One row per sample: the two features, the integer label (`target`) and a
# readable label name of the form 'cat_<target>'.
data = pd.DataFrame(feature_data, columns=features).assign(target=category)
data['category'] = data['target'].map('cat_{}'.format)
data.head()

Unnamed: 0,feature1,feature2,target,category
0,1.729417,0.694052,1,cat_1
1,6.565321,5.905686,2,cat_2
2,5.048201,6.593689,2,cat_2
3,-14.993031,-6.535524,0,cat_0
4,-0.26124,-1.298047,1,cat_1


#### Unterteilen der Daten in Trainings- und Validierungsdaten
* mit den Trainingsdaten lassen wir den Algorithmus lernen
* mit den Validierungsdaten können wir testen, wie gut er auf neue Daten generalisiert

In [4]:
from sklearn.model_selection import train_test_split

# 50/50 split with a fixed seed so the notebook is reproducible.
# Both parts are copied explicitly: later cells write into these frames
# (overwriting a row, adding the `assigned_category` column), and writing into
# a subset taken from `data` can trigger pandas' SettingWithCopyWarning.
train_data, validation_data = (
    subset.copy()
    for subset in train_test_split(data, test_size=0.5, random_state=5)
)

Hinzufügen von "label noise", also einem falschen Trainingsdatenpunkt

In [5]:
# Set to True to overwrite the first training row with a deliberately
# mislabelled point (label cat_0 at position (-0.4, -1.5)).
add_mislabelled = False
if add_mislabelled:
    # Assign by column name rather than by position so the cell keeps working
    # when the frame has gained extra columns (e.g. after a prediction step)
    # or when the column order differs.
    mislabelled_point = {
        'feature1': -0.4,
        'feature2': -1.5,
        'target': 0,
        'category': 'cat_0',
    }
    train_data.loc[train_data.index[0], list(mislabelled_point)] = list(mislabelled_point.values())

In [6]:
#@title Visualisierung

# Scatter plot of the training points, one colour per category. The frame is
# sorted by category before plotting.
train_sorted = train_data.sort_values('category')
fig_data = px.scatter(
    train_sorted,
    x='feature1',
    y='feature2',
    color='category',
    width=600,
    height=500,
)

# Copy of the training figure with the validation points added as one extra,
# nearly transparent marker trace coloured by the integer label.
validation_trace = go.Scatter(
    x=validation_data['feature1'],
    y=validation_data['feature2'],
    mode='markers',
    name='test',
    marker=dict(
        color=validation_data.target,
        colorscale=['blue', 'red', 'green'],
        opacity=0.1,
    ),
)
fig_data_full = go.Figure(fig_data).add_trace(validation_trace)
fig_data_full

## LVQ: Grundkonzepte

### Initialisieren der Prototypen

In [7]:
# Use the first training row of every category as that category's initial
# prototype. The index is renumbered from 0 so that positions returned by
# np.argmin can be used as row labels in later cells.
prototypes = (
    train_data
    .groupby('category')
    .head(1)
    .reset_index(drop=True)
)
prototypes

Unnamed: 0,feature1,feature2,target,category
0,-7.109553,-6.709518,0,cat_0
1,5.987402,9.001781,2,cat_2
2,0.128008,1.241617,1,cat_1


In [8]:
#@title Visualisierung

# Marker trace for the prototypes: large open crosses coloured by integer label.
prototype_trace = go.Scatter(
    x=prototypes['feature1'],
    y=prototypes['feature2'],
    mode='markers',
    name='prototypes',
    marker=dict(
        color=prototypes.target,
        colorscale=['blue', 'red', 'green'],
        size=12,
        symbol='x-open',
    ),
)

# Training scatter plot (copied, so `fig_data` stays unchanged) plus prototypes.
fig_init = go.Figure(fig_data).add_traces(prototype_trace)
fig_init

### Definition des Abstandsmaßes

In [9]:
def distance_fn(vector, other_vectors):
    """Return the Euclidean distance from `vector` to each row of `other_vectors`.

    The element-wise difference is squared, summed along axis 1 and square-rooted,
    so the result has one entry per row of `other_vectors`. Operands are combined
    with ordinary `-`, so NumPy broadcasting or pandas label alignment applies.
    """
    difference = vector - other_vectors
    squared_sum = np.sum(difference ** 2, axis=1)
    return np.sqrt(squared_sum)

### Vorhersage der Kategorie 

In [10]:
def predict(samples_df, prototype_df):
    """Assign every sample the category of its nearest prototype.

    Parameters
    ----------
    samples_df : pandas.DataFrame
        Samples to classify; must contain the columns listed in the
        module-level `features`.
    prototype_df : pandas.DataFrame
        Prototypes; must contain the `features` columns and a `category`
        column. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        A copy of `samples_df` with an `assigned_category` column holding the
        category of the closest prototype (as measured by `distance_fn`).
        The input frame is left untouched.
    """
    result = samples_df.copy()
    prototype_vectors = prototype_df[features]

    assigned = []
    for _, sample in result.iterrows():
        distances = distance_fn(sample[features], prototype_vectors)
        # np.argmin yields a *position*, so look the prototype up with .iloc;
        # label-based .loc only matches when the index happens to be 0..k-1.
        nearest_position = np.argmin(distances)
        assigned.append(prototype_df.iloc[nearest_position].category)

    # Single column assignment instead of growing the column cell by cell.
    result['assigned_category'] = assigned
    return result

In [11]:
# Classify the training points with the initial prototypes and preview the
# frame, which now carries the `assigned_category` column.
train_data = predict(samples_df=train_data, prototype_df=prototypes)
train_data.head()

Unnamed: 0,feature1,feature2,target,category,assigned_category
35,-7.109553,-6.709518,0,cat_0,cat_0
70,5.987402,9.001781,2,cat_2,cat_2
79,0.128008,1.241617,1,cat_1,cat_1
92,-3.419752,-5.373879,0,cat_0,cat_0
97,-11.583054,-3.216674,0,cat_0,cat_0


In [12]:
#@title Visualisierung

# Integer code parsed from labels such as 'cat_2'; used to colour the markers.
assigned_codes = train_data.assigned_category.str.split('_').str.get(1).astype('int')

# Open circles around the training points, coloured by the assigned category.
assigned_trace = go.Scatter(
    x=train_data['feature1'],
    y=train_data['feature2'],
    mode='markers',
    name='assigned category',
    marker=dict(
        color=assigned_codes,
        colorscale=['blue', 'red', 'green'],
        size=9,
        symbol='circle-open',
    ),
)

# Note: equal axis scaling could be enabled with
# fig_classification.update_yaxes(scaleanchor="x", scaleratio=1).
fig_classification = go.Figure(fig_init)
fig_classification.add_trace(assigned_trace)

### Update des Prototypen

Auswahl eines Trainingsdatenpunkts

In [13]:
# Index label of the training point used to illustrate one update step
# (the original cell notes 30 as an alternative choice).
sample_label = 169
train_sample = train_data.loc[sample_label]

Bestimmung des nächsten Prototyps

In [14]:
# Distance from the chosen sample to every prototype, using feature columns only.
sample_vector = train_sample[features]
prototype_vectors = prototypes[features]
distances = distance_fn(sample_vector, prototype_vectors)

# Position of the closest prototype.
nearest_prototype = np.argmin(distances)

In [15]:
#@title Visualisierung

def add_vector(fig, start, end, color='gray', size=1):
    """Draw an arrow from `start` to `end` on `fig` and return `fig`.

    Parameters
    ----------
    fig : object with an ``add_annotation(**kwargs)`` method
        The figure to annotate; it is modified in place.
    start, end : sequence with at least two entries
        (x, y) coordinates of the arrow tail and head in data coordinates.
        Lists, tuples, arrays and pandas Series are accepted.
    color : str
        Arrow colour.
    size : number
        Arrow line width.
    """
    # Unpack by iteration rather than `start[0]` / `start[1]`: integer keys on
    # a label-indexed pandas Series (e.g. index ['feature1', 'feature2']) rely
    # on a positional fallback that pandas has deprecated.
    tail_x, tail_y = list(start)[:2]
    head_x, head_y = list(end)[:2]

    fig.add_annotation(
        x=head_x,  # arrow head
        y=head_y,
        ax=tail_x,  # arrow tail
        ay=tail_y,
        # Interpret head and tail in data coordinates.
        xref='x',
        yref='y',
        axref='x',
        ayref='y',
        text='',  # arrow only, no label
        showarrow=True,
        arrowhead=2,
        arrowsize=1,
        arrowwidth=size,
        arrowcolor=color
    )
    return fig

# Work on a copy of the classification figure and draw two arrows from the
# origin: one to the selected sample and one to its nearest prototype.
origin = [0, 0]
fig_update = go.Figure(fig_classification)
add_vector(fig_update, origin, train_sample[features])
add_vector(fig_update, origin, prototypes.loc[nearest_prototype, features])

Bestimmen der Richtung des Prototyp-Updates

In [16]:
# Vector pointing from the nearest prototype to the selected sample.
nearest_vector = prototypes.loc[nearest_prototype, features]
update_direction = train_sample[features] - nearest_vector

In [17]:
#@title Visualisierung

# Arrow from the nearest prototype to the sample, i.e. the update direction.
add_vector(
    fig_update,
    start=prototypes.loc[nearest_prototype, features],
    end=train_sample[features],
)

In [18]:
# Step size of this illustrative update.
lernrate = 0.5

# Pull the prototype towards the sample when their categories match,
# push it away otherwise.
nearest_category = prototypes.loc[nearest_prototype].category
pull = train_sample.category == nearest_category
if pull:
    sign = 1
else:
    sign = -1

updated_prototype = prototypes.loc[nearest_prototype, features] + sign * lernrate * update_direction

In [19]:
#@title Visualisierung
# Draw the actual update step as a thick black arrow on a copy of the figure,
# so `fig_update` itself is not modified.
fig_step = go.Figure(fig_update)
add_vector(
    fig_step,
    start=prototypes.loc[nearest_prototype, features],
    end=updated_prototype,
    color='black',
    size=3,
)

## LVQ komplett

In [20]:
# History of prototype frames, one snapshot per update step.
# Note that this name shadows the standard-library `logging` module if it is imported.
logging = list()

# Start again from the first training sample of each category, index 0..k-1.
prototypes = train_data.groupby('category').head(1).reset_index(drop=True)
prototypes

Unnamed: 0,feature1,feature2,target,category,assigned_category
0,-7.109553,-6.709518,0,cat_0,cat_0
1,5.987402,9.001781,2,cat_2,cat_2
2,0.128008,1.241617,1,cat_1,cat_1


In [21]:
# LVQ training: for every training sample, move the nearest prototype towards
# the sample if their categories agree and away from it otherwise.
learning_rate = 0.01
num_epochs = 20  # one epoch is one full pass over the training data

for i in range(num_epochs):

    for ix, train_sample in train_data.iterrows():

        # Snapshot of the prototypes before this update (for the trajectory plot).
        logging.append(prototypes.copy())

        # Use only the numeric feature columns. Passing the whole row (which
        # also holds target/category/assigned_category) to distance_fn only
        # works as long as pandas label alignment happens to ignore those
        # extra entries. Cast to float because a row of a mixed-type frame
        # is an object-dtype Series.
        sample_vector = train_sample[features].astype(float)

        # Compute distance from each prototype to this point ...
        distances = distance_fn(sample_vector, prototypes[features])
        # ... and select the nearest one (position == label, index is 0..k-1)
        nearest_prototype = np.argmin(distances)

        # Pull (+1) on matching category, push (-1) otherwise
        push_or_pull = 1 if train_sample.category == prototypes.loc[nearest_prototype].category else -1

        # Direction from the prototype to the sample
        update_direction = sample_vector - prototypes.loc[nearest_prototype, features]

        # Update nearest prototype
        prototypes.loc[nearest_prototype, features] = prototypes.loc[nearest_prototype, features] + push_or_pull * learning_rate * update_direction

    # Re-classify the training data with the updated prototypes and report accuracy.
    train_data = predict(train_data, prototypes)
    print(f'Train Accuracy epoch_{i:02d}: {(train_data.category == train_data.assigned_category).mean():.3f}')

# Final state, then stack all snapshots; each prototype keeps its row label,
# so one label selects that prototype's whole trajectory.
logging.append(prototypes.copy())
trajectory = pd.concat(logging)

Train Accuracy epoch_00: 0.890
Train Accuracy epoch_01: 0.900
Train Accuracy epoch_02: 0.920
Train Accuracy epoch_03: 0.930
Train Accuracy epoch_04: 0.940
Train Accuracy epoch_05: 0.940
Train Accuracy epoch_06: 0.940
Train Accuracy epoch_07: 0.940
Train Accuracy epoch_08: 0.940
Train Accuracy epoch_09: 0.940


In [22]:
#@title Visualisierung

# One black line per prototype, tracing its position over all logged steps.
trajectory_traces = [
    go.Scatter(
        x=trajectory.loc[proto_label, 'feature1'],
        y=trajectory.loc[proto_label, 'feature2'],
        line=dict(color="black"),
    )
    for proto_label in trajectory.index.unique()
]

# Final prototype positions as large open crosses.
final_prototype_trace = go.Scatter(
    x=prototypes['feature1'],
    y=prototypes['feature2'],
    mode='markers',
    name='prototypes',
    marker=dict(
        color=prototypes.target,
        colorscale=['blue', 'red', 'green'],
        size=12,
        symbol='x-open',
    ),
)

# Training points as open circles coloured by the category assigned after training.
final_assignment_trace = go.Scatter(
    x=train_data['feature1'],
    y=train_data['feature2'],
    mode='markers',
    name='assigned category',
    marker=dict(
        color=train_data.assigned_category.str.split('_').str.get(1).astype('int'),
        colorscale=['blue', 'red', 'green'],
        size=9,
        symbol='circle-open',
    ),
)

go.Figure(fig_data).add_traces(
    trajectory_traces + [final_prototype_trace, final_assignment_trace]
)

## Generalisierungsfähigkeit


In [23]:
# Classify the held-out data with the trained prototypes and report the share
# of points whose assigned category equals the true one.
validation_data = predict(validation_data, prototypes)
validation_hits = validation_data.category == validation_data.assigned_category
print(f'Validation Accuracy: {validation_hits.mean():.3f}')

Validation Accuracy: 0.950


In [24]:
#@title Visualisierung

# Trained prototypes as filled crosses.
end_prototype_trace = go.Scatter(
    x=prototypes['feature1'],
    y=prototypes['feature2'],
    mode='markers',
    name='prototypes',
    marker=dict(
        color=prototypes.target,
        colorscale=['blue', 'red', 'green'],
        size=12,
        symbol='x',
    ),
)

# Open circles coloured by assigned category for the training points.
end_train_trace = go.Scatter(
    x=train_data['feature1'],
    y=train_data['feature2'],
    mode='markers',
    name='assigned category',
    marker=dict(
        color=train_data.assigned_category.str.split('_').str.get(1).astype('int'),
        colorscale=['blue', 'red', 'green'],
        size=9,
        symbol='circle-open',
    ),
)

# Same marker style for the validation points (the trace carries the same legend name).
end_validation_trace = go.Scatter(
    x=validation_data['feature1'],
    y=validation_data['feature2'],
    mode='markers',
    name='assigned category',
    marker=dict(
        color=validation_data.assigned_category.str.split('_').str.get(1).astype('int'),
        colorscale=['blue', 'red', 'green'],
        size=9,
        symbol='circle-open',
    ),
)

# Overlay everything on a copy of the train + validation scatter plot.
fig_end = go.Figure(fig_data_full).add_traces(
    [end_prototype_trace, end_train_trace, end_validation_trace]
)
fig_end