# Import Bibliotheken

In [274]:
# Festlegung des Device
import platform

# Laden der Daten
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Operationen mit Texten
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import spacy

# Modell
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

# Evaluierung
import torchmetrics

# Auswertung der Evaluation
import pandas as pd
import altair as alt

from sklearn import preprocessing

### Package Versionen

In [275]:
for package in [torchtext,torch, torchmetrics, spacy, alt, pd]:
    print(f'{package.__name__}: {package.__version__}')

torchtext: 0.13.1
torch: 1.12.1
torchmetrics: 0.9.3
spacy: 3.4.2
altair: 4.2.0
pandas: 1.4.3


# Device Auswahl

In [276]:
device = torch.device(
    "cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# Daten

## Dataset

### Erstellung Dataset

In [277]:
comment_df = pd.read_excel('data/clean/Google_Rezensionen.xlsx')
comment_df

Unnamed: 0.1,Unnamed: 0,Tankstellenname,Datum,Bewertung,Kommentar,Kategorien
0,4,TS Schleswig MC,2022-09-30 06:46:42,5,Super nettes Personal gutes Frühstück mit groß...,"Personal,Shop,Bistro"
1,5,TS Prisdorf,2022-09-29 17:22:44,5,Sehr höfliche Bedienstete Translated by Google...,Personal
2,6,TS Wanderup,2022-09-29 15:38:02,5,Immer gerne dort nettes und zuvorkommendes Per...,"Personal,Waschanlage"
3,10,TS Handewitt,2022-09-28 17:05:28,5,Nettes freundliches Personal Translated by Goo...,Personal
4,12,TS Bremen,2022-09-28 04:24:34,4,Normale Tankstelle die aber in der Tank App ni...,"Pricing,DigitalFueling"
...,...,...,...,...,...,...
3992,11072,TS Jübek,2015-04-07 17:45:02,5,Dies ist einfach die beste tankstelle die ich ...,"Kraftstoffauswahl,Waschanlage"
3993,11077,TS Jübek,2014-07-16 07:08:37,5,Sehr gut Günstig guter shop und einige andere ...,"Pricing,Shop,Kraftstoffauswahl,Waschanlage,Sta..."
3994,11079,TS Neustadt am Rüb.,2013-09-21 15:26:28,5,Öffnungszeiten Mo So 06 00 22 00 Uhr,Öffnungszeiten
3995,11080,TS Handewitt(SP),2012-11-05 11:16:00,5,genug platz fuer wohnwagen rechts beim autogas...,"Personal,Kraftstoffauswahl"


In [278]:
class TeamGoogleBewertungenDataSet(Dataset):

    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = comment_df.iloc[index]['Kommentar']
        category = comment_df.iloc[index]['Kategorien']
        return text, category

### Bestimmung Test- und Traningsdaten

In [279]:
# Länge der Test- und Traningsdaten bestimmen
dataset = TeamGoogleBewertungenDataSet(comment_df)
len_train = round(len(dataset)* 0.7)
len_test = round(len(dataset) * 0.3)
assert len(comment_df) == len_train + len_test

In [280]:
# Zufällige Aufsplittung der Anzahl von Test- und Trainingsdaten
train_set, test_set = torch.utils.data.random_split(dataset, [len_train, len_test]) 

In [281]:
next(iter(train_set))

('Scandinavian Park Gas station you drive from E 45 direction Denmark about 4 km after the German boarder and you enter the industrial zone inside the Gas station is like a restaurant there you get different food and drinks I eat potato salad and breathed fish the fish was small and ok if somebody are hungry the Potato salad was good but also small the price included a drink as for a Station is inside the norm the Employees are friendly toilets and shower room are clean May the Lord Give the knowledge the People with great appetite are coming and small food is to small for normal stomach ',
 'DigitalFueling')

## Dataloader

### Tokenisierung

In [282]:
tokenizer = get_tokenizer(
    'spacy', 
    language='de_core_news_lg')

In [283]:
# Beispiel für Anwendung des tokenizer
sentence = "This isn't a very long example."
tokenizer(sentence)

['This', "isn't", 'a', 'very', 'long', 'example', '.']

### Vokabular erstellen

In [284]:
def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text)

In [285]:
len(train_set)

2798

In [286]:
tokens = yield_tokens(train_set)

In [287]:
next(iter(tokens))

['Scandinavian',
 'Park',
 'Gas',
 'station',
 'you',
 'drive',
 'from',
 'E',
 '45',
 'direction',
 'Denmark',
 'about',
 '4',
 'km',
 'after',
 'the',
 'German',
 'boarder',
 'and',
 'you',
 'enter',
 'the',
 'industrial',
 'zone',
 'inside',
 'the',
 'Gas',
 'station',
 'is',
 'like',
 'a',
 'restaurant',
 'there',
 'you',
 'get',
 'different',
 'food',
 'and',
 'drinks',
 'I',
 'eat',
 'potato',
 'salad',
 'and',
 'breathed',
 'fish',
 'the',
 'fish',
 'was',
 'small',
 'and',
 'ok',
 'if',
 'somebody',
 'are',
 'hungry',
 'the',
 'Potato',
 'salad',
 'was',
 'good',
 'but',
 'also',
 'small',
 'the',
 'price',
 'included',
 'a',
 'drink',
 'as',
 'for',
 'a',
 'Station',
 'is',
 'inside',
 'the',
 'norm',
 'the',
 'Employees',
 'are',
 'friendly',
 'toilets',
 'and',
 'shower',
 'room',
 'are',
 'clean',
 'May',
 'the',
 'Lord',
 'Give',
 'the',
 'knowledge',
 'the',
 'People',
 'with',
 'great',
 'appetite',
 'are',
 'coming',
 'and',
 'small',
 'food',
 'is',
 'to',
 'small',
 '

In [288]:
vocab = build_vocab_from_iterator(tokens, min_freq=2, specials=["<unk>"])

vocab.set_default_index(vocab["<unk>"])

In [289]:
len(vocab)

3160

In [290]:
vocab(['Hallo', 'sehr', 'Personal'])

[468, 7, 2]

In [291]:
vocab.lookup_token(0)

'<unk>'

### Encoding

In [292]:
onehot = torch.zeros(1,len(vocab))

In [293]:
pos = vocab(['nette', 'Bedienung', 'sauber'])
pos

[65, 53, 61]

In [294]:
def encode(text, vocab):
    tokens = tokenizer(text)
    onehot = torch.zeros(1,len(vocab))
    onehot[:,vocab(tokens)] = 1
    return onehot

### Multilabel Encoding

In [295]:
import torch
import torch.nn

In [296]:
df = comment_df.explode('Kategorien')
df

Unnamed: 0.1,Unnamed: 0,Tankstellenname,Datum,Bewertung,Kommentar,Kategorien
0,4,TS Schleswig MC,2022-09-30 06:46:42,5,Super nettes Personal gutes Frühstück mit groß...,"Personal,Shop,Bistro"
1,5,TS Prisdorf,2022-09-29 17:22:44,5,Sehr höfliche Bedienstete Translated by Google...,Personal
2,6,TS Wanderup,2022-09-29 15:38:02,5,Immer gerne dort nettes und zuvorkommendes Per...,"Personal,Waschanlage"
3,10,TS Handewitt,2022-09-28 17:05:28,5,Nettes freundliches Personal Translated by Goo...,Personal
4,12,TS Bremen,2022-09-28 04:24:34,4,Normale Tankstelle die aber in der Tank App ni...,"Pricing,DigitalFueling"
...,...,...,...,...,...,...
3992,11072,TS Jübek,2015-04-07 17:45:02,5,Dies ist einfach die beste tankstelle die ich ...,"Kraftstoffauswahl,Waschanlage"
3993,11077,TS Jübek,2014-07-16 07:08:37,5,Sehr gut Günstig guter shop und einige andere ...,"Pricing,Shop,Kraftstoffauswahl,Waschanlage,Sta..."
3994,11079,TS Neustadt am Rüb.,2013-09-21 15:26:28,5,Öffnungszeiten Mo So 06 00 22 00 Uhr,Öffnungszeiten
3995,11080,TS Handewitt(SP),2012-11-05 11:16:00,5,genug platz fuer wohnwagen rechts beim autogas...,"Personal,Kraftstoffauswahl"


#### Kategorien splitten

In [297]:
einzelLabels = comment_df['Kategorien'].str.split(',')
label_list_export = einzelLabels.to_list()
label_list_export

[['Personal', 'Shop', 'Bistro'],
 ['Personal'],
 ['Personal', 'Waschanlage'],
 ['Personal'],
 ['Pricing', 'DigitalFueling'],
 ['Erscheinungsbild', 'Personal'],
 ['Personal', 'Pricing'],
 ['Personal', 'Pricing', 'Kraftstoffauswahl'],
 ['Personal', 'Öffnungszeiten'],
 ['Personal', 'SB-Waschboxen', 'Waschanlage'],
 ['Shop'],
 ['Erscheinungsbild', 'Personal'],
 ['Personal', 'Waschanlage'],
 ['Personal', 'Pricing'],
 ['Personal', 'Pricing'],
 ['Bistro'],
 ['Personal', 'Pricing', 'Shop', 'Waschanlage'],
 ['Personal'],
 ['Personal'],
 ['Bistro'],
 ['Erscheinungsbild'],
 ['Personal', 'Pricing'],
 ['Personal'],
 ['Personal'],
 ['Personal', 'Shop'],
 ['Erscheinungsbild'],
 ['Personal', 'Bistro'],
 ['Erscheinungsbild', 'Nacht-/Tankautomat', 'Waschanlage', 'Staubsauger'],
 ['Pricing'],
 ['Personal'],
 ['Erscheinungsbild', 'Personal'],
 ['Personal'],
 ['Bistro'],
 ['Pricing', 'Kraftstoffauswahl'],
 ['Pricing'],
 ['Nacht-/Tankautomat'],
 ['Shop', 'Bistro'],
 ['Erscheinungsbild', 'Personal', 'Sanitär

##### In der nachfolgenden Zelle wird aus der verschachtelten Liste eine "flache Liste"

In [298]:
flat_list = []
for sublist in label_list_export:
    for item in sublist:
        flat_list.append(item)

flat_list

['Personal',
 'Shop',
 'Bistro',
 'Personal',
 'Personal',
 'Waschanlage',
 'Personal',
 'Pricing',
 'DigitalFueling',
 'Erscheinungsbild',
 'Personal',
 'Personal',
 'Pricing',
 'Personal',
 'Pricing',
 'Kraftstoffauswahl',
 'Personal',
 'Öffnungszeiten',
 'Personal',
 'SB-Waschboxen',
 'Waschanlage',
 'Shop',
 'Erscheinungsbild',
 'Personal',
 'Personal',
 'Waschanlage',
 'Personal',
 'Pricing',
 'Personal',
 'Pricing',
 'Bistro',
 'Personal',
 'Pricing',
 'Shop',
 'Waschanlage',
 'Personal',
 'Personal',
 'Bistro',
 'Erscheinungsbild',
 'Personal',
 'Pricing',
 'Personal',
 'Personal',
 'Personal',
 'Shop',
 'Erscheinungsbild',
 'Personal',
 'Bistro',
 'Erscheinungsbild',
 'Nacht-/Tankautomat',
 'Waschanlage',
 'Staubsauger',
 'Pricing',
 'Personal',
 'Erscheinungsbild',
 'Personal',
 'Personal',
 'Bistro',
 'Pricing',
 'Kraftstoffauswahl',
 'Pricing',
 'Nacht-/Tankautomat',
 'Shop',
 'Bistro',
 'Erscheinungsbild',
 'Personal',
 'Sanitär',
 'Bistro',
 'Pricing',
 'Verkehrsanbindung'

##### Doppelte Werte werden entfernt, so dass jeder Eintrag nur einmalig vorhanden ist. Dies erreichen wir mit set()

In [299]:
flat_set = set(flat_list)
flat_set

{'AdBlue',
 'Bistro',
 'DigitalFueling',
 'E-Fuels',
 'E-Mobilität',
 'Erscheinungsbild',
 'Kartenakzeptanzen',
 'Kraftstoffauswahl',
 'Luft',
 'Nacht-/Tankautomat',
 'Paketservice',
 'Personal',
 'Pricing',
 'SB-Waschboxen',
 'Sanitär',
 'Shop',
 'Staubsauger',
 'Tankpool',
 'Verkehrsanbindung',
 'WLAN',
 'Waschanlage',
 'Werkstatt',
 'Zapfsäulen',
 'Öffnungszeiten'}

In [300]:
unflat = [[x] for x in flat_set]

In [301]:
label_vocab = build_vocab_from_iterator(unflat, min_freq=1, specials=["<unk>"])

label_vocab.set_default_index(vocab["<unk>"])

In [302]:
label_vocab.lookup_token(3)

'DigitalFueling'

#### Dictionary erstellen

In [303]:
label_dict = {label:i for i, label in enumerate(flat_set)}
label_dict

{'Kraftstoffauswahl': 0,
 'Sanitär': 1,
 'Paketservice': 2,
 'AdBlue': 3,
 'E-Fuels': 4,
 'WLAN': 5,
 'Zapfsäulen': 6,
 'Shop': 7,
 'Öffnungszeiten': 8,
 'Pricing': 9,
 'SB-Waschboxen': 10,
 'Luft': 11,
 'Werkstatt': 12,
 'Staubsauger': 13,
 'Personal': 14,
 'Nacht-/Tankautomat': 15,
 'Bistro': 16,
 'Waschanlage': 17,
 'DigitalFueling': 18,
 'Erscheinungsbild': 19,
 'Verkehrsanbindung': 20,
 'E-Mobilität': 21,
 'Tankpool': 22,
 'Kartenakzeptanzen': 23}

#### Tensor

In [304]:
doc2 = label_list_export[2]
doc2

['Personal', 'Waschanlage']

In [305]:
pos_vals = [label_dict[val] for val in doc2]
pos_vals

[14, 17]

In [306]:
labels = torch.LongTensor(pos_vals)
labels

tensor([14, 17])

In [307]:
y_onehot = nn.functional.one_hot(labels, num_classes=24)
y_onehot

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

In [308]:
y_onehot = y_onehot.sum(dim=0).float()
y_onehot

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        0., 0., 0., 0., 0., 0.])

In [309]:
# labels = torch.tensor([1,5,8,0])
# labels = labels.unsqueeze(0)
# target = torch.zeros(labels.size(0), 24).scatter_(1, labels, 1.)
# print(target)

### Erstellung Dataloader

In [310]:
# label_pipeline = lambda x: int(x) -1

In [311]:
def collate_batch(batch):
    label_list, text_list = [], []
 
    for (_text,_label) in batch:
    
        # Vorverarbeitung der Label
        # label_list.append(label_pipeline(_label))
        processed_labels = encode(_label, label_vocab)

        label_list.append(processed_labels)

        # Vorverarbeitung der Texte
        processed_text = encode(_text, vocab)
        
        # Zusammenführen sämtlicher Textrepräsentationen in einer Liste
        text_list.append(processed_text)
 
    # Zusammenführen aller Label in einem Tensor
    labels = torch.cat(label_list, dim=0)
    
    # Verbinden der Tensoren in text_list zu einem Tensor
    texts = torch.cat(text_list, dim = 0)

    # Ausgabe der Texte und der Label
    return texts.to(device), labels.to(device)

In [312]:
train_loader = DataLoader(
 train_set, batch_size=64,
 shuffle=True,
 collate_fn=collate_batch,
 num_workers=0
)

## Architektur und Training

### Architektur

In [313]:
class LinearTextClassificationModel(nn.Module):

    def __init__(self, vocab_size, num_class):
        super(LinearTextClassificationModel, self).__init__()
        self.fc1 = nn.Linear(vocab_size, 200)
        self.fc2 = nn.Linear(200, num_class)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

In [314]:
vocab_size = len(vocab)
num_class = 25
model = LinearTextClassificationModel(vocab_size, num_class).to(device)

### Training

In [315]:
# Hyperparameter

## Festlegung Lernrate
learning_rate = 0.05

## Initialisierung Fehlerfunktion
loss_fn = nn.CrossEntropyLoss()

## Initialisierung Fehlerfunktion
bce_loss = nn.BCEWithLogitsLoss()

## Initialisierung Optimizer
optimizer = torch.optim.SGD(model.parameters(),lr = learning_rate, momentum=0.9)

## Definition der Epochen
num_epochs = 20

In [316]:
train_accuracy = torchmetrics.Accuracy().to(device)

loss_hist = {}
accuracy_hist = {}

In [329]:
for (text, label) in train_loader:
    #print(label[[0][0]])
    #print(text)
    labelnew = label.type(torch.LongTensor)
    print(labelnew[[0][0]])

tensor([1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])
tensor([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])
tensor([1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])
tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0])
tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        1])
tensor([1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,


### Training funktioniert nicht !!!

In [318]:
# Training
model.train()

for epoch in range(num_epochs):

    # Dokumentation Loss -> Erkennung ob Netz konvergiert
    running_loss = 0.0
    num_batches = 0

    for (text, label) in train_loader:
        num_batches += 1
        pred = model(text)

        # Der Fehler wird berechnet
        loss = bce_loss(pred, label)
        # Der Fehler wird über das Netz zurückpropagiert
        loss.backward()
        # Die Gewichte werden angepasst
        optimizer.step()
        # Gradienten zurücksetzen
        optimizer.zero_grad()
        ## Bestimmung der Accuracy für den Batch
        train_accuracy(pred, label.type(torch.LongTensor))
        
        # running loss
        running_loss +=loss.item()
 
    loss_hist[epoch] = running_loss/num_batches
 
    batch_train_accuracy = train_accuracy.compute()
    print(f"Training Accuracy for epoch {epoch}:{batch_train_accuracy}")
    accuracy_hist[epoch] = batch_train_accuracy.cpu().item()
    train_accuracy.reset()

Training Accuracy for epoch 0:0.907905638217926
Training Accuracy for epoch 1:0.907905638217926
Training Accuracy for epoch 2:0.907905638217926
Training Accuracy for epoch 3:0.907905638217926
Training Accuracy for epoch 4:0.907905638217926
Training Accuracy for epoch 5:0.907905638217926
Training Accuracy for epoch 6:0.907905638217926
Training Accuracy for epoch 7:0.907905638217926
Training Accuracy for epoch 8:0.907905638217926
Training Accuracy for epoch 9:0.907905638217926
Training Accuracy for epoch 10:0.907905638217926
Training Accuracy for epoch 11:0.907905638217926
Training Accuracy for epoch 12:0.907905638217926
Training Accuracy for epoch 13:0.907905638217926
Training Accuracy for epoch 14:0.907905638217926
Training Accuracy for epoch 15:0.907905638217926
Training Accuracy for epoch 16:0.907905638217926
Training Accuracy for epoch 17:0.907905638217926
Training Accuracy for epoch 18:0.907905638217926
Training Accuracy for epoch 19:0.907905638217926
Training Accuracy for epoch 20

### Auswertung Fehlerentwicklung

In [319]:
loss_df = pd.DataFrame.from_dict(loss_hist, orient= 'index').reset_index()

loss_df.columns = ['Epoch', 'Loss']
loss_df.head()

Unnamed: 0,Epoch,Loss
0,0,0.327476
1,1,0.297266
2,2,0.268809
3,3,0.25497
4,4,0.248112


In [330]:
train_chart = alt.Chart(loss_df).mark_line().encode(
    x=alt.X('Epoch', title = 'Anzahl Epochen'),
    y=alt.Y('Loss', title = 'Mittlerer Fehler')
)
# glue('train-loss-team', train_chart,display=True)
train_chart

### Auswertung Genauigkeit

In [321]:
accuracy_df = pd.DataFrame.from_dict(accuracy_hist, orient = 'index').reset_index()
accuracy_df.columns = ['Epoch', 'Accuracy']
accuracy_df.head()

Unnamed: 0,Epoch,Accuracy
0,0,0.907906
1,1,0.907906
2,2,0.907906
3,3,0.907906
4,4,0.907906


In [331]:
accuracy_chart = alt.Chart(accuracy_df).mark_line().encode(
    x=alt.X('Epoch',title = 'Anzahl Epochen'),
    y=alt.Y('Accuracy', title = 'Genauigkeit')
)
accuracy_chart

## Evaluierung

In [323]:
model.eval()

LinearTextClassificationModel(
  (fc1): Linear(in_features=3160, out_features=200, bias=True)
  (fc2): Linear(in_features=200, out_features=25, bias=True)
)

### Gesamtevaluierung

In [324]:
test_loader = DataLoader(test_set, batch_size= 64, shuffle=True,collate_fn=collate_batch)

In [326]:
valid_accuracy = torchmetrics.Accuracy().to(device)

with torch.no_grad():
    for (text, label) in test_loader:
        # Vorhersage wird für das Model erzeugt
        pred = model(text)
        valid_accuracy.update(pred, label.type(torch.LongTensor))
    total_valid_accuracy = valid_accuracy.compute()

In [327]:
total_valid_accuracy

tensor(0.9060)