In [None]:
!pip3 install swifter
!pip3 install spacy
!pip3 install stop-words
!pip3 install emoji
!pip3 install HanTa
!python -m spacy download de
!pip3 install --upgrade pandas

# 1. Data Preprocessing

## 1.1. prerequisites

In [2]:
# Critical imports
import numpy
import pandas
import swifter
import spacy
import os
import gc
import emoji
import stop_words
import re

nlp = spacy.load('de')

In [3]:
base = "/content/drive/MyDrive/Colab Notebooks/rasa/data/"
filmstarts = base + "filmstarts.tsv"
holidaycheck = base + "holidaycheck.tsv"
germeval2017 = base + "germeval2017.tsv"
PotTS = base + "PotTS.tsv"
SB10k = base + "SB10k.tsv"

In [4]:
def filepath(outname):
    outdir = './data'
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    fullname = os.path.join(outdir, outname)
    return fullname

In [5]:
def give_emoji_free_text(text):
    return emoji.get_emoji_regexp().sub(r'', text)


def sanitize(string, twitter):
    """ Sanitize one string """

    string = str(string)

    # remove graphical emoji
    string = give_emoji_free_text(string)

    if twitter:
        # remove user
        # assuming user has @ in front
        string = re.sub(r"""(?:@[\w_]+)""", '', string)

        #remove # and @
        for punc in '":!@#':
            string = string.replace(punc, '')

        # remove 't.co/' links
        string = re.sub(r'https//t.co\/[^\s]+', '', string, flags=re.MULTILINE)

    # spacy tokenizer
    string_split = list(nlp.tokenizer(string))

    if len(string_split) == 0:
        return numpy.nan
    else:
        return string_split


## 1.2. filmstarts dataset

In [6]:
def clean(x):
    if x >= 4:
        return "pos"
    elif x <= 2:
        return "neg"
    else:
        return numpy.nan


df = pandas.read_csv(filmstarts,
                     sep="\t",
                     on_bad_lines="skip",
                     names=["url", "rating", "text"])
df["sentiment"] = df['rating'].swifter.apply(clean)
df['text'] = df['text'].swifter.apply(lambda x: sanitize(x, False))
df = df.drop(["url", "rating"], axis=1)
df.dropna(inplace=True)
print("positive: " + str(df[df.sentiment == "pos"].shape[0]))
print("negative: " + str(df[df.sentiment == "neg"].shape[0]))
print("total: " + str(df.shape[0]))
df = df.reindex(columns=["text", "sentiment"])
df.to_csv(filepath("clean_filmstarts.csv"), sep=";", index=False, header=False)

Pandas Apply:   0%|          | 0/71174 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/71174 [00:00<?, ?it/s]

positive: 40011
negative: 15608
total: 55619


In [None]:
!cp ./data/clean_filmstarts.csv '/content/drive/MyDrive/Colab Notebooks/rasa/data/'

## 1.3. holidaycheck dataset

In [None]:
def clean(x):
    if int(x) >= 5:
        return "pos"
    elif int(x) <= 3:
        return "neg"
    else:
        return numpy.nan


df = pandas.read_csv(holidaycheck,
                     sep="\t",
                     on_bad_lines="skip",
                     names=["rating", "text"])
df["sentiment"] = df["rating"].swifter.apply(clean)
df = df.drop(["rating"], axis=1)

# downsample to create balanced classes and save some ram
label = 'sentiment'
g = df.groupby(label, group_keys=False)
df = pandas.DataFrame(
    g.apply(lambda x: x.sample(g.size().min()))).reset_index(drop=True)


Dask Apply:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# split into 10 chunks to fit data into ram
dfs = numpy.array_split(df, 10)
count = 0
for df in dfs:
    count += 1
    df.to_csv(filepath("holidaycheck_" + str(count) + ".csv"),
              sep=";",
              index=False,
              header=False)
    df = None
    del df
    gc.collect()

for i in range(1, 11):
    df = pandas.read_csv(filepath("holidaycheck_" + str(i) + ".csv"),
                         sep=";",
                         on_bad_lines="skip",
                         names=["text", "sentiment"])
    df['text'] = df['text'].swifter.apply(lambda x: sanitize(x, False))
    df.dropna(inplace=True)
    print("positive: " + str(df[df.sentiment == "pos"].shape[0]))
    print("negative: " + str(df[df.sentiment == "neg"].shape[0]))
    print("total: " + str(df.shape[0]))
    df = df.reindex(columns=["text", "sentiment"])
    df.to_csv(filepath("clean_holidaycheck.csv"),
              sep=";",
              index=False,
              mode='a',
              header=False)
    df = None
    del df
    gc.collect()


In [None]:
!cp ./data/clean_holidaycheck.csv '/content/drive/MyDrive/Colab Notebooks/rasa/data/'

## 1.4. germeval2017 dataset

In [None]:
# germeval2017
def sentiment(x):
    if x == "positive":
        return "pos"
    elif x == "negative":
        return "neg"
    else:
        return numpy.nan


df = pandas.read_csv(germeval2017,
                     sep="\t",
                     on_bad_lines="skip",
                     names=["url", "text", "misc", "sentiment", "hashtag"])
df = df.drop(["misc", "hashtag", "url"], axis=1)
df['sentiment'] = df['sentiment'].swifter.apply(sentiment)
df.dropna(inplace=True)
df['text'] = df['text'].swifter.apply(lambda x: sanitize(x, True))
df.to_csv(filepath("clean_germeval2017.csv"),
          sep=";",
          index=False,
          mode='a',
          header=False)

df

Pandas Apply:   0%|          | 0/23525 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/7216 [00:00<?, ?it/s]

Unnamed: 0,text,sentiment
0,"[Re, Das, Erste, Ich, fahre, nicht, mit, der, ...",neg
2,"[TelMi, telmi, Laim, Fußgänger, von, S-Bahn, e...",neg
3,"[ , Hund, =, Fahrgast, ,, Hund, in, Box, =, G...",neg
4,"[Probleme, bei, der, S-Bahn, Lokführer, fehlen...",neg
5,"[ICE, Bahn, will, Gratis-WLAN, für, 2, ., Klas...",pos
...,...,...
23517,"[RT, , Ein, absoluter, Skandal, Damit, hat, s...",neg
23519,"[Locomore, -, Mehr, Bahn, ., -, , ..., klingt...",pos
23520,"[Schmierfilmbildung, auf, Schienen, ,, S-Bahn,...",neg
23521,"[Re, NPD, Landesverband, Bayern, Ja, richtig, ...",neg


In [None]:
!cp ./data/clean_germeval2017.csv '/content/drive/MyDrive/Colab Notebooks/rasa/data/'

## 1.5. PotTS dataset

In [None]:
def sentiment(x):
    if x == "positive":
        return "pos"
    elif x == "negative":
        return "neg"
    else:
        return numpy.nan


df = pandas.read_csv(PotTS,
                     sep="\t",
                     on_bad_lines="skip",
                     names=["sentiment", "text"])
df = df.reindex(columns=["text", "sentiment"])
df['sentiment'] = df['sentiment'].swifter.apply(sentiment)
df.dropna(inplace=True)
df['text'] = df['text'].swifter.apply(lambda x: sanitize(x, True))
df.to_csv(filepath("clean_PotTS.csv"),
          sep=";",
          index=False,
          mode='a',
          header=False)
df

Pandas Apply:   0%|          | 0/7988 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/5014 [00:00<?, ?it/s]

Unnamed: 0,text,sentiment
0,"[ , für, die, , reicht, es, halt, noch, nich...",pos
1,"[RT, , ja, mach, das, mal, ich, will, unedi...",pos
2,"[Rating, -, Update, bei, Cache, me, if, you, c...",pos
3,"[Omg, ich, komm, von, der, Schule, und, sehe, ...",pos
4,"[ein, tag, voller, adrenalin, und, kalorien, )...",pos
...,...,...
7982,"[Sieht, ja, nich, mehr, so, frisch, aus, xO, p...",neg
7983,"[Wird, hier, hetzt, von, jedem, ein, papst, -,...",pos
7984,"[Irgendwie, bizzarr, ,, wenn, im, Stadion, New...",neg
7985,"[RT, , Welches, Gras, rauchen, eigentlich, a...",pos


In [None]:
!cp ./data/clean_PotTS.csv '/content/drive/MyDrive/Colab Notebooks/rasa/data/'

## 1.6. SB10k dataset

In [None]:
def sentiment(x):
    if x == "positive":
        return "pos"
    elif x == "negative":
        return "neg"
    else:
        return numpy.nan


df = pandas.read_csv(SB10k,
                     sep="\t",
                     on_bad_lines="skip",
                     names=["sentiment", "text"])
df = df.reindex(columns=["text", "sentiment"])
df['sentiment'] = df['sentiment'].swifter.apply(sentiment)
df.dropna(inplace=True)
df['text'] = df['text'].swifter.apply(lambda x: sanitize(x, True))
df.to_csv(filepath("clean_SB10k.csv"),
          sep=";",
          index=False,
          mode='a',
          header=False)
df

Pandas Apply:   0%|          | 0/7453 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2838 [00:00<?, ?it/s]

Unnamed: 0,text,sentiment
0,"[ , ftv, ', a, kok, sedih, gn, sich, ', (]",neg
2,"[Toll, ,, die, 1, ., , DVD, von, Once, apon, ...",neg
4,"[Manche, Leute, können, echt, nicht, buchstabi...",neg
7,"[ , Wow, ., Glückwunsch, , Was, musstest, du,...",pos
10,"[ , oh, hahha]",pos
...,...,...
7430,"[ , Thx4, rosa, Empfehlung, ,, sehe, rot, ,, b...",neg
7435,"[Das, tvduell, hatte, kein, Champions, League,...",neg
7442,"[XBOCT, äusserst, sich, negativ, über, Mitspie...",neg
7444,"[ , oksip, ,, guten, nacht, nice, dream, {, }]",pos


In [None]:
!cp ./data/clean_SB10k.csv '/content/drive/MyDrive/Colab Notebooks/rasa/data/'

## 1.7. create joint data set

In [None]:
base = "/content/drive/MyDrive/Colab Notebooks/rasa/data/"
filmstarts = base + "clean_filmstarts.csv"
holidaycheck = base + "clean_holidaycheck.csv"
germeval2017 = base + "clean_germeval2017.csv"
PotTS = base + "clean_PotTS.csv"
SB10k = base + "clean_SB10k.csv"
data_sets = [filmstarts, holidaycheck, germeval2017, PotTS, SB10k]
li = []
for file in data_sets:
    df = pandas.read_csv(file,
                         sep=";",
                         names=["text", "sentiment"],
                         index_col=None)
    li.append(df)
df = pandas.concat(li, axis=0, ignore_index=True)
df = df.iloc[1:, :]
print("positive: " + str(df[df.sentiment == "pos"].shape[0]))
print("negative: " + str(df[df.sentiment == "neg"].shape[0]))
print("total: " + str(df.shape[0]))


positive: 882433
negative: 860048
total: 1742481


In [None]:
df.to_json("data.json", orient='records')
!cp data.json '/content/drive/MyDrive/Colab Notebooks/rasa/data/'

## 1.8. further preprocessing

In [None]:
import json
import gc
import stop_words
import spacy
from HanTa import HanoverTagger as ht

nlp = spacy.load('de')
# Opening JSON file
f = open('/content/drive/MyDrive/Colab Notebooks/rasa/data/data.json')
data = json.load(f)



In [None]:
stopwords = [w for w in stop_words.get_stop_words('de')]
tagger = ht.HanoverTagger('morphmodel_ger.pgz')


def lemmarize(string_list):
    lemmas = []
    for string in string_list:
        lemma = [
            lemma for (word, lemma, pos) in tagger.tag_sent(string.split())
        ]
        lemmas.append(' '.join(lemma))
    return lemmas


def str_to_list(x):
    return x.replace("[", "").replace("]", "").split(", ")


def text_preprocessing(string_list):
    res = []
    # lemmarize all stings
    string_list = lemmarize(string_list)
    for w in string_list:
        if w not in stopwords:
            # remove stopwords
            res.append(w)
    return res


with open('data_tokenized.json', 'w') as outfile:
    for dic in data:
        if dic["text"] != None:
            dic["text"] = str_to_list(str(dic["text"]))
            dic["text"] = text_preprocessing(dic["text"])
            json.dump(dic, outfile)
            outfile.write('\n')
        dic = None
        del dic


In [None]:
!cp data_tokenized.json '/content/drive/MyDrive/Colab Notebooks/rasa/data/'

# 2. Model

## 2.1. prepare model

In [None]:
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

In [None]:
import torch
import random
from torchtext.legacy import data
from torchtext import vocab

drive = "/content/drive/MyDrive/Colab Notebooks/rasa/data/"
TEXT = data.Field(sequential=True, preprocessing=generate_bigrams)
LABEL = data.Field(sequential=False,
                   dtype=torch.float,
                   is_target=True,
                   unk_token=None)
fields = {'text': ('text', TEXT), 'sentiment': ('label', LABEL)}


In [None]:
train_data, test_data = data.TabularDataset.splits(path=drive,
                                                   train="train.json",
                                                   test="test.json",
                                                   format='json',
                                                   fields=fields)
train_data, valid_data = train_data.split()

In [None]:
!wget https://int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com/vectors.txt

In [None]:
vec = vocab.Vectors('vectors.txt', cache='./')

In [None]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors=vec,
                 unk_init=torch.Tensor.normal_)

LABEL.build_vocab(train_data)

In [None]:
LABEL.vocab.stoi

defaultdict(None, {'neg': 1, 'pos': 0})

In [None]:
torch.save(TEXT.vocab, 'vocab.pt')

In [None]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=False,
    device=device)


In [None]:
import torch.nn as nn
import torch.nn.functional as F


class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embedding_dim,
                                      padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        embedded = embedded.permute(1, 0, 2)
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        return self.fc(pooled)

In [None]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
OUTPUT_DIM = 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = FastText(INPUT_DIM, EMBEDDING_DIM, OUTPUT_DIM, PAD_IDX)

In [None]:
print("INPUT_DIM: " + str(INPUT_DIM))
print("PAD_IDX: " + str(PAD_IDX))


INPUT_DIM: 25002
PAD_IDX: 1


In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,500,901 trainable parameters


In [None]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 1.3785e-02, -3.4791e-02, -6.9000e-05,  ...,  1.4088e-01,
         -1.0836e-01,  4.0510e-03],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]])

In [None]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

## 2.1. train model

In [None]:
import time
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [None]:
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  #convert into float for division
    acc = correct.sum() / len(correct)
    return acc

In [None]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 3m 57s
	Train Loss: 0.351 | Train Acc: 86.23%
	 Val. Loss: 1.449 |  Val. Acc: 87.52%
Epoch: 02 | Epoch Time: 3m 51s
	Train Loss: 0.316 | Train Acc: 87.83%
	 Val. Loss: 1.462 |  Val. Acc: 87.60%
Epoch: 03 | Epoch Time: 3m 55s
	Train Loss: 0.311 | Train Acc: 88.02%
	 Val. Loss: 1.518 |  Val. Acc: 87.58%
Epoch: 04 | Epoch Time: 3m 50s
	Train Loss: 0.309 | Train Acc: 88.11%
	 Val. Loss: 1.508 |  Val. Acc: 87.58%
Epoch: 05 | Epoch Time: 3m 52s
	Train Loss: 0.308 | Train Acc: 88.16%
	 Val. Loss: 1.535 |  Val. Acc: 87.59%


In [None]:
model.load_state_dict(torch.load('tut3-model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.481 | Test Acc: 87.45%


## 2.2 run model

In [3]:
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from HanTa import HanoverTagger as ht

nlp = spacy.load('de')


def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x


class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embedding_dim,
                                      padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        embedded = embedded.permute(1, 0, 2)
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        return self.fc(pooled)

In [27]:
tagger = ht.HanoverTagger('morphmodel_ger.pgz')


def lemmarize(string_list):
    lemmas = []
    for string in string_list:
        lemma = [
            lemma for (word, lemma, pos) in tagger.tag_sent(string.split())
        ]
        lemmas.append(' '.join(lemma))
    return lemmas


def predict_sentiment(model, vocab, sentence):
    model.eval()
    tokenized = generate_bigrams([tok.text for tok in nlp.tokenizer(sentence)])
    lemmarized = lemmarize(tokenized)
    indexed = [vocab.stoi[t] for t in lemmarized]
    print(lemmarized)
    print(indexed)
    tensor = torch.LongTensor(indexed)
    tensor = tensor.unsqueeze(1)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [7]:
drive = "/content/drive/MyDrive/Colab Notebooks/rasa/models/"
model = FastText(25002, 300, 1, 1)
model.load_state_dict(
    torch.load(drive + "fasttext-model.pt", map_location=torch.device('cpu')))
model.eval()
vocab = torch.load(drive + "fasttext-vocab-25000.pt")

In [28]:
string = "Bester Film seit Langem. Epischer Soundtrack und Charaktere, dazu unglaublich Bild gewaltig! Bitte diesen Film unbedingt anschauen, damit bald der zweite Teil kommt."
predict_sentiment(model, vocab, string)

[8, 74, 828, 335, 2, 0, 7617, 0, 2222, 2, 0, 1372, 842, 10014, 2, 664, 0, 74, 459, 1279, 2, 0, 2973, 0, 1712, 196, 60, 2, 10829, 0, 17455, 1450, 0, 0, 0, 0, 1722, 7501, 0, 0, 0, 0, 0, 0, 0, 0, 2013, 316, 24607, 0, 0, 0, 0, 0, 24428]
['gut', 'Film', 'seit', 'lang', '--', 'Epischer', 'Soundtrack', 'und', 'Charakter', '--', 'dazu', 'unglaublich', 'Bild', 'gewaltig', '--', 'Bitte', 'diesen', 'Film', 'unbedingt', 'anschauen', '--', 'damit', 'bald', 'der', 'Zweite', 'Teil', 'kommen', '--', 'seit lang', 'Film unbedingt', 'gut Film', '-- Bitte', 'Bild gewaltig', '-- dazu', 'bald der', 'Film seit', 'lang --', 'Charakter --', 'der zweiter', 'damit bald', 'Teil kommen', 'und Charakter', 'unglaublich Bild', '-- Epischer', '-- damit', 'unbedingt anschauen', 'anschauen --', 'kommen --', 'zweiter Teil', 'Epischer Soundtrack', 'bitte diesen', 'diesen Film', 'Soundtrack und', 'dazu unglaublich', 'gewaltig --']


2.6971842999046203e-06

## visualize model

In [None]:
!pip install torchviz

In [25]:
from torchviz import make_dot
x = torch.LongTensor([[2], [3], [2], [0]])
y = model(x)

make_dot(y.mean(), show_attrs=True, params=dict(list(model.named_parameters()))).render('fasttext_torchviz', format='png')


'fasttext_torchviz.png'