In [1]:
import csv

from bpemb import BPEmb
from cleantext import clean
from fastai.callbacks import *
from fastai.imports import torch
from fastai.text import * 
import pandas as pd

torch.cuda.set_device(2)

In [2]:
bpemb_de = BPEmb(lang="de", vs=25000, dim=300)

# construct the vocabulary by added a padding token with the ID 25000 (because of the bpemb_de vocab size)
itos = dict(enumerate(bpemb_de.words + ['xxpad']))
voc = Vocab(itos)

def load_data(filename):
    texts = []
    labels = []
    with open(filename) as csvfile:
        # follow the 10kGNAD creator's setup
        reader = csv.reader(csvfile, delimiter=';', quotechar='\'')
        for row in reader:
            labels.append(row[0])
            texts.append(row[1])
    df = pd.DataFrame({'label': labels, 'text': texts})
    df['text'] = df['text'].apply(lambda x: bpemb_de.encode_ids_with_bos_eos(clean(x, lang='de')))
    return df

df_train_valid = load_data("10kGNAD/train.csv")

# the last 1000 training samples are used for validation
df_train = df_train_valid.iloc[:-1000]
df_valid = df_train_valid.iloc[-1000:]

df_test = load_data("10kGNAD/test.csv")

In [24]:
data_lm = TextLMDataBunch.from_ids('uf_de_exp', bs=128, vocab=voc, train_ids=df_train['text'], valid_ids=df_valid['text'])

In [31]:
learn_lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)
learn_lm.load('/mnt/data/group07/johannes/germanlm/exp_10/models/2019_ 4_14_20_48_17_552279')

LanguageLearner(data=TextLMDataBunch;

Train: LabelList (8245 items)
x: LMTextList
<s> ▁00- jähriger ▁fällt ▁wohl ▁bis ▁saisonende ▁aus . ▁wien ▁- ▁rapid ▁muss ▁wohl ▁bis ▁saisonende ▁auf ▁offen siv spieler ▁thomas ▁mur g ▁verzichten . ▁der ▁im ▁winter ▁aus ▁ried ▁gekommen e ▁00- jährige ▁erlitt ▁beim ▁0:0- heim deb akel ▁gegen ▁ad m ira ▁wacker ▁mö dling ▁am ▁samstag ▁einen ▁teil riss ▁des ▁innen band es ▁im ▁linken ▁knie , ▁wie ▁eine ▁magnet res onanz - untersuchung ▁am ▁donner stag ▁ergab . ▁mur g ▁erhielt ▁eine ▁schien e , ▁muss ▁aber ▁nicht ▁oper iert ▁werden . ▁dennoch ▁steht ▁ihm ▁eine ▁mehr wöch ige ▁pause ▁bevor . </s>,<s> ▁erfunden e ▁bilder ▁zu ▁filmen , ▁die ▁als ▁verloren ▁gelten : ▁" the ▁for b id den ▁ro om " ▁von ▁guy ▁mad din ▁und ▁ev an ▁johnson ▁ist ▁ein ▁sur re aler ▁ritt ▁durch ▁die ▁mag ischen ▁lab yr in the ▁des ▁frühen ▁kinos . ▁wien ▁- ▁die ▁film geschichte ▁ist ▁ein ▁friedhof ▁der ▁verlorenen . ▁unter ▁den ▁begraben en ▁finden ▁sich ▁zahl lose ▁filme , ▁von ▁d

In [32]:
learn_lm.fit_one_cycle(1, 1e-2, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.318447,3.778145,0.325662,01:59


In [33]:
learn_lm.unfreeze()
learn_lm.fit_one_cycle(8, 1e-3, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.119852,3.757536,0.328551,02:22
1,4.008326,3.735962,0.331885,02:23
2,3.93222,3.717673,0.334252,02:23
3,3.83606,3.705034,0.336779,02:22
4,3.772302,3.694369,0.337968,02:22
5,3.698188,3.685352,0.339441,02:22
6,3.656193,3.682763,0.339979,02:22
7,3.61389,3.682475,0.340089,02:22


In [34]:
learn_lm.save_encoder('enc')

In [3]:
classes = df_train['label'].unique().tolist()
for dfx in [df_train, df_valid, df_test]:
    dfx['label'] = dfx['label'].apply(lambda x: classes.index(x))

# NB: set the corrext padding idx
data_train = TextClasDataBunch.from_ids('uf_de_exp', pad_idx=25000, classes=classes, bs=32, vocab=voc, train_lbls=df_train['label'], train_ids=df_train['text'], valid_ids=df_valid['text'], valid_lbls=df_valid['label'])

# store the test dataset within another TextClasDataBunch
data_test = TextClasDataBunch.from_ids('uf_de_exp', pad_idx=25000, classes=classes, bs=32, vocab=voc, train_lbls=df_train['label'], train_ids=df_train['text'], valid_ids=df_test['text'], valid_lbls=df_test['label'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
learn = text_classifier_learner(data_train, AWD_LSTM, drop_mult=0.5)
learn.load_encoder('enc', device='cuda:2')

In [5]:
factor = 2.6
lr = 4e-3
lrs = [lr / (factor ** (4 - x)) for x in range(4)] + [lr]

In [6]:
learn.fit(1, lrs)

epoch,train_loss,valid_loss,accuracy,time
0,0.907762,0.482308,0.825,01:38


In [7]:
learn.freeze_to(-2)
learn.fit(1, lrs)

epoch,train_loss,valid_loss,accuracy,time
0,0.728629,0.407739,0.856,01:58


In [8]:
learn.callbacks += [
    SaveModelCallback(learn, name='best', monitor='accuracy'),
    EarlyStoppingCallback(learn, monitor='accuracy', patience=10),
]

In [9]:
learn.unfreeze()
learn.fit(100, lrs)

epoch,train_loss,valid_loss,accuracy,time
0,0.598921,0.433241,0.851,03:42
1,0.561212,0.394498,0.853,03:40
2,0.507523,0.367141,0.876,03:34
3,0.479151,0.362641,0.877,03:41
4,0.506904,0.359822,0.874,03:51
5,0.511931,0.34419,0.874,03:36
6,0.49965,0.378157,0.867,04:00
7,0.483655,0.340493,0.884,04:00
8,0.465735,0.341785,0.885,03:59
9,0.411083,0.344238,0.872,03:49


Better model found at epoch 0 with accuracy value: 0.8510000109672546.
Better model found at epoch 1 with accuracy value: 0.8529999852180481.
Better model found at epoch 2 with accuracy value: 0.8759999871253967.
Better model found at epoch 3 with accuracy value: 0.8769999742507935.
Better model found at epoch 7 with accuracy value: 0.8840000033378601.
Better model found at epoch 8 with accuracy value: 0.8849999904632568.
Better model found at epoch 11 with accuracy value: 0.8870000243186951.
Better model found at epoch 13 with accuracy value: 0.8880000114440918.
Better model found at epoch 15 with accuracy value: 0.8960000276565552.
Better model found at epoch 22 with accuracy value: 0.8970000147819519.
Better model found at epoch 24 with accuracy value: 0.9100000262260437.
Epoch 35: early stopping


In [13]:
learn.validate(data_test.valid_dl)

Epoch 0: early stopping


[0.48844776, tensor(0.8833)]