In [1]:
#-*- encoding: iso-8859-15 -*-
import numpy as np
from easydict import EasyDict as edict

import torch
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import itertools

import heapq

# Import other python files


#### Configuration / parameters to set

In [2]:
def set_config(config_path = "config.txt", args = dict()):
    with open(config_path) as source:
        for line in source:
            line = line.strip()
            argLong, valueLong = line.split('=')
            arg = argLong.strip()
            value = valueLong.strip()
            if value == 'True':
                value = True
            elif value == 'False':
                value = False
            elif '.' in value:
                value = float(value)
            else:
                value = int(value)
            args[arg] = value
    return edict(args)

In [3]:
config_path = 'config.txt'
args = {}
args = set_config(config_path, args)
print(args)
#args.batch_size = 9

{'seq_len': 5, 'offset': 1, 'cuda': False, 'batch_size': 32, 'num_layers': 1, 'hidden_size': 30, 'lr': 0.001, 'clip': 1}


### Data Processing functions and classes

In [4]:
def prepare_text(textsource):
    text = ''
    with open(textsource, encoding="utf8") as txtsource:
        for line in txtsource:
            line = line.strip().lower()
            line = line.replace(',', '').replace('.', '')
            line = line.replace('»', '').replace('«', '')
            line = line.replace('"', '')
            line = line.replace(u'\ufeff', '')
            text += ' ' + line
    text = text[:2408] #### nachher wieder rauslöschen!!!
    return text
# Chevrons müssen noch weg

In [5]:
def prepare_data(text, seq_len, offset):
    # Get all the unique characters appearing in the text 
    chars = sorted(list(set(text)))
    char_idx = dict((c, i) for i, c in enumerate(chars))
    print('char_indices_map: ' + str(char_idx))
    print('len(char_indices_map): ' + str(len(char_idx)))
    idx_char = dict((i, c) for i, c in enumerate(chars)) #### das brauchen wir später!!!
    no_classes = len(chars) # the nr. of unique characters corresponds to the nr. of classes
    
    # count 'w' occurences, which is by far the least frequent with 1.6%
    wcount = 0
    for i in range(0, len(text) - seq_len, offset):
        if text[i + seq_len] == 'w':
            wcount += 1
    
    print('w-occurences: ' + str(wcount))
            
    # define counts and dict
    counts_dict = {' ': 0, '-': 1, '?': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h':10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'z': 27, 'ß': 28, 'ä': 29, 'ö': 30, 'ü': 31, '–': 32, '…': 33, '‹': 34, '›': 35}
    print('counts_dict: ' + str(counts_dict))
    counts = [0]*len(counts_dict)
    # Define training samples by splitting the text
    sentences = []
    next_chars = []
    for i in range(0, len(text) - seq_len, offset):
        if text[i + seq_len] == ' ':
            continue
        else:#if counts[counts_dict[text[i + seq_len]]] <= wcount:
            sentences.append(text[i: i + seq_len])
            next_chars.append(text[i + seq_len])
            #counts[counts_dict[text[i + seq_len]]] += 1

    #print('sentences', sentences)    
    #print('next_chars', next_chars)
    print('nr training samples', len(sentences))
    
    # Generate features and labels using one-hot encoding
    X = np.zeros((len(sentences), seq_len, len(chars)), dtype='f')
    y = np.zeros((len(sentences)))
    gt = np.zeros((len(sentences), len(chars)), dtype = 'f')
    
    for i, sentence in enumerate(sentences):
        for j, char in enumerate(sentence):
            X[i, j, char_idx[char]] = 1
        y[i] = char_idx[next_chars[i]]
        gt[i, char_idx[next_chars[i]]] = 1
        
    #print('next_chars: ' + str(next_chars[0]))
    #print('out: ' + str(X[0, :, :]))
    #print('target: ' + str(y[0]))
        
    return X, y, gt, char_idx, idx_char, no_classes

In [6]:
class TextDataset(Dataset):
    ''' A text dataset class which implements the abstract class torch.utils.data.Dataset. '''
    def __init__(self, text, seq_len, offset):
        self.data, self.target, self.gt, self.char_idx, self.idx_char, self.no_classes = prepare_data(text, seq_len, offset)
        
    def __getitem__(self, index):
        ''' Get the data for one training sample (by index) '''
        return self.data[index,:,:], self.target[index] 
    
    def __len__(self):
        ''' Get the number of training samples '''
        return self.data.shape[0]

### LSTM functions and classes

In [31]:
class LSTM_RNN(nn.Module):
    
    def __init__(self, no_classes):
        super(LSTM_RNN, self).__init__()
        
        self.lstm = nn.LSTM(input_size = no_classes, hidden_size = args.hidden_size, num_layers = args.num_layers)
        self.linear = nn.Linear(in_features = args.hidden_size, out_features = no_classes)
        self.softmax = nn.Softmax(dim=1)
        
        # self.linear.weight.data.normal_(0, 0.075**2)
        # self.linear.bias.data.normal_(0, 0.075**2)
        # for name, param in self.lstm.named_parameters():
        #    if 'bias' in name:
        #        nn.init.constant(param, 0.0)
        #    elif 'weight' in name:
        #        nn.init.xavier_normal(param) 
        #nn.init.xavier_uniform(self.lstm.weight_hh_l0)

        
        # LSTM needs hidden variable which is initialized in self.init_hidden(self)
        self.hidden = self.init_hidden()
        
        # for m in self.modules():
        #     if isinstance(m, nn.Linear):
        #         m.weight.data.normal_(0, 0.075*0.075)
        #         m.bias.data.normal_(0, 0.075*0.075)
                
    
    def init_hidden(self):
        h0 = Variable(torch.zeros(args.num_layers, args.batch_size, args.hidden_size))
        c0 = Variable(torch.zeros(args.num_layers, args.batch_size, args.hidden_size))
        return (h0,c0)#Variable(torch.zeros((args.num_layers, args.batch_size, args.hidden_size)))
    
    def forward(self, x, hidden):
        #x = x.type(torch.DoubleTensor)
        #print(x)
        lstm_out, hidden = self.lstm(x, hidden) # (h0, c0 are set to default values)
        #print(lstm_out)
        #print("LSTM_OUT:")
        #print(lstm_out)
        #lstm_out = lstm_out.view(-1, lstm_out.size(2))
        #print("----------------")
        #print(lstm_out)
        linear_out = self.linear(lstm_out[-1])
        #print("Linear_OUT:")
        #print(linear_out)
        #res = self.softmax(linear_out) # use only the output of the last layer of lstm
        return linear_out, hidden

In [8]:
# Training loop (one epoch)
def train(model, epoch):
    model.train()
    criterion = nn.CrossEntropyLoss() # use the cross-entropy loss
    total_loss = 0.0 # compute total loss over one epoch

    for batch_idx, (data, target) in enumerate(train_loader):
        #print(data)
        data = data.view(data.size(1), data.size(0), data.size(2))
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        
        hidden = model.init_hidden()
        optimizer.zero_grad()
        #print("data:")
        #print(data[0])
        output, hidden = model(data, hidden)
        #print("output:")
        #print(output[0])
        #print("target:")
        #print(target)
        loss = criterion(output, target.type(torch.LongTensor)) # check how far away the output is from the original data
        #print("loss:")
        #print(loss)
        loss.backward()#retain_graph=True)
        
        #torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()

        total_loss += loss.data[0]
        #print(total_loss)


    relative_loss = total_loss/float(len(train_loader))
    print('Relative loss over epoch %s: %s' %(epoch, relative_loss))#loss.data[0]))
    return relative_loss # return the relative loss for later analysis
            

In [9]:
# Prediction loop for ONE testdata tensor
def rnn_predict(model, testdata):
    ''' Note: testdata have to be submitted as a tensor'''
    testdata = torch.from_numpy(testdata)
    print("testdata:")
    print(testdata)
    model.eval()
    testdata = testdata.view(testdata.size(0), -1)
    if args.cuda:
        testdata = testdata.cuda()
    testdata = testdata.type(torch.FloatTensor)
    testdata = Variable(testdata)
    hidden = model.init_hidden()
    prediction = model(testdata, hidden)
    return prediction

### Other functions

In [10]:
''' Function that returns the largest factor of number that isn't the number itself '''
def lfactor(num):
    for i in range(num - 1, 0, -1): # go backwards from num - 1 to 1
        if num % i == 0:            # if a number divides evenly
            return i                # it's the largest factor

### Marvins test functions

In [11]:
# die funktion brauchen wir vllt gar nicht, je nachdem ob wir den test loader verwenden oder wie wir das auch immer machen
def prepare_input(text):
    X = np.zeros((args.seq_len, no_classes))  # array with one entry which have 20 lines, each 11 entrys
    for t, char in enumerate(text):
        X[t, char_idx[char]] = 1.
    return X

def sample(preds, top_n=1):
    print("test")
    preds = preds[-1].data.numpy()
    print(preds)
    print(preds.shape)
    #preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)

    return heapq.nlargest(len(preds), zip(preds, itertools.count()))


def predict_completion(model, text, topn=1):
    original_text = text
    generated = text
    completion = ''
    next_char = ''
    max_iterations = 1
    i = 0
    while next_char != ' ' and i < max_iterations:
        i += 1
        x = prepare_input(text)
        preds = rnn_predict(model, x)
        next_chars = sample(preds[0], top_n=topn)
        print('id, char: ' + str(next_chars[0][1]) + ', ' + str(idx_char[next_chars[0][1]]))
        text = text[1:] + idx_char[next_chars[0][1]]
        completion += idx_char[next_chars[0][1]]

    return completion


def predict_completions(model, text, n=3):
    x = prepare_input(text)
    preds = model.rnn_predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    return [idx_char[idx] + predict_completion(text[1:] + idx_char[idx]) for idx in next_indices]

### Main code

In [32]:
config_path = 'config.txt'
args = {}
args = set_config(config_path, args)
print(args)

# Generate train and test loader from our data
train_text = prepare_text('./Brown_Leseprobe.txt')
train_set = TextDataset(train_text, args.seq_len, args.offset)
#args.batch_size = lfactor(len(train_set))
train_loader = DataLoader(train_set, batch_size = args.batch_size, shuffle=False)
gt = train_set.gt

def dim(a):
    if not type(a) == list:
        return []
    return [len(a)] + dim(a[0])

# for i, val in enumerate(train_loader):
#     print(val)
#     if i==1:
#         break

test_text = prepare_text('./Brown_Leseprobe_test.txt')
test_set = TextDataset(test_text, args.seq_len, args.offset)
test_loader = DataLoader(test_set, batch_size = args.batch_size, shuffle=False)

# set further parameters
char_idx = train_set.char_idx
idx_char = train_set.idx_char
no_classes = train_set.no_classes
input_shape = (args.seq_len, no_classes) # seq_len * nr. of unique characters 

# get len of data to determine the possible batch_size
print(args.batch_size)


{'seq_len': 5, 'offset': 1, 'cuda': False, 'batch_size': 32, 'num_layers': 1, 'hidden_size': 30, 'lr': 0.001, 'clip': 1}
char_indices_map: {' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'z': 24, 'ß': 25, 'ä': 26, 'ö': 27, 'ü': 28, '–': 29}
len(char_indices_map): 30
w-occurences: 33
counts_dict: {' ': 0, '-': 1, '?': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'z': 27, 'ß': 28, 'ä': 29, 'ö': 30, 'ü': 31, '–': 32, '…': 33, '‹': 34, '›': 35}
nr training samples 2048


char_indices_map: {' ': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'z': 24, 'ß': 25, 'ä': 26, 'ö': 27, 'ü': 28, '–': 29}
len(char_indices_map): 30
w-occurences: 33
counts_dict: {' ': 0, '-': 1, '?': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'z': 27, 'ß': 28, 'ä': 29, 'ö': 30, 'ü': 31, '–': 32, '…': 33, '‹': 34, '›': 35}
nr training samples 2048
32


In [33]:
# Generate model
print(input_shape)
rnn = LSTM_RNN(no_classes)
if args.cuda:
    rnn.cuda()
print(rnn)

(5, 30)
LSTM_RNN(
  (lstm): LSTM(30, 30)
  (linear): Linear(in_features=30, out_features=30)
  (softmax): Softmax()
)


In [34]:
# Initialize the optimization algorithm
optimizer = optim.Adadelta(rnn.parameters())

In [30]:

# Run training and store history
history = dict()
history['loss_train'] = []
history['loss_test'] = []

# wie wir die accuracy machen, weiß ich noch nicht...
#history['acc_train'] = []
#history['acc_test'] = []
print(args.batch_size)
for epoch in range(100):
    loss_train = train(rnn, epoch)        
    history['loss_train'].append(loss_train)      

32


RuntimeError: The size of tensor a (30) must match the size of tensor b (90) at non-singleton dimension 2

In [204]:
# Try a prediction

#testdata = Variable(torch.from_numpy(test_set.data[0])) # get first element from the test set
#truth = test_set.target[0]
#print(testdata,truth)

#prediction = rnn(testdata)
## dann muss man hier noch auf die sizes achten, ach verdammt
#prepare_input("This is an example of input for our LSTM".lower(), train_set.data, char_idx)
#print(predict_completions(seq, 5))

In [798]:

testcases = ["zahnr", " währ", "mühsa", "winde", "erreg", "steil", "krall", "aufra", "spitz"]
for case in testcases:
    predict_completion(rnn, case.lower(), topn=3)

testdata:


Columns 0 to 12 
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     1     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     1     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0

Columns 13 to 25 
    0     0     0     0     0     0     0     0     0     0     0     1     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0     0     0     0
    0     1     0     0     0     0     0     0     0     0     0     0     0
    0     0     0     0     1     0     0     0     0     0     0     0     0

Columns 26 to 29 
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
[torch.DoubleTensor of size 5x30]


(30,)
id, char: 5, e


In [None]:
print(train_set.no_classes)
print(char_idx)
print(args.batch_size)


29
{'ö': 26, 'c': 3, 'd': 4, 'j': 10, 'g': 7, 'p': 16, 'l': 12, 'e': 5, 'r': 17, 'b': 2, 't': 19, 'f': 6, 'ü': 27, ' ': 0, 'u': 20, 'z': 23, 'v': 21, 'k': 11, 'w': 22, 'ß': 24, 'o': 15, 'n': 14, 'ä': 25, 'i': 9, 'h': 8, 's': 18, '–': 28, 'm': 13, 'a': 1}
81
